diff --git a/.github/workflows/pose_vis.yml b/.github/workflows/pose_vis.yml new file mode 100644 index 00000000..96229f9e --- /dev/null +++ b/.github/workflows/pose_vis.yml @@ -0,0 +1,21 @@ +on: [push] +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + steps: + - uses: actions/checkout@v2 + - name: Python setup ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install PoseVis + run: | + cd devices/webcam + python setup.py install + - name: Run PoseVis tests + run: | + cd devices/webcam + python -m unittest pose_vis/test/tests.py \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..c7e7da8e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "buck.projectConfig": { + "projectPath": "c:\\Users\\das\\Desktop\\labgraph\\manylinux.buckconfig", + "flavor": [], + "target": "", + "platform": "linux-x86_64" + } +} \ No newline at end of file diff --git a/devices/webcam/__init__.py b/devices/webcam/__init__.py new file mode 100644 index 00000000..cd1c80d0 --- /dev/null +++ b/devices/webcam/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. \ No newline at end of file diff --git a/devices/webcam/benchmark.ipynb b/devices/webcam/benchmark.ipynb new file mode 100644 index 00000000..91047a83 --- /dev/null +++ b/devices/webcam/benchmark.ipynb @@ -0,0 +1,361 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5a180c32", + "metadata": {}, + "source": [ + "## PoseVis Data Quality\n", + "\n", + "Runs a series of benchmarks and reports data quality metrics such as dropped frame percentage, latency, and jitter." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d73eadd6", + "metadata": {}, + "source": [ + "### Install py-cpuinfo for System Info" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "470df640", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting py-cpuinfo\n", + " Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n", + "Installing collected packages: py-cpuinfo\n", + "Successfully installed py-cpuinfo-9.0.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip available: 22.2.2 -> 22.3\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "!pip install py-cpuinfo" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c966bc06", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python version: 3.10.8.final.0 (64 bit)\n", + "CPU: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz\n" + ] + } + ], + "source": [ + "import cpuinfo\n", + "\n", + "info = cpuinfo.get_cpu_info()\n", + "print(f\"Python version: {info['python_version']}\")\n", + "print(f\"CPU: {info['brand_raw']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2e33b67a", + "metadata": {}, + "source": [ + "### Run Benchmarks\n", + "\n", + "Devices tested are: (0) Logitech C270 Webcam, and (1) VUPUMER Webcam" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "555aadb1", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:pose_vis.runner: building graph\n", + "INFO:pose_vis.runner: logging directory is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runners.benchmark_runner: benchmark output path is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runner: running graph\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "### Running benchmark: benchmark_1_sources_1280x720x30\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:pose_vis.runner: building graph\n", + "INFO:pose_vis.runner: enabling extension: HandsExtension\n", + "INFO:pose_vis.runner: logging directory is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runners.benchmark_runner: benchmark output path is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runner: running graph\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Benchmark complete: benchmark_1_sources_1280x720x30\n", + "### Running benchmark: benchmark_1_sources_1280x720x30_HandsExtension\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:pose_vis.runner: building graph\n", + "INFO:pose_vis.runner: logging directory is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runners.benchmark_runner: benchmark output path is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runner: running graph\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Benchmark complete: benchmark_1_sources_1280x720x30_HandsExtension\n", + "### Running benchmark: benchmark_2_sources_1280x720x30\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:pose_vis.runner: building graph\n", + "INFO:pose_vis.runner: enabling extension: HandsExtension\n", + "INFO:pose_vis.runner: logging directory is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runners.benchmark_runner: benchmark output path is c:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runner: running graph\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Benchmark complete: benchmark_2_sources_1280x720x30\n", + "### Running benchmark: 
benchmark_2_sources_1280x720x30_HandsExtension\n", + "### Benchmark complete: benchmark_2_sources_1280x720x30_HandsExtension\n" + ] + } + ], + "source": [ + "import os\n", + "import pose_vis.pose_vis\n", + "\n", + "from dataclasses import dataclass\n", + "from typing import List, Tuple\n", + "from pose_vis.extension import PoseVisExtension\n", + "from pose_vis.runner import PoseVisConfig\n", + "from pose_vis.runners.benchmark_runner import BenchmarkRunner, BenchmarkRunnerConfig\n", + "from pose_vis.extensions.hands import HandsExtension\n", + "\n", + "@dataclass\n", + "class BenchmarkConfig():\n", + " extensions: List[PoseVisExtension]\n", + " logging: bool\n", + " sources: List[int]\n", + " resolution: Tuple[int, int, int]\n", + " runtime: int\n", + "\n", + "benchmarks = [\n", + " # Single camera, 1280x720x30\n", + " BenchmarkConfig([], False, [0], (1280, 720, 30), 60),\n", + " # Single camera, 1280x720x30, hand tracking\n", + " BenchmarkConfig([HandsExtension()], False, [0], (1280, 720, 30), 60),\n", + " # Two cameras, 1280x720x30\n", + " BenchmarkConfig([], False, [0, 1], (1280, 720, 30), 60),\n", + " # Two cameras, 1280x720x30, hand tracking\n", + " BenchmarkConfig([HandsExtension()], False, [0, 1], (1280, 720, 30), 60),\n", + "]\n", + "\n", + "for benchmark in benchmarks:\n", + " config = PoseVisConfig(\n", + " extensions = benchmark.extensions,\n", + " log_directory = f\"webcam{os.sep}logs\",\n", + " log_name = \"benchmark\",\n", + " enable_logging = benchmark.logging,\n", + " display_framerate = 0,\n", + " stats_history_size = 0)\n", + "\n", + " resolutions = [benchmark.resolution for _ in range(len(benchmark.sources))]\n", + " output_name = f\"benchmark_{len(benchmark.sources)}_sources_{resolutions[0][0]}x{resolutions[0][1]}x{resolutions[0][2]}\"\n", + "\n", + " if len(benchmark.extensions) > 0:\n", + " ext_names = \"\"\n", + " for i in range(len(benchmark.extensions)):\n", + " sep = \"_\" if i > 0 else \"\"\n", + " ext_names += 
f\"{sep}{benchmark.extensions[i].__class__.__name__}\"\n", + " output_name = f\"{output_name}_{ext_names}\"\n", + " if benchmark.logging:\n", + " output_name = f\"{output_name}_logging\"\n", + " \n", + " runner_config = BenchmarkRunnerConfig(\n", + " sources = benchmark.sources,\n", + " resolutions = resolutions,\n", + " output_path = f\"webcam{os.sep}logs\",\n", + " output_name = output_name,\n", + " run_time = 60)\n", + " runner = BenchmarkRunner(config, runner_config)\n", + "\n", + " print(f\"### Running benchmark: {output_name}\")\n", + " runner.build()\n", + " runner.run()\n", + " print(f\"### Benchmark complete: {output_name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3ff8a0ea", + "metadata": {}, + "source": [ + "### Display Results" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dc52cb95", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Benchmark: benchmark_1_sources_1280x720x30\n", + "# runtime: 60.55s\n", + "# dropped: 0.75%\n", + "# latency: 16.69ms\n", + "# jitter: 5.04ms\n", + "# [desync]\n", + " \n", + "## Benchmark: benchmark_1_sources_1280x720x30_HandsExtension\n", + "# runtime: 60.45s\n", + "# dropped: 0.64%\n", + "# latency: 5.17ms\n", + "# jitter: 4.14ms\n", + "# [desync]\n", + " \n", + "## Benchmark: benchmark_2_sources_1280x720x30\n", + "# runtime: 60.42s\n", + "# dropped: 0.64%\n", + "# latency: 13.33ms\n", + "# jitter: 11.09ms\n", + "# [desync]\n", + "# from source 0 to source 1: 2.43ms, jitter: 11.02ms\n", + " \n", + "## Benchmark: benchmark_2_sources_1280x720x30_HandsExtension\n", + "# runtime: 60.46s\n", + "# dropped: 28.10%\n", + "# latency: 4.94ms\n", + "# jitter: 5.80ms\n", + "# [desync]\n", + "# from source 0 to source 1: 0.11ms, jitter: 6.10ms\n", + " \n" + ] + } + ], + "source": [ + "import json\n", + "import statistics\n", + "from pose_vis.utils import absolute_path\n", + "from typing import Dict, Union, List, Tuple\n", + 
"\n", + "json_files = [_file for _file in os.listdir(absolute_path(f\"webcam{os.sep}logs\")) if _file.endswith(\".json\")]\n", + "\n", + "for filename in json_files:\n", + " timings: Dict[str, Union[float, int, List[Tuple[float, float, float]]]] = {}\n", + "\n", + " with open(absolute_path(f\"webcam{os.sep}logs{os.sep}{filename}\")) as _file:\n", + " timings = json.loads(_file.read())\n", + "\n", + " print(f\"## Benchmark: {filename.removesuffix('.json')}\")\n", + " print(f\"# runtime: {timings['runtime']:.2f}s\")\n", + "\n", + " expected_frames = timings[\"runtime\"] * timings[\"target_fps\"]\n", + " # Estimate of how many frames were dropped while capturing the source\n", + " source_received_frames_pct = timings[\"frame_index\"] / expected_frames\n", + " print(f\"# dropped: {(100 - (source_received_frames_pct * 100)):.2f}%\")\n", + "\n", + " latency: List[float] = []\n", + " desync: List[List[float]] = []\n", + " # device time = system time at frame capture from device\n", + " # All times are captured with time.perf_counter()\n", + " # times: List[(device time, receive time)]\n", + " for i in range(1, len(timings[\"times\"])):\n", + " rel_device = timings[\"times\"][i][0] - timings[\"times\"][0][0]\n", + " rel_receive = timings[\"times\"][i][1] - timings[\"times\"][0][1]\n", + " # TODO: this may not be correct\n", + " # `rel_device` can be greater than `rel_receive`, for now we just take the absolute value\n", + " latency.append(abs(rel_receive - rel_device))\n", + "\n", + " if len(timings[\"sync\"]) > 0:\n", + " if i == 1:\n", + " desync = [[] for j in range(len(timings[\"sync\"][i]))]\n", + "\n", + " for j in range(len(timings[\"sync\"][i])):\n", + " desync[j].append(abs(timings[\"times\"][i][0] - timings[\"sync\"][i][j]))\n", + " \n", + " print(f\"# latency: {statistics.median(latency) * 1000:.2f}ms\")\n", + " print(f\"# jitter: {statistics.stdev(latency) * 1000:.2f}ms\")\n", + " print(\"# [desync]\")\n", + " for i, li in enumerate(desync):\n", + " print(f\"# 
from source 0 to source {i + 1}: {statistics.median(desync[i]) * 1000:.2f}ms, jitter: {statistics.stdev(desync[i]) * 1000:.2f}ms\")\n", + " print(\" \")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.13 ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "vscode": { + "interpreter": { + "hash": "23a593576959775a19d6469cad78770ba03ee1b7699646fbac2d14539ad9dcf0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/devices/webcam/images/pexels-min-an-1454797.jpg b/devices/webcam/images/pexels-min-an-1454797.jpg new file mode 100644 index 00000000..f126650f Binary files /dev/null and b/devices/webcam/images/pexels-min-an-1454797.jpg differ diff --git a/devices/webcam/images/pexels-min-an-1454797.md b/devices/webcam/images/pexels-min-an-1454797.md new file mode 100644 index 00000000..d5dc1f7c --- /dev/null +++ b/devices/webcam/images/pexels-min-an-1454797.md @@ -0,0 +1 @@ +This file was obtained from [pexels.com](https://www.pexels.com/photo/person-s-hand-in-shallow-photo-1454797/). It is marked as [free to use](https://www.pexels.com/license/). \ No newline at end of file diff --git a/devices/webcam/linux_gstreamer.md b/devices/webcam/linux_gstreamer.md new file mode 100644 index 00000000..c4d0abb2 --- /dev/null +++ b/devices/webcam/linux_gstreamer.md @@ -0,0 +1,96 @@ +# Linux GStreamer Guide + +In this guide we will install GStreamer and build OpenCV with GStreamer support. This guide uses a fresh install of Ubuntu 20.04. 
+ +## Build OpenCV + +Create a virtual environment for use with PoseVis: + + sudo apt install python3.8-venv -y && python3 -m venv .venv && source .venv/bin/activate + +Install Numpy: + + python -m pip install numpy + +Install GStreamer: + + sudo apt install libgstreamer1.0-dev libgstreamer-plugins-base1.0-dev libgstreamer-plugins-bad1.0-dev gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav gstreamer1.0-tools gstreamer1.0-x gstreamer1.0-gl gstreamer1.0-gtk3 -y + +Install required OpenCV libraries: + + sudo apt install cmake libgtk-3-dev python3-dev -y + +We'll be building and installing OpenCV and OpenCV Contrib 4.6.0. Clone the branches: + + sudo apt install git && git clone -b 4.6.0 --single-branch https://github.com/opencv/opencv.git && git clone -b 4.6.0 --single-branch https://github.com/opencv/opencv_contrib.git + +Create a build directory for OpenCV and `cd` into it: + + mkdir opencv/build && cd opencv/build + +Run CMake with the following config: + + cmake ../ \ + -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \ + -D PYTHON_DEFAULT_EXECUTABLE=$(which python3) \ + -D BUILD_EXAMPLES=OFF \ + -D INSTALL_C_EXAMPLES=OFF \ + -D INSTALL_PYTHON_EXAMPLES=OFF \ + -D BUILD_opencv_python2=OFF \ + -D PYTHON3_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") \ + -D PYTHON3_PACKAGES_PATH=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") \ + -D PYTHON_LIBRARY=$(python -c "from distutils.sysconfig import get_config_var; from os.path import dirname, join; print(join(dirname(get_config_var('LIBPC')), get_config_var('LDLIBRARY')))") \ + -D BUILD_opencv_python3=ON + +Check GTK, V4L2, GStreamer, and Python3 status: + + -- GUI: GTK3 + -- GTK+: YES (ver 3.24.33) + + ... + + -- Video I/O: + -- GStreamer: YES (1.20.3) + -- v4l/v4l2: YES (linux/videodev2.h) + + ... 
+ + -- Python 3: + -- Interpreter: /home/cody/.venv/bin/python3 (ver 3.8.15) + -- Libraries: /usr/lib/x86_64-linux-gnu/libpython3.8.so (ver 3.8.15) + -- numpy: /home/cody/.venv/lib/python3.8/site-packages/numpy/core/include (ver 1.23.4) + -- install path: /home/cody/.venv/lib/python3.8/site-packages/cv2/python-3.8 + +If everything looks good, build and install: + + make -j$(nproc) && sudo make install + +Fix CV2 Python package permissions: **replace *[user]* with your username** + + export CV2_PATH=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/cv2 && \ + sudo chown -R [user] "$CV2_PATH" && \ + sudo chgrp -R [user] "$CV2_PATH" && \ + sudo chmod -R 775 "$CV2_PATH" + +## Install MediaPipe + +Install MediaPipe from PyPI: + + python -m pip install mediapipe + +## Install LabGraph and Test + +Install LabGraph from PyPI: + + python -m pip install labgraph + +`cd` into your LabGraph installation, assuming you've installed it in your home directory: + + cd ~/labgraph/devices/webcam + +Make sure PoseVis with GStreamer integration works: + + python -m pose_vis.pose_vis --sources "videotestsrc ! video/x-raw, width=1280, height=720, framerate=30/1, format=BGR ! appsink" + +If all is well, you're now finished. Check [Using PoseVis](readme.md#using-posevis) for more usage examples. Enjoy using PoseVis! 
+ diff --git a/devices/webcam/logging_example.ipynb b/devices/webcam/logging_example.ipynb new file mode 100644 index 00000000..6843d987 --- /dev/null +++ b/devices/webcam/logging_example.ipynb @@ -0,0 +1,272 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b60bdb84", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:pose_vis.runner: building graph\n", + "INFO:pose_vis.runner: enabling extension: HandsExtension\n", + "INFO:pose_vis.runner: logging directory is C:\\Users\\das\\Desktop\\labgraph\\devices\\webcam\\logs\n", + "INFO:pose_vis.runner: running graph\n", + "WARNING:labgraph.graphs.graph:PoseVis has unused topics:\n", + "\t- STREAM/OUTPUT has no subscribers\n", + "This could mean that there are publishers and/or subscribers of Cthulhu streams that Labgraph doesn't know about, and/or that data in some topics is being discarded.\n" + ] + } + ], + "source": [ + "import os\n", + "from pose_vis.extensions.hands import HandsExtension\n", + "from pose_vis.runner import PoseVisConfig\n", + "from pose_vis.runners.source_runner import SourceStreamRunner, SourceStreamRunnerConfig\n", + "from pose_vis.utils import absolute_path\n", + "\n", + "# This runs the SourceStream node in pose_vis/streams/source_stream.py\n", + "# It supports videos, image directories, and camera devices\n", + "# Given a directory, it will load all images in the directory and run them through\n", + "# the graph until all are processed, then closes the graph\n", + "# We also enable data logging, outputting to logs/logging_example.h5\n", + "config = PoseVisConfig(\n", + " extensions = [HandsExtension()],\n", + " log_directory = f\"webcam{os.sep}logs\",\n", + " log_name = \"logging_example\",\n", + " enable_logging = True,\n", + " # 0 disables the Display node\n", + " display_framerate = 0,\n", + " stats_history_size = 0)\n", + "\n", + "# Each source will be ran until completion, in this case we only have 1 
source\n", + "runner_config = SourceStreamRunnerConfig(\n", + " sources = [absolute_path(f\"webcam{os.sep}images\")],\n", + " # Since we're loading images from a directory the resolution doesn't apply,\n", + " # but we need to specify a framerate regardless\n", + " resolutions = [(0, 0, 30)])\n", + "\n", + "# Build and run the graph\n", + "runner = SourceStreamRunner(config, runner_config)\n", + "runner.build()\n", + "# Other nodes could be inserted into the graph here, check pose_vis/dynamic_graph.py\n", + "runner.run()\n", + "# Unfortunately Jupyter doesn't grab the child process output, so we're missing the rest of the console output here\n", + "# Unused topic warnings can be ignored" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5c95e7a1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[landmark {\n", + " x: 0.3666782\n", + " y: 0.6836816\n", + " z: 1.3609244e-06\n", + "}\n", + "landmark {\n", + " x: 0.40242448\n", + " y: 0.667752\n", + " z: -0.18654017\n", + "}\n", + "landmark {\n", + " x: 0.39794287\n", + " y: 0.5872192\n", + " z: -0.24172299\n", + "}\n", + "landmark {\n", + " x: 0.40340263\n", + " y: 0.49869713\n", + " z: -0.24058115\n", + "}\n", + "landmark {\n", + " x: 0.3927693\n", + " y: 0.40063044\n", + " z: -0.22521572\n", + "}\n", + "landmark {\n", + " x: 0.27260906\n", + " y: 0.47896463\n", + " z: -0.25453687\n", + "}\n", + "landmark {\n", + " x: 0.25475854\n", + " y: 0.33122495\n", + " z: -0.31093124\n", + "}\n", + "landmark {\n", + " x: 0.24936885\n", + " y: 0.23572053\n", + " z: -0.34202737\n", + "}\n", + "landmark {\n", + " x: 0.26004127\n", + " y: 0.16758552\n", + " z: -0.3568705\n", + "}\n", + "landmark {\n", + " x: 0.2801903\n", + " y: 0.47701693\n", + " z: -0.14896862\n", + "}\n", + "landmark {\n", + " x: 0.30427077\n", + " y: 0.33153397\n", + " z: -0.19376588\n", + "}\n", + "landmark {\n", + " x: 0.37751722\n", + " y: 0.2612969\n", + " z: -0.21913196\n", + "}\n", + "landmark {\n", + " x: 0.4529705\n", + " y: 0.23412205\n", + " z: -0.23570204\n", + "}\n", + "landmark {\n", + " x: 0.29964173\n", + " y: 0.47477144\n", + " z: -0.049662407\n", + "}\n", + "landmark {\n", + " x: 0.3382794\n", + " y: 0.3543827\n", + " z: -0.105578855\n", + "}\n", + "landmark {\n", + " x: 0.41483483\n", + " y: 0.31347528\n", + " z: -0.16686752\n", + "}\n", + "landmark {\n", + " x: 0.48280495\n", + " y: 0.2856946\n", + " z: -0.20450525\n", + "}\n", + "landmark {\n", + " x: 0.32483783\n", + " y: 0.4750077\n", + " z: 0.041422643\n", + "}\n", + "landmark {\n", + " x: 0.3839665\n", + " y: 0.44780323\n", + " z: -0.016068315\n", + "}\n", + "landmark {\n", + " x: 0.45829943\n", + " y: 0.46123245\n", + " z: -0.070152506\n", + 
"}\n", + "landmark {\n", + " x: 0.5171263\n", + " y: 0.47498128\n", + " z: -0.10491397\n", + "}\n", + "]\n" + ] + } + ], + "source": [ + "import os\n", + "import cv2\n", + "import matplotlib.pyplot as plt\n", + "from labgraph.loggers.hdf5.reader import HDF5Reader\n", + "from pose_vis.utils import absolute_path\n", + "from pose_vis.streams.messages import CaptureResult\n", + "from pose_vis.extension import ExtensionResult\n", + "from pose_vis.extensions.hands import HandsExtension\n", + "\n", + "# Let's load the hdf5 log\n", + "path = absolute_path(f\"webcam{os.sep}logs{os.sep}logging_example.h5\")\n", + "# All data is logged under the \"captures\" dataset\n", + "log_types = {\"captures\": CaptureResult}\n", + "reader = HDF5Reader(path, log_types)\n", + "\n", + "# Since the images directory only contains 1 image, there should only be 1 set of messages\n", + "message: CaptureResult = reader.logs[\"captures\"][0]\n", + "\n", + "# Draw an overlay using data created by HandsExtension\n", + "# The captures attribute is a list of every source given to the runner\n", + "original_image = message.captures[0].frame\n", + "overlayed_image = original_image.copy()\n", + "HandsExtension.draw_overlay(\n", + " overlayed_image,\n", + " # Like captures, extensions holds results per-stream\n", + " ExtensionResult(data = message.extensions[0][\"HandsExtension\"]))\n", + "\n", + "# We should convert from BGR to RGB for matplotlib\n", + "original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)\n", + "overlayed_image = cv2.cvtColor(overlayed_image, cv2.COLOR_BGR2RGB)\n", + "\n", + "# Let's take a look at the images\n", + "fig = plt.figure()\n", + "fig.add_subplot(1, 2, 1)\n", + "plt.imshow(original_image)\n", + "plt.axis(\"off\")\n", + "plt.title(\"Original\")\n", + "fig.add_subplot(1, 2, 2)\n", + "plt.imshow(overlayed_image)\n", + "plt.axis(\"off\")\n", + "plt.title(\"Overlayed\")\n", + "plt.show()\n", + "\n", + "# We can view the data that the hands extension produced\n", + 
"print(message.extensions[0][\"HandsExtension\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8a27049", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "vscode": { + "interpreter": { + "hash": "23a593576959775a19d6469cad78770ba03ee1b7699646fbac2d14539ad9dcf0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/devices/webcam/pose_vis/__init__.py b/devices/webcam/pose_vis/__init__.py new file mode 100644 index 00000000..cd1c80d0 --- /dev/null +++ b/devices/webcam/pose_vis/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. \ No newline at end of file diff --git a/devices/webcam/pose_vis/benchmark.py b/devices/webcam/pose_vis/benchmark.py new file mode 100644 index 00000000..cb9115d1 --- /dev/null +++ b/devices/webcam/pose_vis/benchmark.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. 
import asyncio
import logging
import time
import multiprocessing
import labgraph as lg

from queue import Empty, Queue
from pose_vis.streams.messages import Capture, CaptureResult, ExitSignal
from pose_vis.benchmark_worker import BenchmarkWorker
from typing import Optional, List

logger = logging.getLogger(__name__)

class CapturePoint():
    """
    A set of captures paired with the time the message carrying them was received

    Attributes:
        `rec_time`: `float` `time.perf_counter()` value taken when the message arrived
        `captures`: `List[Capture]` captures copied from the received `CaptureResult`
    """
    __slots__ = "rec_time", "captures"

    def __init__(self, rec_time: float, captures: List[Capture]) -> None:
        self.rec_time = rec_time
        self.captures = captures

class BenchmarkConfig(lg.Config):
    """
    Config for `Benchmark`

    Attributes:
        `output_path`: `str` path to output results
        `output_name`: `str` output filename (no extension)
        `run_time`: `int` how long to benchmark for
    """
    output_path: str
    output_name: str
    run_time: int

class BenchmarkState(lg.State):
    """
    State for `Benchmark`

    Attributes:
        `start_time`: `float` receive time of the first message, 0.0 until one arrives
        `done`: `bool` set once `run_time` seconds have elapsed
        `points`: `Optional[Queue]` capture points waiting to be forwarded to the worker
        `tasks`: `Optional[multiprocessing.JoinableQueue]`
        `worker`: `Optional[BenchmarkWorker]`
    """
    start_time: float = 0.0
    done: bool = False
    points: Optional[Queue] = None
    tasks: Optional[multiprocessing.JoinableQueue] = None
    worker: Optional[BenchmarkWorker] = None

class Benchmark(lg.Node):
    """
    Records timestamps from `CaptureResult`

    Topics:
        `INPUT`: `CaptureResult`
        `OUTPUT_EXIT`: `ExitSignal`

    Attributes:
        `state`: `BenchmarkState`
        `config`: `BenchmarkConfig`
    """
    INPUT = lg.Topic(CaptureResult)
    OUTPUT_EXIT = lg.Topic(ExitSignal)
    state: BenchmarkState
    config: BenchmarkConfig

    def setup(self) -> None:
        """
        Create the point buffer and task queue, then start the `BenchmarkWorker` process
        """
        logger.info(f" benchmarking for {self.config.run_time} seconds")
        self.state.points = Queue()
        self.state.tasks = multiprocessing.JoinableQueue()
        self.state.worker = BenchmarkWorker(self.state.tasks, 0, self.config.output_path, self.config.output_name)
        self.state.worker.start()

    @lg.publisher(OUTPUT_EXIT)
    async def on_done(self) -> lg.AsyncPublisher:
        """
        Forward buffered capture points to the worker, then publish `ExitSignal`

        Each task placed on the worker queue is a tuple of:
        receive time, (frame index, runtime, target fps, timestamp) of the first capture,
        and the timestamps of every remaining capture
        """
        while True:
            point: Optional[CapturePoint] = None
            try:
                point = self.state.points.get_nowait()
            except Empty:
                # Only an empty buffer is expected here; any other error should surface
                pass
            if point is not None:
                cap0 = point.captures[0]
                self.state.tasks.put((
                    point.rec_time,
                    (cap0.frame_index, cap0.proc_runtime, cap0.proc_target_fps, cap0.system_timestamp),
                    [capture.system_timestamp for capture in point.captures[1:]]))
            elif self.state.done:
                # The buffer is drained and the run time has elapsed
                break
            await asyncio.sleep(0.005)
        yield self.OUTPUT_EXIT, ExitSignal()

    @lg.subscriber(INPUT)
    async def on_msg(self, message: CaptureResult) -> None:
        """
        Timestamp each incoming message and buffer its captures for `on_done`
        """
        rec_time = time.perf_counter()
        if self.state.start_time == 0:
            self.state.start_time = rec_time

        if not self.state.done and rec_time - self.state.start_time >= self.config.run_time:
            self.state.done = True
        elif rec_time - self.state.start_time > 5:
            captures = message.captures[:]
            # We're ignoring the first 5 seconds to give the graph time to stabilize and get a more accurate result
            self.state.points.put(CapturePoint(rec_time, captures))

    def cleanup(self) -> None:
        """
        Send the `None` shutdown sentinel to the worker and wait for the task queue to be fully processed
        """
        logger.info(" closing worker...")
        self.state.tasks.put(None)
        self.state.tasks.join()
import os
import json
import logging
import multiprocessing

from typing import Dict, List, Tuple, Union

from pose_vis.streams.messages import Capture

logger = logging.getLogger(__name__)

class BenchmarkWorker(multiprocessing.Process):
    """
    Handles capturing performance metrics without skewing results

    Tasks are read from `tasks` until a `None` sentinel arrives, at which point the
    collected timings are written to `<output_path>/<output_name>.json` and the process exits

    Attributes:
        `tasks`: `multiprocessing.JoinableQueue` queue of timing tuples, terminated by `None`
        `worker_number`: `int` identifier used in log messages
        `output_path`: `str` directory the results file is written to
        `output_name`: `str` results filename (no extension)
        `output`: timings collected so far, serialized to JSON on shutdown
    """

    tasks: multiprocessing.JoinableQueue
    worker_number: int
    output_path: str
    output_name: str
    start_time = 0.0
    output: Dict[str, Union[float, int, List[Tuple[float, float]], List[List[float]]]]

    def __init__(self, tasks: multiprocessing.JoinableQueue, worker_number: int, output_path: str, output_name: str):
        super().__init__()
        self.tasks = tasks
        self.worker_number = worker_number
        self.output_path = output_path
        self.output_name = output_name
        # Created per instance: a class-level dict would be shared by every worker in the process
        self.output = {"runtime": 0.0, "target_fps": 0, "frame_index": 0, "times": [], "sync": []}

    def run(self):
        """
        Consume timing tuples until the `None` sentinel, then write the results file
        """
        logger.info(f" worker {self.worker_number}: started")
        while True:
            val: Tuple[float, Tuple[int, float, int, float], List[float]] = self.tasks.get()
            if val is None:
                try:
                    self._save()
                finally:
                    # Acknowledge the sentinel only after the write attempt, so that
                    # `tasks.join()` in the parent returns once the results are on disk
                    self.tasks.task_done()
                logger.info(f" worker {self.worker_number}: shutting down")
                break

            # val is (receive time, (frame index, runtime, target fps, first source timestamp), other source timestamps)
            self.output["runtime"] = val[1][1]
            self.output["target_fps"] = val[1][2]
            self.output["frame_index"] = val[1][0]
            self.output["times"].append((val[1][3], val[0]))
            if len(val[2]) > 0:
                self.output["sync"].append(val[2])
            self.tasks.task_done()

    def _save(self) -> None:
        """
        Write the collected timings to `<output_path>/<output_name>.json`
        """
        logger.info(f" saving timings to {self.output_path}")
        with open(os.path.join(self.output_path, f"{self.output_name}.json"), "w") as output:
            json.dump(self.output, output)
logger = logging.getLogger(__name__)

# Key code 27 (ASCII Escape) stops the display loop
_ESCAPE_KEY = 27


class DisplayConfig(lg.Config):
    """
    Settings consumed by the `Display` node

    Attributes:
        `target_framerate`: `int` rate the display loop paces itself to
        `stats_history_size`: `int` forwarded to `DisplayHandler` as its history size
        `extension_types`: `Dict[str, type]` forwarded to `DisplayHandler` for overlay drawing
    """
    target_framerate: int
    stats_history_size: int
    extension_types: Dict[str, type]


class DisplayState(lg.State):
    """
    Mutable state of the `Display` node

    Attributes:
        `handler`: `DisplayHandler` created in `Display.setup()`
        `running`: `bool` the display loop keeps going while this is `True`
        `perf`: `PerfUtility` paces and measures the display loop
    """
    handler: Optional[DisplayHandler] = None
    running: bool = True
    # NOTE(review): this default instance is created once at class definition - confirm lg.State copies it per node
    perf: PerfUtility = PerfUtility()


class Display(lg.Node):
    """
    Presents incoming captures and their extension overlays

    Topics:
        `INPUT`: `CaptureResult`
        `INPUT_EXIT_STREAM`: `ExitSignal`
        `INPUT_EXIT_USER`: `ExitSignal`

    Attributes:
        `state`: `DisplayState`
        `config`: `DisplayConfig`
    """
    INPUT = lg.Topic(CaptureResult)
    INPUT_EXIT_STREAM = lg.Topic(ExitSignal)
    INPUT_EXIT_USER = lg.Topic(ExitSignal)
    state: DisplayState
    config: DisplayConfig

    def setup(self) -> None:
        handler = DisplayHandler(self.config.stats_history_size, self.config.extension_types)
        self.state.handler = handler
        handler.register_key_callback(self.on_key)

    def _stop(self) -> None:
        # Clearing the flag lets `display()` finish its current iteration and exit
        self.state.running = False

    @lg.subscriber(INPUT)
    async def update(self, message: CaptureResult) -> None:
        # Hand the handler shallow copies of both sequences
        captures = message.captures[:]
        extensions = message.extensions[:]
        self.state.handler.update_frames(captures, extensions)

    @lg.subscriber(INPUT_EXIT_STREAM)
    async def on_exit_stream(self, _: ExitSignal) -> None:
        self._stop()

    @lg.subscriber(INPUT_EXIT_USER)
    async def on_exit_user(self, _: ExitSignal) -> None:
        self._stop()

    def on_key(self, key: int) -> None:
        if key != _ESCAPE_KEY:
            return
        self._stop()

    @lg.main
    def display(self) -> None:
        while self.state.running:
            self.state.perf.update_start()
            self.state.handler.update_windows(self.state.perf.updates_per_second)
            remaining = self.state.perf.get_remaining_sleep_time(self.config.target_framerate)
            time.sleep(remaining)
            self.state.perf.update_end()

        raise lg.NormalTermination()

    def cleanup(self):
        self.state.handler.cleanup()
class DisplayHandler():
    """
    Handles image display, drawing, and CV2 key events

    When `history_size` is greater than 0 a `StatsWorker` process is started; capture
    timings are sent to it through `tasks` and its latest result is read from `results`
    """
    history_size: int
    extension_types: Dict[str, type]
    key_callbacks: List[Callable[[int], None]]
    post_render_callbacks: List[Callable[[int, np.ndarray], None]]
    tasks: multiprocessing.JoinableQueue
    results: multiprocessing.Queue
    # Project types are quoted so the annotations are not evaluated at class creation
    worker: "StatsWorker"
    stats: "CaptureStats" = None
    captures: "Deque[Capture]" = None
    extensions: "Deque[Dict[str, Any]]" = None

    def __init__(self, history_size: int, extension_types: Dict[str, type]) -> None:
        self.history_size = history_size
        self.extension_types = extension_types
        self.key_callbacks = []
        self.post_render_callbacks = []
        if self.history_size > 0:
            self.tasks = multiprocessing.JoinableQueue()
            self.results = multiprocessing.Queue()
            self.worker = StatsWorker(self.tasks, self.results, self.history_size)
            self.worker.start()

    def register_key_callback(self, method: Callable[[int], None]) -> None:
        """
        Registers a callback to be called on successful `cv2.waitKey()`
        """
        self.key_callbacks.append(method)

    def register_post_render_callback(self, method: Callable[[int, np.ndarray], None]) -> None:
        """
        Registers a callback to be called after rendering extension data
        """
        self.post_render_callbacks.append(method)

    def update_frames(self, captures: "List[Capture]", extensions: List[Dict[str, Any]]) -> None:
        """
        Update list of currently presented frames and extension data

        The number of sources is fixed by the first non-empty `captures` list; later
        updates replace the oldest entries. An empty `captures` list is ignored
        """
        received = time.perf_counter()
        if len(captures) == 0:
            # Nothing to show; sizing the deques from an empty update would pin them at
            # maxlen=0 forever and `self.captures[0]` below would raise IndexError
            return

        if self.captures is None:
            num_sources = len(captures)
            self.captures = collections.deque(maxlen = num_sources)
            self.extensions = collections.deque(maxlen = num_sources)

        self.captures.extend(captures)
        self.extensions.extend(extensions)

        if self.history_size > 0:
            first_cap = self.captures[0]
            self.tasks.put(CapturePoint(
                first_cap.frame_index,
                first_cap.proc_runtime,
                first_cap.proc_target_fps,
                received,
                [cap.system_timestamp for cap in self.captures]))

    def update_windows(self, framerate: int) -> None:
        """
        Update CV2 windows and process key presses

        `framerate` is for displaying the current framerate in the window title. Set to 0 to disable
        """
        # Local import: only `queue.Empty` is needed, to tell "no new stats yet" apart from real errors
        import queue

        if self.history_size > 0:
            try:
                self.stats = self.results.get_nowait()
            except queue.Empty:
                # No new stats since the last update, keep showing the previous ones
                pass

        if self.captures is not None:
            # Work on snapshots so an `update_frames()` call arriving mid-draw cannot change what is indexed
            # NOTE(review): assumes update_frames may run on another thread - confirm against the caller
            captures = list(self.captures)
            extensions = list(self.extensions)
            stats = self.stats
            display_info = f"| display: {framerate}fps" if framerate > 0 else ""

            for i, cap in enumerate(captures):
                title = f"PoseVis source {cap.stream_id}"
                frame = cv2.cvtColor(cap.frame.copy(), cv2.COLOR_RGB2BGR)
                if len(extensions) > 0:
                    ext = extensions[i]
                    for ext_name in ext:
                        extension_cls = self.extension_types[ext_name]
                        extension_cls.draw_overlay(frame, ExtensionResult(data = ext[ext_name]))
                for callback in self.post_render_callbacks:
                    callback(i, frame)
                cv2.imshow(title, frame)

                if self.history_size > 0 and stats is not None:
                    # `desync` is read at `i - 1` and skipped for the first source
                    desync_string = f" desync: {(stats.desync[i - 1] * 1000):05.2f}ms," if i > 0 and len(stats.desync) > 0 else ""
                    source_info = f": {cap.proc_fps}fps, latency: {(stats.latency * 1000):05.2f}ms, jitter: {(stats.jitter * 1000):05.2f}ms,{desync_string} dropped: {(100 - stats.framedrop):05.2f}%"
                else:
                    source_info = f": {cap.proc_fps}fps"
                cv2.setWindowTitle(title, f"{title} {source_info} {display_info}")

        key = cv2.waitKey(1)
        if key != -1:
            for callback in self.key_callbacks:
                callback(key)

    def cleanup(self) -> None:
        """
        Must be called during program shutdown
        """
        if self.history_size > 0:
            # `None` is sent as the stop signal before waiting for the worker to exit
            self.tasks.put(None)
            self.worker.join()
        cv2.destroyAllWindows()
def _own_registry(cls: type, attr: str):
    """
    Returns the registry `attr` that belongs to `cls` itself

    The registries start out as attributes of the base class. Appending to an inherited
    one would leak the registration into every other subclass, so the first write on a
    class gives it a private copy (seeded with whatever it inherited)
    """
    if attr not in vars(cls):
        inherited = getattr(cls, attr)
        setattr(cls, attr, type(inherited)(inherited))
    return getattr(cls, attr)


def _register_node(cls: type, name: str, _type: type, connection: List[str] = None, config: lg.Config = None) -> None:
    """
    Shared implementation of `add_node` for `DynamicGraph` and `DynamicGroup`
    """
    setattr(cls, name, None)
    # NOTE(review): `__children_types__` is not defined in this file, presumably labgraph provides it per class - confirm
    cls.__annotations__[name] = _type
    cls.__children_types__[name] = _type

    if connection:
        _own_registry(cls, "_connections").append(connection)

    if config:
        _own_registry(cls, "_configs")[name] = config


def _configure_children(module) -> None:
    """
    Calls `configure()` on every child of `module` that has a registered config
    """
    for name, config in type(module)._configs.items():
        getattr(module, name).configure(config)


def _resolve_connections(module) -> lg.Connections:
    """
    Turns the registered `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]` lists into topic pairs
    """
    pairs = []
    for node1_name, topic1_name, node2_name, topic2_name in type(module)._connections:
        node1: lg.Node = getattr(module, node1_name)
        node2: lg.Node = getattr(module, node2_name)
        pairs.append((getattr(node1, topic1_name), getattr(node2, topic2_name)))
    return tuple(pairs)


class DynamicGraph(lg.Graph):

    """
    DynamicGraph, allows you to construct a Graph object based on run-time parameters.
    `class ExampleGraph(DynamicGraph):`

    Use ExampleGraph.add_node() to register nodes.
    The `connection` parameter is expected to be a list with 4 strings:
    `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]`

    Example:
        `ExampleGraph.add_node("TestNode2", TestNode2, ["TestNode2", "INPUT", "TestNode1", "OUTPUT"], TestNode2Config(...))`

    Registrations are stored per subclass, they are not shared between different graphs
    """

    _connections: List[List[str]] = []
    _logger_connections: List[Tuple[str, str, str]] = []
    _configs: dict = {}
    _cls: type = None

    @classmethod
    def add_node(cls, name: str, _type: type, connection: List[str] = None, config: lg.Config = None) -> None:
        """
        Add a node to the graph

        `name`: `str` the node's desired variable name
        `_type`: `type` the node's type
        `connection`: `List[str]` optional, expected to have a length of 4

        Example:
            `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]`

        `config`: `lg.Config` optional, config object to be given to this node during setup
        """
        _register_node(cls, name, _type, connection, config)

    @classmethod
    def add_connection(cls, connection: List[str]) -> None:
        """
        Add a connection between two nodes

        `connection`: `List[str]` expected to have a length of 4

        Example:
            `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]`
        """
        _own_registry(cls, "_connections").append(connection)

    @classmethod
    def add_logger_connection(cls, connection: Tuple[str, str, str]) -> None:
        """
        Add a connection to the logger

        `connection`: `Tuple[str, str, str]`

        Example:
            `(logged stream path, node variable name, node output variable name)`
        """
        _own_registry(cls, "_logger_connections").append(connection)

    def setup(self) -> None:
        _configure_children(self)

    def connections(self) -> lg.Connections:
        return _resolve_connections(self)

    def logging(self) -> Dict[str, lg.Topic]:
        return {
            path: getattr(getattr(self, node_name), topic_name)
            for path, node_name, topic_name in type(self)._logger_connections
        }

    def process_modules(self) -> Tuple[lg.Module, ...]:
        return tuple(getattr(self, name) for name in type(self).__children_types__)


class DynamicGroup(lg.Group):

    """
    DynamicGroup, allows you to construct a Group object based on run-time parameters.
    `class ExampleGroup(DynamicGroup):`

    Use ExampleGroup.add_node() to register nodes.
    The `connection` parameter is expected to be a list with 4 strings:
    `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]`

    Example:
        `ExampleGroup.add_node("TestNode2", TestNode2, ["TestNode2", "INPUT", "TestNode1", "OUTPUT"], TestNode2Config(...))`

    Registrations are stored per subclass, they are not shared between different groups
    """

    _connections: List[List[str]] = []
    _configs: dict = {}

    @classmethod
    def add_node(cls, name: str, _type: type, connection: List[str] = None, config: lg.Config = None) -> None:
        """
        Add a node to the group

        `name`: `str` the node's desired variable name
        `_type`: `type` the node's type
        `connection`: `List[str]` optional, expected to have a length of 4

        Example:
            `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]`

        `config`: `lg.Config` optional, config object to be given to this node during setup
        """
        _register_node(cls, name, _type, connection, config)

    @classmethod
    def add_connection(cls, connection: List[str]) -> None:
        """
        Add a connection between two nodes

        `connection`: `List[str]` expected to have a length of 4

        Example:
            `[Node1Name, Node1TopicName, Node2Name, Node2TopicName]`
        """
        _own_registry(cls, "_connections").append(connection)

    @classmethod
    def add_topic(cls, name: str, topic: lg.Topic) -> None:
        """
        Add a topic object

        `name`: `str` the variable's name
        `topic`: `lg.Topic` the topic object to add
        """
        setattr(cls, name, topic)

    def setup(self) -> None:
        _configure_children(self)

    def connections(self) -> lg.Connections:
        return _resolve_connections(self)
@dataclass
class ExtensionResult:
    """
    Container for whatever a `PoseVisExtension` produces for a single frame

    Attributes:
        `data`: `Any` the extension-specific payload
    """
    data: Any


class PoseVisExtensionBase(ABC):
    """
    Interface every Pose Vis extension has to implement

    Instance methods:
        `register_args(self, parser: ArgumentParser) -> None`

        `check_enabled(self, args: Namespace) -> bool`

        `setup(self) -> None`

        `process_frame(self, frame: np.ndarray) -> ExtensionResult`

        `cleanup(self) -> None`

    Class methods:
        `draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None`

        `check_output(cls, result: ExtensionResult) -> bool`
    """

    @abstractmethod
    def register_args(self, parser: ArgumentParser) -> None:
        """
        Add this extension's command line argument(s) to `parser`

        Runs before the arguments are parsed and before the graph is initialized; the
        registered argument is what lets the user switch the extension on or off
        """
        raise NotImplementedError

    @abstractmethod
    def check_enabled(self, args: Namespace) -> bool:
        """
        Decide from the parsed arguments (`ArgumentParser.parse_args()`) whether this extension is enabled
        """
        raise NotImplementedError

    @abstractmethod
    def setup(self) -> None:
        """
        Hook that runs when the video stream is set up
        """
        return None

    @abstractmethod
    def process_frame(self, frame: np.ndarray) -> ExtensionResult:
        """
        Hook that runs inside a video stream node, once for every frame
        """
        raise NotImplementedError

    @abstractmethod
    def cleanup(self) -> None:
        """
        Hook that runs when the graph shuts down
        """
        return None

    @classmethod
    @abstractmethod
    def draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None:
        """
        Draw `result` onto `frame` when extension results are displayed
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def check_output(cls, result: ExtensionResult) -> bool:
        """
        Let the extension validate its own output

        Runs during test execution
        """
        raise NotImplementedError


class PoseVisExtension(PoseVisExtensionBase):
    """
    The class extensions derive from; adds state that Pose Vis fills in automatically

    Attributes:
        `extension_id`: `int`, a contiguous identifier for each enabled extension

    Methods:
        `set_enabled(self, extension_id: int) -> None`
    """
    extension_id: int

    def set_enabled(self, extension_id: int) -> None:
        """
        Store the identifier assigned to this extension once it passed `check_enabled`
        """
        self.extension_id = extension_id
logger = logging.getLogger(__name__)

# MediaPipe setup: https://google.github.io/mediapipe/solutions/face_detection.html
mp_drawing: DrawingUtilsType = mp.solutions.drawing_utils
mp_drawing_styles: DrawingStylesType = mp.solutions.drawing_styles
mp_face: FaceType = mp.solutions.face_detection
# Object tracking is experimental and only instantiated, its output is not used yet
mp_object: ObjectType = mp.solutions.objectron

# Key points MediaPipe reports for every detected face
_KEYPOINTS_PER_FACE = 6


class FaceDetectionExtension(PoseVisExtension):
    """
    Runs MediaPipe face detection on every frame and draws the detections

    Attributes:
        `face`: `FaceDetection` created in `setup()`
        `object_tracking`: `Objectron` created in `setup()`, experimental and currently unused
    """
    face: Optional[FaceType.FaceDetection]
    object_tracking: Optional[ObjectType.Objectron]

    def register_args(self, parser: ArgumentParser) -> None:
        """
        Registers `--face_detection`, the flag that enables this extension
        """
        parser.add_argument("--face_detection", help="enable the face detection extension", action="store_true", required=False)

    def check_enabled(self, args: Namespace) -> bool:
        return args.face_detection

    def setup(self) -> None:
        self.face = mp_face.FaceDetection()
        # NOTE(review): nothing reads `object_tracking` - consider dropping it to avoid loading a second model
        self.object_tracking = mp_object.Objectron()

    def process_frame(self, frame: np.ndarray) -> ExtensionResult:
        """
        Returns the detections found in `frame`, or an empty list when there are none

        The frame is passed to MediaPipe unchanged
        """
        detections = self.face.process(frame).detections

        # MediaPipe reports "no faces" as None, normalize so consumers can always iterate
        if detections is None:
            detections = []

        return ExtensionResult(data=detections)

    @classmethod
    def draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None:
        """
        Draws every detection in `result.data` onto `frame`
        """
        for detection in result.data:
            mp_drawing.draw_detection(frame, detection)

    @classmethod
    def check_output(cls, result: ExtensionResult) -> bool:
        """
        Checks that at least one face was detected and that every detection carries 6 key points
        """
        if len(result.data) == 0:
            logger.warning(" result is empty")
            return False

        for i, detection in enumerate(result.data):
            # A detection message has no length of its own, the key points live in its location data
            if len(detection.location_data.relative_keypoints) != _KEYPOINTS_PER_FACE:
                logger.warning(f'index {i} in result.data is not proper length')
                return False
        return True

    def cleanup(self) -> None:
        """
        Called when the graph is shut down, nothing to release here
        """
        pass
logger = logging.getLogger(__name__)

# MediaPipe setup: https://google.github.io/mediapipe/solutions/face_mesh.html
mp_drawing: DrawingUtilsType = mp.solutions.drawing_utils
mp_drawing_styles: DrawingStylesType = mp.solutions.drawing_styles
mp_face_mesh: FaceType = mp.solutions.face_mesh

# A face mesh has 468 landmarks; `refine_landmarks=True` adds 10 iris landmarks
_FACE_MESH_LANDMARK_COUNTS = (468, 478)


class FaceMeshExtension(PoseVisExtension):
    """
    Runs MediaPipe face mesh on every frame and draws tesselation, contours and irises

    Attributes:
        `face_mesh`: `FaceMesh` created in `setup()`
    """
    drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)

    face_mesh: Optional[FaceType.FaceMesh]

    def register_args(self, parser: ArgumentParser) -> None:
        """
        Registers `--face_mesh`, the flag that enables this extension
        """
        parser.add_argument("--face_mesh", help="enable face mesh extension", action="store_true", required=False)

    def check_enabled(self, args: Namespace) -> bool:
        return args.face_mesh

    def setup(self) -> None:
        self.face_mesh = mp_face_mesh.FaceMesh(
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5)

    def process_frame(self, frame: np.ndarray) -> ExtensionResult:
        """
        Returns one landmark list per detected face, or an empty list when there are none
        """
        face_landmarks = self.face_mesh.process(frame).multi_face_landmarks

        # MediaPipe reports "no faces" as None, normalize so consumers can always iterate
        if face_landmarks is None:
            face_landmarks = []

        return ExtensionResult(data=face_landmarks)

    @classmethod
    def draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None:
        """
        Draws tesselation, contours and irises of every face in `result.data` onto `frame`
        """
        # (connections, connection style) for each layer; landmarks themselves are not drawn
        layers = (
            (mp_face_mesh.FACEMESH_TESSELATION, mp_drawing_styles.get_default_face_mesh_tesselation_style()),
            (mp_face_mesh.FACEMESH_CONTOURS, mp_drawing_styles.get_default_face_mesh_contours_style()),
            (mp_face_mesh.FACEMESH_IRISES, mp_drawing_styles.get_default_face_mesh_iris_connections_style()),
        )
        for landmark_list in result.data:
            for connections, connection_style in layers:
                mp_drawing.draw_landmarks(
                    image = frame,
                    landmark_list = landmark_list,
                    connections = connections,
                    landmark_drawing_spec = None,
                    connection_drawing_spec = connection_style)

    @classmethod
    def check_output(cls, result: ExtensionResult) -> bool:
        """
        Checks that at least one face was found and that every face has a full set of landmarks
        """
        if len(result.data) == 0:
            logger.warning("result is empty")
            return False

        for i, landmark_list in enumerate(result.data):
            # The landmark list message has no length of its own, count its `landmark` entries
            if len(landmark_list.landmark) not in _FACE_MESH_LANDMARK_COUNTS:
                logger.warning(f"index {i} in result.data is not proper length")
                return False
        return True

    def cleanup(self) -> None:
        """
        Called when the graph is shut down, nothing to release here
        """
        pass
logger = logging.getLogger(__name__)

# MediaPipe setup: https://google.github.io/mediapipe/solutions/hands.html
mp_drawing: DrawingUtilsType = mp.solutions.drawing_utils
mp_drawing_styles: DrawingStylesType = mp.solutions.drawing_styles
mp_hands: HandsType = mp.solutions.hands

# Landmarks MediaPipe reports for every detected hand
_LANDMARKS_PER_HAND = 21


@dataclass
class HandsConfig():
    """
    Options forwarded to `mp.solutions.hands.Hands`
    """
    max_num_hands: int = 2
    model_complexity: int = 0
    # Confidences are fractions, not integers
    min_detection_confidence: float = 0.5
    min_tracking_confidence: float = 0.5


# This class is instantiated by Pose Vis automatically
# You must import this file in extensions/__init__.py for it to be recognized
class HandsExtension(PoseVisExtension):
    """
    Runs MediaPipe hand tracking on every frame and draws the hand landmarks

    Attributes:
        `hands`: `Hands` created in `setup()`
        `config`: `HandsConfig`
    """
    # Optional here since this class will be serialized to each stream node
    # otherwise we'll get a "cannot pickle" AttributeError
    hands: Optional[HandsType.Hands]
    config: HandsConfig

    def __init__(self, config: Optional[HandsConfig] = None) -> None:
        # A default built in the signature would be one object shared by every extension instance
        self.config = HandsConfig() if config is None else config
        super().__init__()

    # Register an argument that allows the user to enable this extension
    def register_args(self, parser: ArgumentParser) -> None:
        parser.add_argument("--hands", help = "enable the hand tracking extension", action = "store_true", required = False)

    # Tell Pose Vis if this extension is enabled or not
    def check_enabled(self, args: Namespace) -> bool:
        return args.hands

    # Called when the stream is initialized
    def setup(self) -> None:
        self.hands = mp_hands.Hands(
            max_num_hands = self.config.max_num_hands,
            model_complexity = self.config.model_complexity,
            min_detection_confidence = self.config.min_detection_confidence,
            min_tracking_confidence = self.config.min_tracking_confidence)

    # Called from `FrameProcessor` on each new frame from the stream
    def process_frame(self, frame: np.ndarray) -> ExtensionResult:
        """
        Returns a dict with `multi_hand_landmarks`, `multi_hand_world_landmarks` and `multi_handedness`

        Every value is a list, empty when MediaPipe found nothing
        """
        mp_results = self.hands.process(frame)

        # Convert different results to a dict for easier access, mapping None to an empty list
        results = {}
        for name in ("multi_hand_landmarks", "multi_hand_world_landmarks", "multi_handedness"):
            value = getattr(mp_results, name)
            results[name] = [] if value is None else value

        return ExtensionResult(data = results)

    @classmethod
    def draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None:
        # Draw the detected hand landmarks onto the image
        for landmark_list in result.data["multi_hand_landmarks"]:
            mp_drawing.draw_landmarks(
                frame,
                landmark_list,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style())

    @classmethod
    def check_output(cls, result: ExtensionResult) -> bool:
        """
        Checks the results of `hands.process()`, assuming that at least one hand is fully visible in the frame
        """
        # `process_frame` always sets the key, so emptiness has to be checked on the list itself
        hand_landmarks = result.data.get("multi_hand_landmarks") or []
        if len(hand_landmarks) == 0:
            logger.warning(" result is empty")
            return False

        for i, landmark_list in enumerate(hand_landmarks):
            if len(landmark_list.landmark) != _LANDMARKS_PER_HAND:
                logger.warning(f" index {i} in result.data is not proper length")
                return False
        return True

    # Called when the graph shuts down
    def cleanup(self) -> None:
        pass
logger = logging.getLogger(__name__)

# MediaPipe setup: https://google.github.io/mediapipe/solutions/holistic.html
mp_drawing: DrawingUtilsType = mp.solutions.drawing_utils
mp_drawing_styles: DrawingStylesType = mp.solutions.drawing_styles
mp_holistic: HolisticType = mp.solutions.holistic

# Landmarks MediaPipe Holistic reports for a fully detected face / pose
_EXPECTED_LANDMARKS = {"face_landmarks": 468, "pose_landmarks": 33}


class HolisticExtension(PoseVisExtension):
    """
    Runs MediaPipe Holistic on every frame and draws the face contours and the pose

    Attributes:
        `holistic`: `Holistic` created in `setup()`
    """
    holistic: Optional[HolisticType.Holistic]

    def register_args(self, parser: ArgumentParser) -> None:
        """
        Registers `--holistic`, the flag that enables this extension
        """
        parser.add_argument('--holistic', help='enable holistic extension', action='store_true', required=False)

    def check_enabled(self, args: Namespace) -> bool:
        return args.holistic

    def setup(self) -> None:
        self.holistic = mp_holistic.Holistic()

    def process_frame(self, frame: np.ndarray) -> ExtensionResult:
        """
        Returns a dict with `face_landmarks` and `pose_landmarks`

        A value is an empty list when MediaPipe did not detect that part
        """
        mp_result = self.holistic.process(frame)

        # Only the parts that are drawn are kept, with None mapped to an empty list
        results = {}
        for name in _EXPECTED_LANDMARKS:
            value = getattr(mp_result, name)
            results[name] = [] if value is None else value

        return ExtensionResult(data=results)

    @classmethod
    def draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None:
        """
        Draws the face contours and the pose stored in `result.data` onto `frame`
        """
        mp_drawing.draw_landmarks(
            image = frame,
            landmark_list = result.data["face_landmarks"],
            connections = mp_holistic.FACEMESH_CONTOURS,
            landmark_drawing_spec = None,
            connection_drawing_spec = mp_drawing_styles.get_default_face_mesh_contours_style())

        mp_drawing.draw_landmarks(
            image = frame,
            landmark_list = result.data["pose_landmarks"],
            connections = mp_holistic.POSE_CONNECTIONS,
            landmark_drawing_spec = mp_drawing_styles.get_default_pose_landmarks_style())

    @classmethod
    def check_output(cls, result: ExtensionResult) -> bool:
        """
        Checks that something was detected and that every detected part has a full set of landmarks
        """
        detected = {name: result.data.get(name) for name in _EXPECTED_LANDMARKS}
        if not any(detected.values()):
            logger.warning('result is empty')
            return False

        for name, landmark_list in detected.items():
            # Parts that were not detected are stored as empty lists and skipped
            if landmark_list and len(landmark_list.landmark) != _EXPECTED_LANDMARKS[name]:
                logger.warning(f'{name} in result.data is not proper length')
                return False
        return True

    def cleanup(self) -> None:
        """
        Called when the graph is shut down, nothing to release here
        """
        pass
logger = logging.getLogger(__name__)

# MediaPipe setup: https://google.github.io/mediapipe/solutions/pose.html
mp_drawing: DrawingUtilsType = mp.solutions.drawing_utils
mp_drawing_styles: DrawingStylesType = mp.solutions.drawing_styles
mp_pose: PoseType = mp.solutions.pose

# Landmarks MediaPipe reports for a detected pose
_POSE_LANDMARKS = 33


@dataclass
class PoseConfig():
    """
    Options forwarded to `mp.solutions.pose.Pose`
    """
    model_complexity: int = 1
    smooth_landmarks: bool = True
    enable_segmentation: bool = False
    smooth_segmentation: bool = True
    # Annotated so they are real dataclass fields and can be set through the constructor
    min_detection_confidence: float = 0.5
    min_tracking_confidence: float = 0.5


class PoseExtension(PoseVisExtension):
    """
    Runs MediaPipe pose estimation on every frame and draws the pose landmarks

    Attributes:
        `pose`: `Pose` created in `setup()`
        `config`: `PoseConfig`
    """
    pose: Optional[PoseType.Pose]
    config: PoseConfig

    def __init__(self, config: Optional[PoseConfig] = None) -> None:
        # A default built in the signature would be one object shared by every extension instance
        self.config = PoseConfig() if config is None else config
        super().__init__()

    # register argument allowing user to run this extension
    def register_args(self, parser: ArgumentParser) -> None:
        parser.add_argument('--pose', help='enable pose estimation extension', action = 'store_true', required= False)

    # check to see if extension is enabled or not
    def check_enabled(self, args: Namespace) -> bool:
        return args.pose

    def setup(self) -> None:
        self.pose = mp_pose.Pose(
            model_complexity = self.config.model_complexity,
            smooth_landmarks = self.config.smooth_landmarks,
            enable_segmentation = self.config.enable_segmentation,
            smooth_segmentation = self.config.smooth_segmentation,
            min_detection_confidence = self.config.min_detection_confidence,
            min_tracking_confidence = self.config.min_tracking_confidence
        )

    def process_frame(self, frame: np.ndarray) -> ExtensionResult:
        """
        Returns a dict with `pose_landmarks` and `pose_world_landmarks`

        A value is an empty list when MediaPipe did not report it
        """
        mp_result = self.pose.process(frame)

        results = {"pose_landmarks": [], "pose_world_landmarks": []}

        # Both entries are filled independently; an `elif` here would drop the world
        # landmarks whenever the image landmarks are present
        if mp_result.pose_landmarks is not None:
            results["pose_landmarks"] = mp_result.pose_landmarks
        if mp_result.pose_world_landmarks is not None:
            results["pose_world_landmarks"] = mp_result.pose_world_landmarks

        return ExtensionResult(data=results)

    @classmethod
    def draw_overlay(cls, frame: np.ndarray, result: ExtensionResult) -> None:
        """
        Draws the pose stored in `result.data["pose_landmarks"]` onto `frame`
        """
        mp_drawing.draw_landmarks(
            frame,
            result.data["pose_landmarks"],
            mp_pose.POSE_CONNECTIONS,
            mp_drawing_styles.get_default_pose_landmarks_style()
        )

    @classmethod
    def check_output(cls, result: ExtensionResult) -> bool:
        """
        Checks that a pose was detected and that it has a full set of landmarks
        """
        # `result.data` is a dict that always has both keys, so look at the landmarks themselves
        pose_landmarks = result.data.get("pose_landmarks")
        if not pose_landmarks:
            logger.warning(' result is empty')
            return False

        if len(pose_landmarks.landmark) != _POSE_LANDMARKS:
            logger.warning(' pose_landmarks in result is not proper length')
            return False
        return True

    def cleanup(self) -> None:
        """
        Called when the graph is shut down, nothing to release here
        """
        pass
# `Vector` is used as the annotation for "an (N, 3) array of XYZ vertices". It is a
# real zero-row array rather than a typing alias because gesture_vis.py reads
# `Vector.shape` as a template, so it must stay an ndarray instance.
Vector = np.ndarray(shape=(0, 3), dtype=float)


class NumpyJSONEncoder(JSONEncoder):
    """
    JSON encoder that writes NumPy arrays as nested lists
    """
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Anything else is left to the base class, which raises TypeError
        return super().default(obj)


class Annotation():
    """
    Handles tracking of hand and gesture data

    `hands` holds the current vertex array of each tracked hand and `gestures`
    maps a label to every vertex array recorded for it
    """
    hands: List[np.ndarray]
    gestures: Dict[str, List[np.ndarray]]

    def __init__(self) -> None:
        # Created per instance: as class-level `[]` / `{}` defaults these
        # containers were shared by every Annotation object
        self.hands = []
        self.gestures = {}

    def set_hand_vertices(self, hand_index: int, vertices: Vector) -> None:
        """
        Sets vertex data for a hand index. If the hand does not exist, it is created
        """
        if len(self.hands) <= hand_index:
            self.hands.append(vertices)
        else:
            self.hands[hand_index] = vertices

    def clear_hand_vertices(self) -> None:
        """
        Clears all hand vertex data
        """
        self.hands.clear()

    def add_gesture_data(self, vertices: Vector, label: str) -> None:
        """
        Adds gesture data to the given label, if the label doesn't exist, it is created
        """
        self.gestures.setdefault(label, []).append(vertices)

    def save_gestures(self, file_path: str) -> None:
        """
        Saves gesture data to the given file path. Must be a full path
        """
        with open(file_path, "w", encoding="utf-8") as output:
            json.dump(self.gestures, output, indent=4, cls=NumpyJSONEncoder)

    def load_gestures(self, file_path: str) -> None:
        """
        Loads gesture data from the given file path. Must be a full path

        A missing file is not an error: the current gestures are left untouched.
        Labels found in the file replace any data already held for that label
        """
        try:
            with open(file_path, "r", encoding="utf-8") as _file:
                gestures: Dict = json.load(_file)
        except FileNotFoundError:
            return
        for label, recordings in gestures.items():
            self.gestures[label] = [np.asarray(vertex_list) for vertex_list in recordings]

    def guess_annotations(self, max_difference_value: float) -> List[Tuple[str, float]]:
        """
        Returns a list of labels and their difference values for each hand based on `max_difference_value`

        The difference is the sum of absolute coordinate differences between the
        hand and a recorded gesture; the recording with the lowest value wins

        If no gesture is found, returns an empty string and the closest difference value
        (`-1` when no gestures are known at all)
        """
        results: List[Tuple[str, float]] = []
        for hand in self.hands:
            best_label = ""
            best_difference = None
            for label, recordings in self.gestures.items():
                for recording in recordings:
                    # Summed sequentially from 0.0 so the value is identical to
                    # a plain accumulating loop
                    difference = sum(
                        (
                            abs(hand[row][col] - value)
                            for row, vertex in enumerate(recording)
                            for col, value in enumerate(vertex)
                        ),
                        0.0,
                    )
                    # Strict comparison keeps the first recording on ties
                    if best_difference is None or difference < best_difference:
                        best_label, best_difference = label, difference
            if best_difference is None:
                results.append(("", -1))
            elif best_difference <= max_difference_value:
                results.append((best_label, best_difference))
            else:
                results.append(("", best_difference))
        return results

    def configure_plot(self) -> Tuple[Figure, Axes]:
        """
        Configures a plot for plotting hand data
        """
        fig = plt.figure()
        ax = fig.add_subplot(projection="3d")
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        return (fig, ax)

    def plot_hand(self, hand_index: int, ax: Axes, drawing_order: List[List[int]], bounds: Tuple[int, int], hand_scale: float = 1.0, xyz_order: Tuple[int, int, int] = (0, 1, 2), xyz_scale: Tuple[int, int, int] = (1, 1, 1)) -> None:
        """
        Plots hand data

        `drawing_order` must be a list of indice lists, where each indice is connected in order

        `bounds` is the size of the 3D scene

        `hand_scale` scales the raw hand data

        `xyz_order` allows switching which values are used for X, Y, and Z in the plot. `0` is X, `1` is Y, and `2` is Z

        `xyz_scale` scales the flipped X, Y, and Z values
        """
        ax.cla()
        ax.set_xlim3d(bounds[0], bounds[1])
        ax.set_ylim3d(bounds[0], bounds[1])
        ax.set_zlim3d(bounds[0], bounds[1])
        vertices = np.asarray(self.hands[hand_index])
        for indices in drawing_order:
            # One polyline per index list: pick its vertices in order, then
            # remap and scale the axes column by column
            world_pos = vertices[list(indices)] * hand_scale
            handx = world_pos[:, xyz_order[0]] * xyz_scale[0]
            handy = world_pos[:, xyz_order[1]] * xyz_scale[1]
            handz = world_pos[:, xyz_order[2]] * xyz_scale[2]
            ax.plot(handx, handy, handz)
+ ], + [ + 32.44, + -13.91, + 33.41 + ], + [ + 11.29, + -16.61, + 55.31 + ], + [ + -1.39, + -5.34, + -1.92 + ], + [ + 34.84, + -1.06, + -0.65 + ], + [ + 27.51, + -0.95, + 26.83 + ], + [ + 12.73, + -6.29, + 49.37 + ], + [ + 2.19, + 17.04, + -1.81 + ], + [ + 30.69, + 18.37, + 5.88 + ], + [ + 25.95, + 15.72, + 29.87 + ], + [ + 14.22, + 13.71, + 50.57 + ], + [ + -1.75, + 33.09, + 8.38 + ], + [ + 21.82, + 33.21, + 13.19 + ], + [ + 20.28, + 29.5, + 34.52 + ], + [ + 2.37, + 27.91, + 47.24 + ] + ] + ], + "Peace": [ + [ + [ + 22.39, + 87.28, + -1.85 + ], + [ + -11.71, + 68.05, + -7.0 + ], + [ + -26.49, + 40.28, + -16.81 + ], + [ + -26.23, + 7.6, + -32.23 + ], + [ + -3.57, + -15.22, + -38.57 + ], + [ + -27.93, + 3.5, + 9.23 + ], + [ + -41.67, + -23.38, + 3.37 + ], + [ + -53.21, + -43.14, + -1.92 + ], + [ + -64.38, + -59.04, + -22.75 + ], + [ + -4.21, + -3.56, + 6.41 + ], + [ + -10.19, + -44.6, + -1.5 + ], + [ + -18.83, + -66.01, + -14.96 + ], + [ + -27.4, + -88.46, + -29.99 + ], + [ + 18.22, + -3.06, + -5.37 + ], + [ + 5.56, + -18.81, + -27.92 + ], + [ + -4.45, + -0.2, + -41.83 + ], + [ + -6.99, + 23.08, + -48.86 + ], + [ + 33.4, + 10.21, + -16.24 + ], + [ + 24.38, + -3.56, + -32.57 + ], + [ + 6.43, + 3.8, + -50.22 + ], + [ + -2.96, + 20.02, + -58.27 + ] + ], + [ + [ + -19.06, + 91.4, + 2.35 + ], + [ + 13.72, + 68.5, + -11.06 + ], + [ + 20.57, + 41.16, + -30.22 + ], + [ + 8.65, + 11.33, + -49.35 + ], + [ + -14.14, + -14.03, + -49.96 + ], + [ + 26.0, + -2.86, + 2.2 + ], + [ + 35.24, + -31.08, + -0.1 + ], + [ + 38.2, + -53.43, + -5.46 + ], + [ + 40.13, + -70.8, + -24.25 + ], + [ + 2.7, + -4.95, + 6.84 + ], + [ + 3.5, + -44.92, + 0.06 + ], + [ + 3.22, + -66.45, + -16.11 + ], + [ + 6.59, + -88.17, + -29.47 + ], + [ + -18.17, + 0.41, + -0.44 + ], + [ + -14.47, + -18.74, + -28.59 + ], + [ + -7.92, + 0.55, + -42.1 + ], + [ + -3.94, + 20.2, + -40.95 + ], + [ + -37.31, + 15.41, + -10.47 + ], + [ + -27.21, + 7.9, + -29.76 + ], + [ + -15.62, + 21.68, + -39.9 + ], + [ + -18.23, + 37.6, 
+ -32.57 + ] + ] + ] +} \ No newline at end of file diff --git a/devices/webcam/pose_vis/gesture/hand/docs/images/adding_poses.png b/devices/webcam/pose_vis/gesture/hand/docs/images/adding_poses.png new file mode 100644 index 00000000..5d9a8e40 Binary files /dev/null and b/devices/webcam/pose_vis/gesture/hand/docs/images/adding_poses.png differ diff --git a/devices/webcam/pose_vis/gesture/hand/docs/images/collecting_data.png b/devices/webcam/pose_vis/gesture/hand/docs/images/collecting_data.png new file mode 100644 index 00000000..67ec190f Binary files /dev/null and b/devices/webcam/pose_vis/gesture/hand/docs/images/collecting_data.png differ diff --git a/devices/webcam/pose_vis/gesture/hand/gesture_vis.py b/devices/webcam/pose_vis/gesture/hand/gesture_vis.py new file mode 100644 index 00000000..b7df92fd --- /dev/null +++ b/devices/webcam/pose_vis/gesture/hand/gesture_vis.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# Windows-specific performance tuning +import os +if os.name == "nt": + # Improve sleep timer resolution for this process on Windows + # https://learn.microsoft.com/en-us/windows/win32/api/timeapi/nf-timeapi-timebeginperiod + import ctypes + winmm = ctypes.WinDLL('winmm') + winmm.timeBeginPeriod(1) + + # Improve device capture startup time on Windows + # https://github.com/opencv/opencv/issues/17687 + os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0" + +import time +import logging +import cv2 +import collections +import numpy as np +import argparse as ap + +from enum import Enum +from typing import List, Tuple, Any +from google.protobuf.json_format import MessageToDict +from pose_vis.utils import parse_sources, parse_resolutions +from pose_vis.utils import absolute_path +from pose_vis.streams.utils.capture_handler import CaptureHandler, AllCapturesFinished +from pose_vis.display import DisplayHandler +from pose_vis.extensions.hands import HandsExtension, HandsConfig +from 
logger = logging.getLogger(__name__)

# The maximum difference value to check for when looking for a known pose
MAX_DIFFERENCE_VALUE = 450

# Key codes handled by `GestureVis.on_key`
_KEY_BACKSPACE = 8
_KEY_ENTER = 13
_KEY_ESCAPE = 27
_KEY_SPACE = 32

# Name of the gesture database file inside the data directory
_GESTURES_FILE = "gestures.json"

class GV_MODE(Enum):
    """
    Input/display state of GestureVis
    """
    # Plain integers: the stray trailing commas used to turn the first two
    # values into one-element tuples
    VISUALIZATION = 0
    LABEL_INPUT = 1
    COLLECTION = 2

class GestureVis():
    """
    Runs hand tracking and gesture recognition for provided sources
    """
    # Quoted so that `str | int` is never evaluated at runtime; inside
    # `List[...]` it raises TypeError on Python versions before 3.10
    sources: "List[str | int]"
    resolutions: List[Tuple[int, int, int]]
    cap_handler: CaptureHandler
    dis_handler: DisplayHandler
    perf: PerfUtility
    annotation: Annotation
    data_dir: str
    export_files: List[str]
    export_format: str
    running: bool = True
    mode: GV_MODE = GV_MODE.VISUALIZATION
    hand_labels: List[str]
    hand_bounds: List[List[int]]
    label_name: str = ""
    label_names: List[str]
    video_writers: List[cv2.VideoWriter]

    def __init__(self, sources: "List[str | int]", resolutions: List[Tuple[int, int, int]], data_dir: str, export_files: List[str], export_format: str) -> None:
        """
        Stores the configuration, opens one video writer per export file and loads known gestures

        `resolutions` entries are read as `(width, height, framerate)`; the writer
        for `export_files[i]` is configured from `resolutions[i]`
        """
        self.sources = sources
        self.resolutions = resolutions
        self.data_dir = data_dir
        self.annotation = Annotation()
        self.export_files = export_files
        self.export_format = export_format
        # Per-instance containers; `label_names` used to be a class-level list
        # shared by every GestureVis object
        self.label_names = []
        self.hand_labels = []
        self.hand_bounds = []
        # source index -> (hand labels, hand bounds, hand vertices) of the latest frame
        self._hands_by_source = {}
        self.video_writers = []
        if len(export_files) > len(resolutions):
            logger.warning(" more export files than resolutions were given, extra export files are ignored")
        for export_file, (width, height, framerate) in zip(self.export_files, self.resolutions):
            self.video_writers.append(cv2.VideoWriter(export_file, cv2.VideoWriter_fourcc(*self.export_format), framerate, (width, height)))
        self.annotation.load_gestures(os.path.join(self.data_dir, _GESTURES_FILE))

    def on_key(self, key: int) -> None:
        """
        Input handling, connected to `DisplayHandler->register_key_callback`

        Switches states based on `GV_MODE`
        """
        if self.mode == GV_MODE.VISUALIZATION:
            if key == _KEY_ENTER:
                self.mode = GV_MODE.LABEL_INPUT
            elif key == _KEY_ESCAPE:
                self.running = False
        elif self.mode == GV_MODE.LABEL_INPUT:
            self._on_label_input_key(key)
        elif self.mode == GV_MODE.COLLECTION:
            self._on_collection_key(key)

    def _on_label_input_key(self, key: int) -> None:
        """
        Edits, confirms or cancels the label being typed
        """
        if key == _KEY_ESCAPE:
            self.mode = GV_MODE.VISUALIZATION
            self.label_name = ""
        elif key == _KEY_BACKSPACE:
            self.label_name = self.label_name[:-1]
        elif key == _KEY_ENTER:
            if len(self.label_name) == 0:
                logger.warning(" label is empty")
                return
            if self.label_name not in self.label_names:
                self.label_names.append(self.label_name)
            self.mode = GV_MODE.COLLECTION
        elif 0 <= key <= 0x10FFFF:
            # `chr` raises ValueError outside the code point range, and
            # control characters have no place in a label
            character = chr(key)
            if character.isprintable():
                self.label_name += character

    def _on_collection_key(self, key: int) -> None:
        """
        Records the current hands under the active label, or leaves collection mode
        """
        if key == _KEY_ESCAPE:
            self.mode = GV_MODE.VISUALIZATION
            self.label_name = ""
        elif key == _KEY_SPACE:
            for vertices in self.annotation.hands:
                self.annotation.add_gesture_data(vertices, self.label_name)
            self.annotation.save_gestures(os.path.join(self.data_dir, _GESTURES_FILE))

    def get_handedness_labels(self, mp_handedness: Any) -> List[str]:
        """
        Puts MediaPipe "handedness" labels into a simple list
        https://google.github.io/mediapipe/solutions/hands.html#multi_handedness
        """
        # `or []` tolerates a missing (None) result when no hand is detected
        return [
            MessageToDict(classification)["classification"][0]["label"]
            for classification in (mp_handedness or [])
        ]

    def get_bounds_data(self, mp_screen_keypoints: Any, mp_world_keypoints: Any, frame: np.ndarray) -> Tuple[List[List[int]], List[np.ndarray]]:
        """
        Gets the screen bounds and vertices for each hand

        Bounds are `[left, top, right, bottom]` in pixels; vertices are one
        `(N, 3)` array per hand
        """
        im_width, im_height = frame.shape[1], frame.shape[0]
        hand_bounds: List[List[int]] = []
        hand_vectors: List[np.ndarray] = []
        for landmark_list_screen, landmark_list_world in zip(mp_screen_keypoints or [], mp_world_keypoints or []):
            # Normalized coordinates scaled to pixels and clamped to the last
            # row/column. The dtype is pinned to int32 because NumPy's default
            # integer width differs between platforms and `cv2.boundingRect`
            # expects 32-bit points.
            screen_points = np.array(
                [
                    [min(int(landmark.x * im_width), im_width - 1), min(int(landmark.y * im_height), im_height - 1)]
                    for landmark in landmark_list_screen.landmark
                ],
                dtype=np.int32,
            ).reshape(-1, 2)

            # We're multiplying by 1000 here to convert meters into millimeters
            hand_vector = np.array(
                [
                    [round(landmark.x * 1000.0, 2), round(landmark.y * 1000.0, 2), round(landmark.z * 1000.0, 2)]
                    for landmark in landmark_list_world.landmark
                ],
                dtype=float,
            ).reshape(-1, Vector.shape[1])
            hand_vectors.append(hand_vector)

            x, y, w, h = cv2.boundingRect(screen_points)
            hand_bounds.append([x, y, x + w, y + h])
        return (hand_bounds, hand_vectors)

    def _select_hands(self, hand_labels: List[str], hand_bounds: List[List[int]], hand_vectors: List[np.ndarray]) -> None:
        """
        Makes the given hands the ones the annotator and the overlay work on
        """
        self.hand_labels = hand_labels
        self.hand_bounds = hand_bounds
        self.annotation.clear_hand_vertices()
        for vdx, vec in enumerate(hand_vectors):
            self.annotation.set_hand_vertices(vdx, vec)

    def draw_hand_annotations(self, source_index: int, frame: np.ndarray) -> None:
        """
        Draws the pose annotation label with the lowest `difference` value
        """
        if self.mode != GV_MODE.VISUALIZATION:
            return

        # Every source is processed before anything is rendered, so the hands
        # recorded for this source are restored first; otherwise each frame
        # would be labelled with the hands of the last source
        snapshot = self._hands_by_source.get(source_index)
        if snapshot is not None:
            self._select_hands(*snapshot)

        annotations = self.annotation.guess_annotations(MAX_DIFFERENCE_VALUE)
        for ann, bounds, hand_label in zip(annotations, self.hand_bounds, self.hand_labels):
            label = "?" if len(ann[0]) == 0 else ann[0]
            annotation_str = f"{hand_label}: {label} ({ann[1]:.2f})"
            text_size = cv2.getTextSize(annotation_str, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            cv2.rectangle(frame, (bounds[0], bounds[1]), (bounds[0] + text_size[0][0] + 2, bounds[1] - text_size[0][1] - 2), (0, 0, 0), -1)
            cv2.putText(frame, annotation_str, (bounds[0] + 1, bounds[1] - 1), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)

        # A source without an export file simply has no writer
        if source_index < len(self.video_writers):
            self.video_writers[source_index].write(frame)

    def _draw_text_lines(self, frame: np.ndarray, lines: List[str]) -> None:
        """
        Draws help text in the top-left corner, one entry of `lines` per row
        """
        for row, text in enumerate(lines):
            cv2.putText(frame, text, (10, 24 + (18 * row)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 1, cv2.LINE_AA)

    def _process_source(self, source_index: int, capture: Any, extension_results: Any) -> None:
        """
        Updates hand state from one source's results and draws the mode help text on its frame
        """
        hands_result = extension_results["HandsExtension"]
        hand_labels = self.get_handedness_labels(hands_result["multi_handedness"])
        hand_bounds, hand_vectors = self.get_bounds_data(hands_result["multi_hand_landmarks"], hands_result["multi_hand_world_landmarks"], capture.frame)
        self._hands_by_source[source_index] = (hand_labels, hand_bounds, hand_vectors)
        self._select_hands(hand_labels, hand_bounds, hand_vectors)

        if self.mode == GV_MODE.LABEL_INPUT:
            self._draw_text_lines(capture.frame, [
                "Define or select label: press to exit",
                f"Specify label name: {self.label_name}_",
                f"Defined labels: {self.label_names}",
                "Press to confirm",
            ])
        elif self.mode == GV_MODE.COLLECTION:
            label_index = self.label_names.index(self.label_name)
            num_points = len(self.annotation.gestures[self.label_name]) if self.label_name in self.annotation.gestures else 0
            self._draw_text_lines(capture.frame, [
                "Collect data points: press to exit",
                f"Label: {self.label_name}, index: {label_index}",
                f"Data points: {num_points}",
                "Press to collect data point",
            ])

    def run(self) -> None:
        """
        Run GestureVis
        """
        # MediaPipe Hands parameters can be found here
        # https://google.github.io/mediapipe/solutions/hands.html#static_image_mode
        hands_config = HandsConfig(model_complexity = 1)

        self.cap_handler = CaptureHandler(self.sources, self.resolutions, [HandsExtension(hands_config)])
        self.dis_handler = DisplayHandler(50, {"HandsExtension": HandsExtension})
        self.perf = PerfUtility()

        self.dis_handler.register_key_callback(self.on_key)
        self.dis_handler.register_post_render_callback(self.draw_hand_annotations)
        self.cap_handler.start_workers()
        num_sources = len(self.sources)
        # Not read anywhere in this module; kept so the attribute still exists
        self.annotation_infos = collections.deque(maxlen = num_sources)

        # `finally` guarantees handlers and writers are released even when a
        # frame raises part-way through the loop
        try:
            while self.running:
                self.perf.update_start()

                try:
                    results = self.cap_handler.get_captures()
                except AllCapturesFinished:
                    self.running = False
                    logger.info(" capture sources have finished playing, exiting")
                    continue
                captures = [results[i][0] for i in range(num_sources)]
                extensions = [results[i][1] for i in range(num_sources)]

                for i in range(num_sources):
                    self._process_source(i, captures[i], extensions[i])

                self.dis_handler.update_frames(captures, extensions)
                self.dis_handler.update_windows(0)

                time.sleep(self.perf.get_remaining_sleep_time(self.resolutions[0][2]))
                self.perf.update_end()
        finally:
            self.cleanup()

    def cleanup(self) -> None:
        """
        Stops capture and display handling and finalizes exported videos
        """
        self.cap_handler.cleanup()
        self.dis_handler.cleanup()
        for writer in self.video_writers:
            writer.release()

parser = ap.ArgumentParser()
parser.add_argument("--sources", type = str, nargs = "*", help = "which sources to stream (url, device id, video, or image directory)", action = "store", required = False)
parser.add_argument("--resolutions", type = str, nargs = "*", help = "specify resolution/framerate per stream; format is :xx (default *:1280x720x30)", action = "store", required = False)
default_dir = f"webcam{os.sep}pose_vis{os.sep}gesture{os.sep}hand{os.sep}data"
parser.add_argument("--data-dir", type = str, nargs = "?", const = default_dir, default = default_dir, help = f"set data directory (default: {default_dir})", action = "store", required = False)
parser.add_argument("--export", type = str, nargs = "*", help = "export annotated stream as video file", action = "store", required = False)
# The help text now names the real default; it used to claim H264
parser.add_argument("--export-format", type = str, nargs = "?", const = "MP4V", default = "MP4V", help = "format to write exported video in (default: MP4V)", action = "store", required = False)

if __name__ == "__main__":
    args = parser.parse_args()

    sources = parse_sources(args.sources)
    resolutions = parse_resolutions(len(sources), args.resolutions if args.resolutions is not None else [])
    export_files = [absolute_path(_file) for _file in (args.export or [])]
    GestureVis(sources, resolutions, absolute_path(args.data_dir), export_files, args.export_format).run()
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from pose_vis.utils import absolute_path\n", + "from pose_vis.gesture.hand.annotation import Annotation, Vector\n", + "\n", + "annotation = Annotation()\n", + "# Two gestures are included in data/gestures.json: \"Peace\" and \"Thumbs Up\"\n", + "annotation.load_gestures(absolute_path(\"hand/data/gestures.json\"))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Adding Hand Vertices\n", + "\n", + "The included gestures are a 21-point list of vertices, and the annotator expects the hands to be as well. It's hard to do this manually, so I recorded two poses with `gesture_vis.py`, and we'll load that data. All units are in milimeters, with their origin being the approximate geometric center of the hand. Find out more at the [MediaPipe documentation](https://google.github.io/mediapipe/solutions/hands.html#multi_hand_world_landmarks)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Thumbs up approximation\n", + "pose_1 = \"\"\"\n", + "[\n", + " [\n", + " 37.49,\n", + " 43.01,\n", + " 78.26\n", + " ],\n", + " [\n", + " 11.91,\n", + " 12.52,\n", + " 69.02\n", + " ],\n", + " [\n", + " 4.26,\n", + " -12.75,\n", + " 62.93\n", + " ],\n", + " [\n", + " -1.0,\n", + " -36.02,\n", + " 44.17\n", + " ],\n", + " [\n", + " 0.84,\n", + " -60.46,\n", + " 24.33\n", + " ],\n", + " [\n", + " -6.4,\n", + " -14.37,\n", + " 8.72\n", + " ],\n", + " [\n", + " -25.42,\n", + " -6.54,\n", + " 8.0\n", + " ],\n", + " [\n", + " -25.98,\n", + " -4.96,\n", + " 34.61\n", + " ],\n", + " [\n", + " -16.45,\n", + " -3.8,\n", + " 66.46\n", + " ],\n", + " [\n", + " -0.36,\n", + " -3.3,\n", + " -1.3\n", + " ],\n", + " [\n", + " -24.56,\n", + " 2.77,\n", + " -4.13\n", + " ],\n", + " [\n", + " -26.39,\n", + " 13.06,\n", + " 25.06\n", + " ],\n", + " [\n", + " -14.63,\n", + " 3.57,\n", + " 49.25\n", + " ],\n", + " [\n", + " 2.75,\n", + " 7.73,\n", + " -7.43\n", + " ],\n", + " [\n", + " -19.48,\n", + " 20.25,\n", + " -7.13\n", + " ],\n", + " [\n", + " -17.46,\n", + " 22.93,\n", + " 20.25\n", + " ],\n", + " [\n", + " -3.18,\n", + " 16.78,\n", + " 40.69\n", + " ],\n", + " [\n", + " 4.67,\n", + " 31.13,\n", + " -2.08\n", + " ],\n", + " [\n", + " -12.25,\n", + " 29.92,\n", + " 2.64\n", + " ],\n", + " [\n", + " -14.05,\n", + " 33.27,\n", + " 24.07\n", + " ],\n", + " [\n", + " -3.9,\n", + " 30.11,\n", + " 37.32\n", + " ]\n", + "]\n", + "\"\"\"\n", + "\n", + "# Peace sign approximation\n", + "pose_2 = \"\"\"\n", + "[\n", + " [\n", + " 24.62,\n", + " 87.31,\n", + " -20.79\n", + " ],\n", + " [\n", + " -8.73,\n", + " 69.12,\n", + " -31.87\n", + " ],\n", + " [\n", + " -20.5,\n", + " 40.68,\n", + " -40.55\n", + " ],\n", + " [\n", + " -15.52,\n", + " 6.57,\n", + " -47.47\n", + " ],\n", + " [\n", + " 8.14,\n", + " -18.77,\n", + " -39.59\n", + " ],\n", + " [\n", + " -30.21,\n", + " 
6.34,\n", + " 0.8\n", + " ],\n", + " [\n", + " -42.02,\n", + " -23.56,\n", + " -2.07\n", + " ],\n", + " [\n", + " -54.18,\n", + " -46.19,\n", + " -6.31\n", + " ],\n", + " [\n", + " -63.11,\n", + " -61.07,\n", + " -25.07\n", + " ],\n", + " [\n", + " -5.75,\n", + " -2.41,\n", + " 6.32\n", + " ],\n", + " [\n", + " -14.64,\n", + " -45.58,\n", + " 4.01\n", + " ],\n", + " [\n", + " -27.03,\n", + " -67.12,\n", + " -9.93\n", + " ],\n", + " [\n", + " -35.28,\n", + " -89.42,\n", + " -22.82\n", + " ],\n", + " [\n", + " 18.98,\n", + " -5.47,\n", + " 0.61\n", + " ],\n", + " [\n", + " 12.47,\n", + " -23.93,\n", + " -22.09\n", + " ],\n", + " [\n", + " 6.73,\n", + " -3.04,\n", + " -36.35\n", + " ],\n", + " [\n", + " 6.22,\n", + " 22.73,\n", + " -37.9\n", + " ],\n", + " [\n", + " 38.65,\n", + " 5.67,\n", + " -8.14\n", + " ],\n", + " [\n", + " 32.73,\n", + " -7.24,\n", + " -23.63\n", + " ],\n", + " [\n", + " 21.98,\n", + " 5.87,\n", + " -37.88\n", + " ],\n", + " [\n", + " 22.24,\n", + " 24.88,\n", + " -34.55\n", + " ]\n", + "]\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import numpy as np\n", + "\n", + "# We'll set the approximate \"thumbs up\" as hand 0\n", + "annotation.set_hand_vertices(0, np.asarray(json.loads(pose_1)))\n", + "\n", + "# and the approximate \"peace sign\" as hand 1\n", + "annotation.set_hand_vertices(1, np.asarray(json.loads(pose_2)))\n", + "\n", + "# When using annotation for videos and real time streaming, you'd use annotation.set_hand_vertices() to update each hand's vertices once per frame\n", + "# while using annotation.clear_hand_vertices() beforehand if needed" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Guessing Gestures\n", + "\n", + "The annotator compares its current hand indices to the saved gesture indices, calculates a difference value (always positive), and will label each hand with the 
appropriate annotation label if the difference value is below the configured value." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Thumbs Up', 332.7499999999999), ('Peace', 394.76000000000005)]\n" + ] + } + ], + "source": [ + "from typing import List, Tuple\n", + "\n", + "# We'll use a max difference value of 450, as we're working in milimeters\n", + "# A list of labels is returned, where each index corresponds to a hand index\n", + "# The values in the list are the label name, and the calculated difference value\n", + "# If no gesture is found, the value is an empty string and the closest difference value\n", + "labels: List[Tuple[str, float]] = annotation.guess_annotations(max_difference_value=450)\n", + "\n", + "print(labels)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting Data\n", + "\n", + "The annotator can also plot its hand data with Matplotlib:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = annotation.configure_plot()\n", + "\n", + "# This is the order to draw and connect vertices in. Take a look at the keypoint ordering in the first cell to get an idea of how this works\n", + "drawing_order = [[0, 1, 2, 3, 4], [0, 5, 6, 7, 8], [5, 9, 10, 11, 12], [9, 13, 14, 15, 16], [13, 17, 18, 19, 20], [17, 0]]\n", + "# This is size for the 3D scene\n", + "bounds = (-100, 100)\n", + "# We need to reorder how the X, Y, and Z variables are used to convert Z to up and Y to forward\n", + "xyz_order = (0, 2, 1)\n", + "# And finally, we need to flip the new forward coordinate as it's backwards\n", + "xyz_scale = (1, 1, -1)\n", + "annotation.plot_hand(0, ax, drawing_order, bounds, xyz_order=xyz_order, xyz_scale=xyz_scale)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# And the second pose\n", + "fig, ax = annotation.configure_plot()\n", + "annotation.plot_hand(1, ax, drawing_order, bounds, xyz_order=xyz_order, xyz_scale=xyz_scale)\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "23a593576959775a19d6469cad78770ba03ee1b7699646fbac2d14539ad9dcf0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/devices/webcam/pose_vis/gesture/hand/readme.md b/devices/webcam/pose_vis/gesture/hand/readme.md new file mode 100644 index 00000000..a63ddf3d --- /dev/null +++ b/devices/webcam/pose_vis/gesture/hand/readme.md @@ -0,0 +1,55 @@ +# Simple Heuristic Gesture Annotation + +[Preview Video](https://i.imgur.com/Le4mvmY.mp4) + +This script takes the the [world hand landmarks](https://google.github.io/mediapipe/solutions/hands.html#multi_hand_world_landmarks) generated by MediaPipe, compares the current frame's data to a known list of poses, and estimates the best fit. + +![Hand Landmarks](https://mediapipe.dev/images/mobile/hand_landmarks.png) + +For estimation, a "difference" value is estimated by comparing each unknown pose's keypoint distance to known pose keypoints, the lowest "difference" value wins. + +There's a few caveats and room for improvement: the estimation algorithm will always check every pose and rank them via sorting by the lowest "difference" value, every frame. 
Distances seem to change based on where the hand is in the frame, but this can be alleviated by having multiple data points for a particular pose. Some poses may benefit from finer tuned estimation, such as ignoring directional tracking. + +## Data Example +An example notebook showing how the hand tracking data is used and represented can be found [here](input_example.ipynb). + +## Running + +Install [PoseVis](https://github.com/Dasfaust/labgraph/blob/hand_tracking/devices/webcam/readme.md) + +Check command line arguments: +``` +python -m pose_vis.gesture.hand.gesture_vis --help +``` + +Run with: +``` +python -m pose_vis.gesture.hand.gesture_vis --sources 0 +``` + +Exporting videos: +``` +python -m pose_vis.gesture.hand.gesture_vis --sources test_video.mp4 --resolutions *:1920x1080x30 --export test_video_annotated.mp4 +``` + +You may specify a codec with: +``` +python -m pose_vis.gesture.hand.gesture_vis ... --export-format h264 +``` +Codec codes can be found [here](https://learn.microsoft.com/en-us/windows/win32/medfound/video-fourccs). + +## Adding Poses + +When the script is running, press `Enter` to add poses: (note: this has only been tested on Windows, CV2 keycodes may differ between platforms.) + +![Adding Poses](https://github.com/Dasfaust/labgraph/blob/hand_tracking/devices/webcam/pose_vis/gesture/hand/docs/images/adding_poses.png) + +Enter a label for this pose and press `Enter` to continue, or `Escape` to exit. In this example, we're adding the label "OK". + +![Collecting Data](https://github.com/Dasfaust/labgraph/blob/hand_tracking/devices/webcam/pose_vis/gesture/hand/docs/images/collecting_data.png) + +Position your hand into the desired pose, and press `Spacebar` to collect a data point. Press `Escape` when finished, and test your pose. 
+ +[Result Preview](https://i.imgur.com/1VnVMlL.mp4) + +Pose data, by default, is stored in `pose_vis/gesture/hand/data`. It consists of a single `gestures.json` file that maps each label name to the list of pose recordings collected for it, where each recording is a list of 21 `[x, y, z]` vertices in millimeters. \ No newline at end of file diff --git a/devices/webcam/pose_vis/gesture/pose/__init__.py b/devices/webcam/pose_vis/gesture/pose/__init__.py new file mode 100644 index 00000000..cd1c80d0 --- /dev/null +++ b/devices/webcam/pose_vis/gesture/pose/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. \ No newline at end of file diff --git a/devices/webcam/pose_vis/gesture/pose/pose_gesture_vis.py b/devices/webcam/pose_vis/gesture/pose/pose_gesture_vis.py new file mode 100644 index 00000000..028981cf --- /dev/null +++ b/devices/webcam/pose_vis/gesture/pose/pose_gesture_vis.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# Windows-specific performance tuning +# from types import Union #! correct import? 
logger = logging.getLogger(__name__)

# Pairs of pose landmarks, one pair per limb segment.
# NOTE(review): judging by the name these are the segments whose length is
# measured for a gesture; the consuming code is outside this view - confirm.
LANDMARK_DISTANCES = [
    (mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.LEFT_ELBOW),
    # Was (LEFT_WRIST, LEFT_WRIST): a landmark paired with itself always
    # measures zero. Now mirrors the right arm's elbow-to-wrist segment.
    # Data recorded before this fix holds a constant 0 in this slot.
    (mp_pose.PoseLandmark.LEFT_ELBOW, mp_pose.PoseLandmark.LEFT_WRIST),
    # TODO: maybe add from wrist to thumb/index finger

    (mp_pose.PoseLandmark.RIGHT_SHOULDER, mp_pose.PoseLandmark.RIGHT_ELBOW),
    (mp_pose.PoseLandmark.RIGHT_ELBOW, mp_pose.PoseLandmark.RIGHT_WRIST),

    (mp_pose.PoseLandmark.LEFT_HIP, mp_pose.PoseLandmark.LEFT_KNEE),
    (mp_pose.PoseLandmark.LEFT_KNEE, mp_pose.PoseLandmark.LEFT_ANKLE),
    # TODO: maybe distance from ankle to toe

    (mp_pose.PoseLandmark.RIGHT_HIP, mp_pose.PoseLandmark.RIGHT_KNEE),
    (mp_pose.PoseLandmark.RIGHT_KNEE, mp_pose.PoseLandmark.RIGHT_ANKLE),
]

# Pairs of pose landmarks running from the end of a limb to its root joint.
# NOTE(review): presumably each pair yields a 3-component direction - confirm
# against the code that consumes this list.
LANDMARK_DIRECTIONS = [
    # TODO: add further directions here
    (mp_pose.PoseLandmark.RIGHT_WRIST, mp_pose.PoseLandmark.RIGHT_SHOULDER),
    (mp_pose.PoseLandmark.LEFT_WRIST, mp_pose.PoseLandmark.LEFT_SHOULDER),
    (mp_pose.PoseLandmark.RIGHT_ANKLE, mp_pose.PoseLandmark.RIGHT_HIP),
    (mp_pose.PoseLandmark.LEFT_ANKLE, mp_pose.PoseLandmark.LEFT_HIP),
]

# The four edges of the torso: shoulder line, hip line and both sides
TORSO_DISTANCE = [
    (mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.RIGHT_SHOULDER),
    (mp_pose.PoseLandmark.LEFT_HIP, mp_pose.PoseLandmark.RIGHT_HIP),
    (mp_pose.PoseLandmark.LEFT_SHOULDER, mp_pose.PoseLandmark.LEFT_HIP),
    (mp_pose.PoseLandmark.RIGHT_SHOULDER, mp_pose.PoseLandmark.RIGHT_HIP),
]

DRAW_DEBUG = False

# TODO: play around with this value
MAX_DIFFERENCE_VALUE = 3


class GV_MODE(Enum):
    """
    Input/display state of the pose gesture visualizer
    """
    # Plain integers: the stray trailing commas used to turn the first two
    # values into one-element tuples
    VISUALIZATION = 0
    LABEL_INPUT = 1
    COLLECTION = 2
# TODO: revisit this class
@dataclass
class AnnotationInfo():
    """
    Annotation state of one source for the most recent frame

    Attributes:
        `pose_labels`: `List[str]` display label per pose, "?" is drawn when an entry is missing
        `pose_bounds`: `List[List[int]]` bounding box per pose as `[x1, y1, x2, y2]` in pixels
        `gesture_data`: `List[np.ndarray]` feature vector per pose as built by `PoseGestureVis.get_bound_data`
        `draw`: `bool` whether `PoseGestureVis.draw_annotations` renders this entry
    """
    pose_labels: List[str]
    pose_bounds: List[List[int]]
    gesture_data: List[np.ndarray]
    draw: bool

class PoseGestureVis():
    """
    Records labelled pose samples and classifies live poses against them

    Attributes:
        `sources`: `List[Union[str, int]]` capture sources handed to `CaptureHandler`
        `resolutions`: `List[Tuple[int, int, int]]` (width, height, framerate) per source
        `cap_handler`: `CaptureHandler` created in `run()`
        `dis_handler`: `DisplayHandler` created in `run()`
        `perf`: `PerfUtility` paces the main loop
        `annotation_infos`: `Deque[AnnotationInfo]` latest annotation per source
        `data_dir`: `str` directory holding `labels.json` and the `<label index>.npy` files
        `export_files`: `List[str]` video output path per source
        `export_format`: `str` four character codec code for exported video
        `running`: `bool` main loop flag
        `mode`: `GV_MODE` current interaction mode
        `pose_bound`: `List[List[int]]` bounds computed for the last processed source
        `label_name`: `str` label currently being typed or collected
        `label_names`: `List[str]` every known label, index-aligned with `label_data`
        `label_data`: `List[np.ndarray]` recorded samples per label, one row per sample
        `video_writers`: `List[cv2.VideoWriter]` one writer per export file
    """

    sources: List[Union[str, int]]
    resolutions: List[Tuple[int, int, int]]
    cap_handler: CaptureHandler
    dis_handler: DisplayHandler
    perf: PerfUtility
    annotation_infos: Deque[AnnotationInfo]
    data_dir: str
    export_files: List[str]
    export_format: str
    running: bool = True
    mode: GV_MODE = GV_MODE.VISUALIZATION
    # Fixed: `list[list[int]]` is evaluated at class creation and raises TypeError on Python 3.8
    pose_bound: List[List[int]]
    label_name: str = ""
    # Fixed: these two used to be class-level `[]` defaults shared by every instance
    label_names: List[str]
    label_data: List[np.ndarray]
    video_writers: List[cv2.VideoWriter]

    def __init__(self, sources: List[Union[str, int]], resolutions: List[Tuple[int, int, int]], data_dir: str, export_files: List[str], export_format: str) -> None:
        """
        Store the configuration, open one video writer per export file and load recorded data
        """
        self.sources = sources
        self.resolutions = resolutions
        self.data_dir = data_dir
        self.export_files = export_files
        self.export_format = export_format
        self.label_names = []
        self.label_data = []
        self.pose_bound = []
        self.video_writers = []
        for i, export_file in enumerate(self.export_files):
            width, height, framerate = self.resolutions[i]
            self.video_writers.append(cv2.VideoWriter(export_file, cv2.VideoWriter_fourcc(*self.export_format), framerate, (width, height)))
        self.load_data()

    @staticmethod
    def _feature_size() -> int:
        """
        Length of one feature vector: one value per distance pair, three per direction pair
        """
        return len(LANDMARK_DISTANCES) + (len(LANDMARK_DIRECTIONS) * 3)

    def load_data(self) -> None:
        """
        Load `labels.json` and every `<label index>.npy` file found in `data_dir`

        Labels without a file start with an empty sample array
        """
        labels_path = os.path.join(self.data_dir, "labels.json")
        if os.path.exists(labels_path):
            with open(labels_path, 'r') as _file:
                self.label_names = json.load(_file)

        # Fixed: the list was sized by the length of the path string, and `[arr] * n` aliased one array
        self.label_data = [np.empty(shape = (0, self._feature_size()), dtype = np.float32) for _ in self.label_names]

        # Nothing has been recorded yet when the directory does not exist
        if not os.path.isdir(self.data_dir):
            return

        for _file in os.listdir(self.data_dir):
            if not _file.endswith('.npy'):
                continue
            try:
                index = int(Path(_file).stem)
            except ValueError:
                logger.warning(f" ignoring data file with non-numeric name: {_file}")
                continue
            if not 0 <= index < len(self.label_data):
                logger.warning(f" ignoring data file without a matching label: {_file}")
                continue
            # Fixed: this used to read the misspelled attribute `self.data_sir`
            self.label_data[index] = np.load(os.path.join(self.data_dir, _file))

    def on_key(self, key: int) -> None:
        """
        Keyboard state machine, registered with the display handler in `run()`

        Key codes: 13 is Enter, 27 is Escape, 8 is Backspace, 32 is Space
        """
        if self.mode == GV_MODE.VISUALIZATION and key == 13:
            self.mode = GV_MODE.LABEL_INPUT
        elif self.mode == GV_MODE.LABEL_INPUT:
            if key == 27:
                self.mode = GV_MODE.VISUALIZATION
                self.label_name = ""
            elif key == 8:
                self.label_name = self.label_name[:-1]
            elif key == 13:
                if len(self.label_name) == 0:
                    logger.warning(" label is empty")
                else:
                    # A new label gets an empty sample array, an existing one is reused
                    if self.label_name not in self.label_names:
                        self.label_names.append(self.label_name)
                        self.label_data.append(np.empty(shape = (0, self._feature_size()), dtype = np.float32))
                    self.mode = GV_MODE.COLLECTION
            else:
                self.label_name += chr(key)
        elif self.mode == GV_MODE.COLLECTION:
            if key == 27:
                self.mode = GV_MODE.VISUALIZATION
                self.label_name = ""
            elif key == 32:
                # Fixed: this used to call `save_gesture_keypoints`, which did not exist
                self.save_pose_keypoints()
        elif key == 27:
            # Escape outside of label input and collection ends the main loop
            self.running = False

    def get_labels(self, mp_lables) -> None:
        """
        Placeholder that only logs the labels it receives
        """
        logger.debug(f" labels - {mp_lables}")

    def get_bound_data(self, mp_screen_keypoints, mp_world_keypoints, frame: np.ndarray) -> Tuple[List[List[int]], List[np.ndarray]]:
        """
        Build the pixel bounding box and the feature vector of every detected pose

        `mp_screen_keypoints` and `mp_world_keypoints` are iterated as parallel sequences
        whose items expose `.landmark`; NOTE(review): confirm this matches what
        `PoseExtension` emits for "pose_landmarks" and "pose_world_landmarks"

        Returns `(pose_bounds, gesture_data)`, both indexed by pose; both are empty when
        either input is `None`
        """
        im_width, im_height = frame.shape[1], frame.shape[0]
        # Fixed: `pose_bounds` was never defined and `gesture_data` only had room for one pose
        pose_bounds: List[List[int]] = []
        gesture_data: List[np.ndarray] = []
        if mp_screen_keypoints is None or mp_world_keypoints is None:
            return (pose_bounds, gesture_data)

        def xyz(landmark) -> Tuple[float, float, float]:
            return (landmark.x, landmark.y, landmark.z)

        num_distances = len(LANDMARK_DISTANCES)
        for pose_index, landmark_list_screen in enumerate(mp_screen_keypoints):
            landmark_list_screen = landmark_list_screen.landmark
            landmark_list_world = mp_world_keypoints[pose_index].landmark

            # Screen landmarks are scaled by the frame size and capped at the last pixel
            # int32 is used because `cv2.boundingRect` expects 32 bit points
            bounds_array = np.array(
                [[min(int(landmark.x * im_width), im_width - 1), min(int(landmark.y * im_height), im_height - 1)] for landmark in landmark_list_screen],
                dtype = np.int32).reshape(-1, 2)

            # Mean torso edge length, used to normalize every distance feature
            torso_size = sum(math.dist(xyz(landmark_list_world[lid1]), xyz(landmark_list_world[lid2])) for lid1, lid2 in TORSO_DISTANCE) / len(TORSO_DISTANCE)
            if torso_size == 0.0:
                # Degenerate landmarks: fall back to raw distances instead of dividing by zero
                torso_size = 1.0

            gesture_distances = np.empty(shape = (self._feature_size(),), dtype = np.float32)
            for ddx, (lid1, lid2) in enumerate(LANDMARK_DISTANCES):
                gesture_distances[ddx] = math.dist(xyz(landmark_list_world[lid1]), xyz(landmark_list_world[lid2])) / torso_size

            for ddx, (lid1, lid2) in enumerate(LANDMARK_DIRECTIONS):
                direction = np.asarray(xyz(landmark_list_world[lid1])) - np.asarray(xyz(landmark_list_world[lid2]))
                length = np.linalg.norm(direction)
                if length > 0:
                    direction = direction / length
                start = num_distances + (ddx * 3)
                gesture_distances[start:start + 3] = direction

            if DRAW_DEBUG:
                for ddx, (lid1, lid2) in enumerate(LANDMARK_DISTANCES):
                    landmark1 = landmark_list_screen[lid1]
                    landmark2 = landmark_list_screen[lid2]

                    sx1 = min(int(landmark1.x * im_width), im_width - 1)
                    sy1 = min(int(landmark1.y * im_height), im_height - 1)
                    cv2.putText(frame, f"({lid1})", (sx1, sy1), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 1, cv2.LINE_AA)
                    sx2 = min(int(landmark2.x * im_width), im_width - 1)
                    sy2 = min(int(landmark2.y * im_height), im_height - 1)
                    cv2.putText(frame, f"({lid2})", (sx2, sy2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 1, cv2.LINE_AA)

                    cv2.putText(frame, f"({gesture_distances[ddx]:.4f})", ((sx1 + sx2) // 2, (sy1 + sy2) // 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 1, cv2.LINE_AA)
                    cv2.line(frame, (sx1, sy1), (sx2, sy2), (192, 192, 192), 1)

            gesture_data.append(gesture_distances)

            x, y, w, h = cv2.boundingRect(bounds_array)
            pose_bounds.append([x, y, x + w, y + h])
        return (pose_bounds, gesture_data)

    def guess_pose(self, source_index, pose_index) -> List[Tuple[int, float]]:
        """
        Compare one live pose against every recorded sample

        Returns `(label index, summed absolute feature difference)` tuples, best match first
        """
        sample = self.annotation_infos[source_index].gesture_data[pose_index]
        differences = []

        for label_id, pose in enumerate(self.label_data):
            if np.ma.size(pose, axis = 0) == 0:
                continue
            # One summed absolute difference per recorded row
            totals = np.abs(np.asarray(pose, dtype = np.float64) - sample).sum(axis = 1)
            differences.extend((label_id, float(total)) for total in totals)
        differences.sort(key = lambda x: x[1])

        return differences

    def draw_annotations(self, source_index: int, frame: np.ndarray) -> None:
        """
        Post-render callback: label every pose of a source and write the frame to its export file
        """
        # Nothing to draw until `run()` has stored an annotation for this source
        if source_index >= len(self.annotation_infos):
            return

        ann_info = self.annotation_infos[source_index]
        if not ann_info.draw:
            return

        for pose_index, bounds in enumerate(ann_info.pose_bounds):
            label = "?" if len(ann_info.pose_labels) <= pose_index else ann_info.pose_labels[pose_index]
            classification = "?"

            differences = self.guess_pose(source_index, pose_index)
            num_guesses = len(differences)

            if num_guesses > 0 and differences[0][1] <= MAX_DIFFERENCE_VALUE:
                classification = self.label_names[differences[0][0]]

            diff_str = f" {differences[0][1]:.2f}" if num_guesses > 0 else ""
            annotation = f"{label}: {classification}{diff_str}"
            text_size = cv2.getTextSize(annotation, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 1)
            cv2.rectangle(frame, (bounds[0], bounds[1]), (bounds[0] + text_size[0][0] + 2, bounds[1] - text_size[0][1] - 2), (0, 0, 0), -1)
            cv2.putText(frame, annotation, (bounds[0] + 1, bounds[1] - 1), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)

        # Sources without an export file are skipped instead of raising IndexError
        if source_index < len(self.video_writers):
            self.video_writers[source_index].write(frame)

    def save_pose_keypoints(self) -> None:
        """
        Append every current feature vector to the active label and persist labels and samples
        """
        label_index = self.label_names.index(self.label_name)
        for ann_info in self.annotation_infos:
            for pose_keypoints in ann_info.gesture_data:
                self.label_data[label_index] = np.append(self.label_data[label_index], [pose_keypoints], axis = 0)

        os.makedirs(self.data_dir, exist_ok = True)
        # Fixed: labels were written to "label.json" while `load_data` reads "labels.json"
        with open(os.path.join(self.data_dir, "labels.json"), "w") as output:
            output.write(json.dumps(self.label_names))
        np.save(os.path.join(self.data_dir, f'{label_index}'), self.label_data[label_index])

    # Name used by the hand-gesture variant of this tool, kept as an alias
    save_gesture_keypoints = save_pose_keypoints

    @staticmethod
    def _draw_text_lines(frame: np.ndarray, lines: List[str]) -> None:
        """
        Draw the given lines below each other in the top-left corner of the frame
        """
        for row, text in enumerate(lines):
            cv2.putText(frame, text, (10, 24 + (18 * row)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 1, cv2.LINE_AA)

    def run(self) -> None:
        """
        Run the capture, annotate and display loop until the user exits or every source finishes
        """
        pose_config = PoseConfig(model_complexity = 0)

        # Fixed: the `PoseConfig` class itself used to be passed and the configured instance was unused
        # NOTE(review): confirm `PoseExtension` takes its config as the first argument
        self.cap_handler = CaptureHandler(self.sources, self.resolutions, [PoseExtension(pose_config)])
        self.dis_handler = DisplayHandler(50, {"PoseExtension": PoseExtension})
        self.perf = PerfUtility()

        self.dis_handler.register_key_callback(self.on_key)
        self.dis_handler.register_post_render_callback(self.draw_annotations)
        self.cap_handler.start_workers()
        num_sources = len(self.sources)
        self.annotation_infos = collections.deque(maxlen = num_sources)

        try:
            while self.running:
                self.perf.update_start()

                try:
                    results = self.cap_handler.get_captures()
                except AllCapturesFinished:
                    self.running = False
                    logger.info(" capture sources have finished playing, exiting")
                    continue
                captures = [results[i][0] for i in range(num_sources)]
                extensions = [results[i][1] for i in range(num_sources)]

                for i in range(num_sources):
                    mp_screen_keypoints = extensions[i]["PoseExtension"]["pose_landmarks"]
                    mp_world_keypoints = extensions[i]["PoseExtension"]["pose_world_landmarks"]

                    capture = captures[i]
                    pose_bounds, gesture_data = self.get_bound_data(mp_screen_keypoints, mp_world_keypoints, capture.frame)
                    self.pose_bound = pose_bounds
                    # Fixed: nothing was ever stored here, so `draw_annotations` indexed an empty deque
                    # Poses carry no per-instance label, hence the empty label list
                    self.annotation_infos.append(AnnotationInfo([], pose_bounds, gesture_data, self.mode == GV_MODE.VISUALIZATION))

                    if self.mode == GV_MODE.LABEL_INPUT:
                        self._draw_text_lines(capture.frame, [
                            "Define or select label: press ESC to exit",
                            f"Specify label name: {self.label_name}_",
                            f"Defined labels: {self.label_names}",
                            "Press ENTER to confirm"])
                    elif self.mode == GV_MODE.COLLECTION:
                        label_index = self.label_names.index(self.label_name)
                        num_points = np.ma.size(self.label_data[label_index], axis = 0)
                        self._draw_text_lines(capture.frame, [
                            "Collect data points: press ESC to exit",
                            f"Label: {self.label_name}, index: {label_index}",
                            f"Data points: {num_points}",
                            "Press SPACE to collect data point"])

                self.dis_handler.update_frames(captures, extensions)
                self.dis_handler.update_windows(0)

                time.sleep(self.perf.get_remaining_sleep_time(self.resolutions[0][2]))
                self.perf.update_end()
        finally:
            # Release capture workers, windows and video writers even when the loop raises
            self.cleanup()

    def cleanup(self) -> None:
        """
        Shut down capture and display handlers and release every video writer
        """
        self.cap_handler.cleanup()
        self.dis_handler.cleanup()
        for writer in self.video_writers:
            writer.release()


parser = ap.ArgumentParser()
parser.add_argument("--sources", type = str, nargs = "*", help = "which sources to stream (url, device id, video, or image directory)", action = "store", required = False)
parser.add_argument("--resolutions", type = str, nargs = "*", help = "specify resolution/framerate per stream; format is INDEX:WIDTHxHEIGHTxFPS (default *:1280x720x30)", action = "store", required = False)
# NOTE(review): this default still points at the hand gesture data directory; confirm whether pose data should live in its own directory
default_dir = f"webcam{os.sep}pose_vis{os.sep}gesture{os.sep}hand{os.sep}data"
parser.add_argument("--data-dir", type = str, nargs = "?", const = default_dir, default = default_dir, help = f"set data directory (default: {default_dir})", action = "store", required = False)
parser.add_argument("--export", type = str, nargs = "*", help = "export annotated stream as video file", action = "store", required = False)
# Fixed: the help text used to claim H264 while the actual default is MP4V
parser.add_argument("--export-format", type = str, nargs = "?", const = "MP4V", default = "MP4V", help = "format to write exported video in (default: MP4V)", action = "store", required = False)


if __name__ == '__main__':
    args = parser.parse_args()

    sources = parse_sources(args.sources)
    resolutions = parse_resolutions(len(sources), args.resolutions if args.resolutions is not None else [])
    export_files = [absolute_path(_file) for _file in args.export] if args.export is not None else []
    PoseGestureVis(sources, resolutions, absolute_path(args.data_dir), export_files, args.export_format).run()
class PerfUtility():
    """
    Utility class to measure function execution time

    Attributes:
        `delta_time`: `float` time in seconds between `update_start()` and `update_end()`
        `last_update_start`: `float` `time.perf_counter()` value when `update_start()` was last called
        `update_timer`: `float` sum of `delta_time` values, reset to 0 once it reaches 1 second
        `update_count`: `int` number of `update_end()` calls since the last reset
        `updates_per_second`: `int` the value of `update_count` at the last reset (the running count before the first reset)
        `averaged`: `bool` whether `update_timer` has reached 1 second at least once

    Functions:
        `update_start(self) -> None`

        `update_end(self) -> None`

        `get_remaining_sleep_time(self, target_update_rate: int) -> float`
    """

    delta_time: float = 0.0
    last_update_start: float = 0.0
    update_timer: float = 0.0
    update_count: int = 0
    updates_per_second: int = 0
    averaged: bool = False

    def update_start(self) -> None:
        """
        Begin profiling function time

        Sets `last_update_start` to the current time
        """
        self.last_update_start = time.perf_counter()

    def update_end(self) -> None:
        """
        End profiling function time

        Sets `delta_time`, `updates_per_second`
        """
        self.delta_time = time.perf_counter() - self.last_update_start
        self.update_timer += self.delta_time
        self.update_count += 1
        # Until a full second has been measured, report the running count
        if not self.averaged:
            self.updates_per_second = self.update_count

        if self.update_timer >= 1.0:
            self.updates_per_second = self.update_count
            self.update_timer = 0
            self.update_count = 0
            self.averaged = True

    def get_remaining_sleep_time(self, target_update_rate: int) -> float:
        """
        Get wait time in seconds for functions repeating at set intervals

        Takes the time elapsed since `update_start()` into account.
        Returns 0.0 when the interval is already used up, and also when
        `target_update_rate` is zero or negative (no pacing requested)
        """
        # Fixed: a rate of 0 used to raise ZeroDivisionError
        if target_update_rate <= 0:
            return 0.0
        target_delta_time = 1.0 / target_update_rate
        actual_delta_time = time.perf_counter() - self.last_update_start
        # Always a float; the old code returned the int 0 when the interval was used up
        return max(0.0, target_delta_time - actual_delta_time)
parser.add_argument("--stats-history-size", type = int, nargs = "?", const = 50, default = 50, help = "how many frames to base performance metrics on, 0 to disable (default: 50)", action = "store", required = False)
parser.add_argument("--logging", help = "enable logging (default: false)", action = "store_true", required = False)
default_dir = f"webcam{os.sep}logs"
parser.add_argument("--log-dir", type = str, nargs = "?", const = default_dir, default = default_dir, help = f"set log directory (default: {default_dir})", action = "store", required = False)
parser.add_argument("--log-name", type = str, help = "set log name (default: random)", action = "store", required = False)
parser.add_argument("--profile", help = "enable profiling with cProfile *source streaming only (default: false)", action = "store_true", required = False)

if __name__ == "__main__":
    # Command line entry point: build the graph from the arguments and run it

    # Instantiate every known extension, then let each one add its own arguments
    extensions = [ext_cls() for ext_cls in PoseVisExtension.__subclasses__()]
    for extension in extensions:
        extension.register_args(parser)

    args = parser.parse_args()

    if args.sources is None and args.replay is None:
        raise ValueError("Please specify sources to stream or a log to replay")

    # Keep only the extensions switched on through their argument
    enabled_extensions = [extension for extension in extensions if extension.check_enabled(args)]

    config = PoseVisConfig(
        extensions = enabled_extensions,
        log_directory = args.log_dir,
        log_name = args.log_name,
        enable_logging = args.logging,
        display_framerate = args.display_framerate,
        stats_history_size = args.stats_history_size)

    if args.replay is not None:
        # Replay a recorded log
        runner = ReplayStreamRunner(config, ReplayStreamRunnerConfig(args.replay))
        runner.build()
        runner.run()
    else:
        # Stream from live sources
        sources = parse_sources(args.sources)
        requested_resolutions = [] if args.resolutions is None else args.resolutions
        resolutions = parse_resolutions(len(sources), requested_resolutions)

        runner = SourceStreamRunner(config, SourceStreamRunnerConfig(sources = sources, resolutions = resolutions))
        runner.build()

        if not args.profile:
            runner.run()
        else:
            with cProfile.Profile() as pr:
                runner.run()
            pstats.Stats(pr).dump_stats(filename = "pose_vis.prof")
            logger.info(" saved 'pose_vis.prof' in the current working directory")
@dataclass
class PoseVisConfig():
    """
    Settings shared by every `PoseVisRunner`

    Attributes:
        `extensions`: `List[PoseVisExtension]` extensions to enable
        `log_directory`: `str` where log files are written
        `log_name`: `Optional[str]` log file name, random when None
        `enable_logging`: `bool` whether the stream output is logged
        `display_framerate`: `int` update rate of the display windows, 0 disables the display
        `stats_history_size`: `int` number of frames used for performance statistics, 0 disables them; unused when `display_framerate` is 0
    """
    extensions: List[PoseVisExtension]
    log_directory: str
    log_name: Optional[str]
    enable_logging: bool
    display_framerate: int
    stats_history_size: int

class PoseVisRunner(ABC):
    """
    Base runner: wires up the parts of the graph every runner shares
    """
    config: PoseVisConfig

    def __init__(self, config: PoseVisConfig) -> None:
        self.config = config

    def build(self) -> None:
        """
        Assemble the `PoseVis` graph
        """
        logger.info(" building graph")
        settings = self.config

        # Each extension is enabled with its position in the list
        for position, extension in enumerate(settings.extensions):
            logger.info(f" enabling extension: {extension.__class__.__name__}")
            extension.set_enabled(position)

        # The configured path is run through `absolute_path` and stored back
        settings.log_directory = absolute_path(settings.log_directory)
        logger.info(f" logging directory is {settings.log_directory}")

        # Derived classes add their stream node here
        self.register_nodes()

        if settings.display_framerate <= 0:
            # Without a display, a termination handler receives the exit signal
            PoseVis.add_node("TERM_HANDLER", TerminationHandler)
            PoseVis.add_connection(["STREAM", "OUTPUT_EXIT", "TERM_HANDLER", "INPUT_EXIT_STREAM"])
        else:
            extension_types = {ext_cls.__name__: ext_cls for ext_cls in PoseVisExtension.__subclasses__()}
            display_config = DisplayConfig(
                target_framerate = settings.display_framerate,
                stats_history_size = settings.stats_history_size,
                extension_types = extension_types)
            PoseVis.add_node("DISPLAY", Display, ["STREAM", "OUTPUT", "DISPLAY", "INPUT"], display_config)
            PoseVis.add_connection(["STREAM", "OUTPUT_EXIT", "DISPLAY", "INPUT_EXIT_STREAM"])

        if settings.enable_logging:
            PoseVis.add_logger_connection(("captures", "STREAM", "OUTPUT"))

    def run(self) -> None:
        """
        Execute the `PoseVis` graph
        """
        # The recording name is only passed on when one was configured
        logger_kwargs = {"output_directory": self.config.log_directory}
        if self.config.log_name:
            logger_kwargs["recording_name"] = self.config.log_name
        logger_config: lg.LoggerConfig = lg.LoggerConfig(**logger_kwargs)

        logger.info(" running graph")
        options = lg.RunnerOptions(logger_config = logger_config)
        lg.ParallelRunner(graph = PoseVis(), options = options).run()

    @abstractmethod
    def register_nodes(self) -> None:
        """
        Hook for derived classes to add their own nodes
        """
        raise NotImplementedError
@dataclass
class BenchmarkRunnerConfig():
    """
    Settings for `BenchmarkRunner`

    Attributes:
        `sources`: `List[Union[int, str]]` forwarded to the source stream
        `resolutions`: `List[Tuple[int, int, int]]` forwarded to the source stream
        `output_path`: `str` forwarded to `BenchmarkConfig` after `absolute_path`
        `output_name`: `str` forwarded to `BenchmarkConfig`
        `run_time`: `int` forwarded to `BenchmarkConfig`
    """
    sources: List[Union[int, str]]
    resolutions: List[Tuple[int, int, int]]
    output_path: str
    output_name: str
    run_time: int

class BenchmarkRunner(PoseVisRunner):
    """
    Adds the `Benchmark` node on top of a source stream to gather performance details
    """
    runner_config: BenchmarkRunnerConfig

    def __init__(self, config: PoseVisConfig, runner_config: BenchmarkRunnerConfig) -> None:
        self.runner_config = runner_config
        super().__init__(config)

    def register_nodes(self) -> None:
        bench = self.runner_config
        bench.output_path = absolute_path(bench.output_path)
        logger.info(f" benchmark output path is {bench.output_path}")

        # The stream node comes from the source runner
        source_settings = SourceStreamRunnerConfig(bench.sources, bench.resolutions)
        SourceStreamRunner(self.config, source_settings).register_nodes()

        benchmark_settings = BenchmarkConfig(bench.output_path, bench.output_name, bench.run_time)
        PoseVis.add_node("BENCHMARK", Benchmark, ["STREAM", "OUTPUT", "BENCHMARK", "INPUT"], benchmark_settings)

        # The exit signal goes to the display when there is one, else to the termination handler
        exit_receiver = "DISPLAY" if self.config.display_framerate > 0 else "TERM_HANDLER"
        PoseVis.add_connection(["BENCHMARK", "OUTPUT_EXIT", exit_receiver, "INPUT_EXIT_USER"])
@dataclass
class ReplayStreamRunnerConfig():
    """
    Settings for `ReplayStreamRunner`

    Attributes:
        `path`: `str` log file to replay
    """
    path: str

class ReplayStreamRunner(PoseVisRunner):
    """
    Registers the `ReplayStream` node so a log file can be replayed
    """
    runner_config: ReplayStreamRunnerConfig

    def __init__(self, config: PoseVisConfig, runner_config: ReplayStreamRunnerConfig) -> None:
        self.runner_config = runner_config
        super().__init__(config)

    def register_nodes(self) -> None:
        # The configured path is run through `absolute_path` and stored back
        replay_path = absolute_path(self.runner_config.path)
        self.runner_config.path = replay_path

        stream_config = ReplayStreamConfig(self.config.extensions, replay_path)
        PoseVis.add_node("STREAM", ReplayStream, config = stream_config)
@dataclass
class SourceStreamRunnerConfig():
    """
    Config for SourceStreamRunner

    Attributes:
        `sources`: `List[Union[int, str]]` capture sources
        `resolutions`: `List[Tuple[int, int, int]]` per source settings; index 2 is the framerate
    """
    sources: List[Union[int, str]]
    resolutions: List[Tuple[int, int, int]]

class SourceStreamRunner(PoseVisRunner):
    """
    Runs the `SourceStream` node to capture video sources
    """
    runner_config: SourceStreamRunnerConfig

    def __init__(self, config: PoseVisConfig, runner_config: SourceStreamRunnerConfig) -> None:
        self.runner_config = runner_config
        super().__init__(config)

    # Fixed: this was annotated `-> int` although nothing is returned and the
    # abstract method in `PoseVisRunner` is declared `-> None`
    def register_nodes(self) -> None:
        """
        Add the "STREAM" node, paced by the slowest source

        Raises `ValueError` when `resolutions` is empty
        """
        # The stream shouldn't run faster than the slowest source
        # `min` replaces sorting the whole list just to read its first entry
        slowest_framerate = min(resolution[2] for resolution in self.runner_config.resolutions)
        PoseVis.add_node("STREAM", SourceStream, config = SourceStreamConfig(
            self.runner_config.sources,
            self.runner_config.resolutions,
            self.config.extensions,
            slowest_framerate))
class CapturePoint():
    """
    Represents a captured frame's point in time
    """
    __slots__ = "frame_index", "runtime", "target_fps", "receive_time", "device_timestamps"

    def __init__(self, frame_index: int, runtime: float, target_fps: int, receive_time: float, device_timestamps: List[float]) -> None:
        self.frame_index = frame_index
        self.runtime = runtime
        self.target_fps = target_fps
        self.receive_time = receive_time
        self.device_timestamps = device_timestamps

class CaptureStats():
    """
    Represents performance statistics
    """
    __slots__ = "framedrop", "latency", "jitter", "desync"

    def __init__(self, framedrop: float, latency: float, jitter: float, desync: List[float]):
        self.framedrop = framedrop
        self.latency = latency
        self.jitter = jitter
        self.desync = desync

class StatsWorker(multiprocessing.Process):
    """
    Processes performance stats without skewing results

    Attributes:
        `tasks`: queue of `CapturePoint` objects, `None` stops the worker
        `results`: queue receiving one `CaptureStats` per processed point
        `history_size`: `int` maximum number of samples kept per statistic
        `timestamps`: recent `(device timestamp, receive time)` pairs of the first source
        `desync_vals`: one history per extra source of its timestamp offset to the first source
        `latency_vals`: recent relative latency values
    """
    tasks: multiprocessing.JoinableQueue
    results: multiprocessing.Queue
    history_size: int
    timestamps: collections.deque
    desync_vals: List[collections.deque]
    latency_vals: collections.deque

    def __init__(self, tasks: multiprocessing.JoinableQueue, results: multiprocessing.Queue, history_size: int) -> None:
        self.tasks = tasks
        self.results = results
        self.history_size = history_size
        super().__init__()

    def setup(self) -> None:
        """
        Create empty histories, called at the start of `run()`
        """
        self.timestamps = collections.deque(maxlen = self.history_size)
        self.latency_vals = collections.deque(maxlen = self.history_size)
        self.desync_vals = []

    def run(self) -> None:
        """
        Turn every queued `CapturePoint` into a `CaptureStats` until `None` is received
        """
        self.setup()
        while True:
            point: CapturePoint = self.tasks.get()
            if point is None:
                self.tasks.task_done()
                break

            extra_sources = 0

            # time created, time received
            self.timestamps.append((point.device_timestamps[0], point.receive_time))
            if len(self.timestamps) > 1:
                extra_sources = len(point.device_timestamps) - 1
                if extra_sources > 0:
                    if len(self.desync_vals) != extra_sources:
                        # Fixed: `[deque(...)] * extra_sources` repeated one shared deque,
                        # so every source appended to the same history
                        self.desync_vals = [collections.deque(maxlen = self.history_size) for _ in range(extra_sources)]
                    for i in range(extra_sources):
                        self.desync_vals[i].append(abs(point.device_timestamps[i + 1] - point.device_timestamps[0]))

                # TODO: this may not be correct
                # `rel_device` can be greater than `rel_receive`, for now we just take the absolute value
                self.latency_vals.append(abs(relative_latency(point.device_timestamps[0], point.receive_time, self.timestamps[0][0], self.timestamps[0][1])))

            # A zero expectation is replaced by 1 to avoid dividing by zero
            expected_frames = round(point.runtime * point.target_fps)
            framedrop = (point.frame_index / (1 if expected_frames == 0 else expected_frames)) * 100

            latency = 0.0
            jitter = 0.0
            desync = []
            # `statistics.stdev` needs at least two samples
            if len(self.latency_vals) > 1:
                latency = statistics.median(self.latency_vals)
                jitter = statistics.stdev(self.latency_vals)

                if extra_sources > 0:
                    desync = [statistics.median(li) for li in self.desync_vals]

            stats = CaptureStats(
                framedrop,
                latency,
                jitter,
                desync)

            self.results.put(stats)
            self.tasks.task_done()
@dataclass
class Capture():
    """
    Represents a single video frame and the capture metadata that travels with it

    Attributes:
        `frame`: `np.ndarray` video frame matrix, shape is (H, W, 3), RGB color space, `np.uint8` datatype
        NOTE: workers construct this with `frame` set to `None`, `CaptureHandler` attaches the array afterwards
        `stream_id`: `int` source index for this capture
        `frame_index`: `int` total frames since startup
        `system_timestamp`: `float` timestamp taken when this frame was read (normally `time.perf_counter()`)
        `proc_delta_time`: `float` time in seconds this frame took to produce
        `proc_runtime`: `float` total runtime for this source
        `proc_fps`: `int` frames per second at the time this frame was produced
        `proc_target_fps`: `int` target frames per second for this source
    """
    frame: np.ndarray
    stream_id: int
    frame_index: int
    system_timestamp: float
    proc_delta_time: float
    proc_runtime: float
    proc_fps: int
    proc_target_fps: int

class CaptureResult(lg.Message):
    """
    Represents the current frame from every capture source

    Attributes:
        `captures`: `List[Capture]` `Capture`s by source index
        `extensions`: `List[Dict[str, Any]]` extension data by source index,
        each dictionary is keyed by extension class name
    """
    captures: List[Capture]
    extensions: List[Dict[str, Any]]

class ExitSignal(lg.Message):
    """
    Passed to `Display` or `TerminationHandler` when a stream wants to close the graph

    Carries no fields, receiving the message is the signal
    """
    pass
def _replay_delay(result: CaptureResult) -> float:
    """
    Returns how long to wait, in seconds, after publishing `result`
    """
    # Messages that carry an explicit `stream_time` keep their original behavior
    stream_time = getattr(result, "stream_time", None)
    if stream_time is not None:
        return stream_time

    # `CaptureResult` as declared in `pose_vis.streams.messages` has no `stream_time` field,
    # so fall back to the recorded production time of the first source's frame
    # NOTE(review): assumes `proc_delta_time` of source 0 approximates the original frame interval, confirm
    captures = getattr(result, "captures", None)
    return captures[0].proc_delta_time if captures else 0.0

class ReplayStreamConfig(lg.Config):
    """
    Config for ReplayStream

    Attributes:
        `extensions`: `List[PoseVisExtension]` extension objects to be executed on each frame
        `log_path`: `str` absolute path to log file
    """
    extensions: List[PoseVisExtension]
    log_path: str

class ReplayStreamState(lg.State):
    """
    State for ReplayStream

    Attributes:
        `perf`: `PerfUtility` utility to track frames per second
        `reader`: `Optional[HDF5Reader]` log file reader utility
    """
    # Using optional here as LabGraph expects state attributes to be initialized
    # We create and assign these objects during setup
    perf: PerfUtility = PerfUtility()
    reader: Optional[HDF5Reader] = None

class ReplayStream(lg.Node):
    """
    Replays log files

    Topics:
        `OUTPUT`: `CaptureResult`
        `OUTPUT_EXIT`: `ExitSignal`

    Attributes:
        `config`: `ReplayStreamConfig`
        `state`: `ReplayStreamState`
    """
    OUTPUT = lg.Topic(CaptureResult)
    OUTPUT_EXIT = lg.Topic(ExitSignal)
    config: ReplayStreamConfig
    state: ReplayStreamState

    @lg.publisher(OUTPUT)
    @lg.publisher(OUTPUT_EXIT)
    async def read_log(self) -> lg.AsyncPublisher:
        """
        Publishes every logged `CaptureResult` in order, then an `ExitSignal`
        """
        for capture in self.state.reader.logs["captures"]:
            self.state.perf.update_start()

            # TODO: run extensions if enabled
            yield self.OUTPUT, capture

            await asyncio.sleep(_replay_delay(capture))

            self.state.perf.update_end()

        logger.info(" log replay finished")
        yield self.OUTPUT_EXIT, ExitSignal()

    def setup(self) -> None:
        logger.info(f" reading log: {self.config.log_path}")
        self.state.reader = HDF5Reader(self.config.log_path, {"captures": CaptureResult})

    def cleanup(self) -> None:
        pass
class SourceStreamConfig(lg.Config):
    """
    Config for `SourceStream`

    Attributes:
        `sources`: `List[Union[int, str]]` sources to initialize
        `resolutions`: `List[Tuple[int, int, int]]` one resolution per source
        NOTE: `SourceStream` will run at the lowest provided framerate
        `extensions`: `List[PoseVisExtension]` extension instances to be initialized in each source
        `target_framerate`: `int` framerate this node tries to hold
    """
    sources: List[Union[int, str]]
    resolutions: List[Tuple[int, int, int]]
    extensions: List[PoseVisExtension]
    target_framerate: int

class SourceStreamState(lg.State):
    """
    State for `SourceStream`

    Attributes:
        `handler`: `Optional[CaptureHandler]` assigned during `setup`
        `perf`: `PerfUtility`
    """
    handler: Optional[CaptureHandler] = None
    perf: PerfUtility = PerfUtility()

class SourceStream(lg.Node):
    """
    Collects a frame from every source and publishes them together as a `CaptureResult`

    Topics:
        `OUTPUT`: publishes `CaptureResult`
        `OUTPUT_EXIT` publishes `ExitSignal`
    """
    OUTPUT = lg.Topic(CaptureResult)
    OUTPUT_EXIT = lg.Topic(ExitSignal)
    config: SourceStreamConfig
    state: SourceStreamState

    def setup(self) -> None:
        cfg = self.config
        self.state.handler = CaptureHandler(cfg.sources, cfg.resolutions, cfg.extensions)

    @lg.publisher(OUTPUT)
    @lg.publisher(OUTPUT_EXIT)
    async def read_sources(self) -> lg.AsyncPublisher:
        num_sources = len(self.config.sources)
        # String sources need the node itself to pace the loop, see below
        has_video_source = any(isinstance(source, str) for source in self.config.sources)

        self.state.handler.start_workers()

        while True:
            try:
                self.state.perf.update_start()

                captured = self.state.handler.get_captures()
                frames = [captured[idx][0] for idx in range(num_sources)]
                extension_data = [captured[idx][1] for idx in range(num_sources)]
                yield self.OUTPUT, CaptureResult(frames, extension_data)

                if not has_video_source:
                    # When capturing from a device, CV2 VideoCapture will block to keep the configured framerate
                    # a small sleep period keeps the node from stalling
                    await asyncio.sleep(0.002)
                else:
                    # This gives a more accurate sleep period on Windows without stalling the node
                    # Doesn't seem to hurt on other systems
                    wait_time = self.state.perf.get_remaining_sleep_time(self.config.target_framerate)
                    wait_start = time.perf_counter()
                    while time.perf_counter() - wait_start < wait_time:
                        await asyncio.sleep(0.001)

                self.state.perf.update_end()
            except AllCapturesFinished:
                logger.info(" all captures have finished")
                break
            except Exception:
                logger.critical(traceback.format_exc())
                logger.critical(" an exception occurred in a source thread or process, exiting")
                break

        yield self.OUTPUT_EXIT, ExitSignal()

    def cleanup(self) -> None:
        self.state.handler.cleanup()
class AllCapturesFinished(Exception):
    """
    Raised when all captures finish (e.g video files are done playing, image directories are processed)
    """
    pass

class CaptureHandler():
    """
    Handles capturing video frames via `CaptureWorker` objects

    One worker process, one pair of pipes and one shared memory block named `pv_worker_<index>`
    are created per source
    """
    sources: List[Union[int, str]]
    resolutions: List[Tuple[int, int, int]]
    extensions: List[PoseVisExtension]
    shared_mems: List[shared_memory.SharedMemory]
    workers: List[CaptureWorker]
    # PipeConnection on Windows, different on Linux/Mac
    connections: List[Tuple[Any, Any]]
    finished_captures: List[int]
    num_sources: int = 0

    def __init__(self, sources: List[Union[int, str]], resolutions: List[Tuple[int, int, int]], extensions: List[PoseVisExtension]) -> None:
        self.sources = sources
        self.resolutions = resolutions
        self.extensions = extensions

        # These containers are created per instance: as class-level lists they were
        # shared (and kept growing) across every `CaptureHandler` in the process
        self.shared_mems = []
        self.workers = []
        self.connections = []
        self.finished_captures = []

        self.num_sources = len(self.sources)
        for i in range(self.num_sources):
            width, height = self.resolutions[i][0], self.resolutions[i][1]
            # One (H, W, 3) uint8 frame
            size = int(height) * int(width) * 3 * np.dtype(np.uint8).itemsize
            self.shared_mems.append(self._create_shared_memory(f"pv_worker_{i}", size))

            outbound = multiprocessing.Pipe()
            inbound = multiprocessing.Pipe()
            self.connections.append((outbound[0], inbound[0]))
            self.workers.append(CaptureWorker(
                (outbound[1], inbound[1]),
                self.extensions,
                self.sources[i],
                np.asarray(self.resolutions[i]),
                i))

    @staticmethod
    def _create_shared_memory(name: str, size: int) -> shared_memory.SharedMemory:
        """
        Creates the named shared memory block, replacing a stale one where the platform allows it
        """
        try:
            return shared_memory.SharedMemory(create = True, size = size, name = name)
        except FileExistsError:
            if os.name == "nt":
                # This can happen if the program crashes and leaves a stray processes running
                # Windows will automatically delete the file when it's not in use by a process
                # https://github.com/python/cpython/issues/85059
                logger.critical(" shared memory file already exists. Please terminate any stray Python processes and try again")
                raise
            stale = shared_memory.SharedMemory(name = name)
            stale.close()
            stale.unlink()
            # Unlink and re-create incase the array size has changed
            return shared_memory.SharedMemory(create = True, size = size, name = name)

    def start_workers(self) -> None:
        for worker in self.workers:
            worker.start()

    def get_captures(self) -> List[Tuple[Capture, Dict[str, Any], bool]]:
        """
        Requests one frame from every unfinished source and returns the replies

        Raises:
            `AllCapturesFinished` once every source has reported that it is done
        """
        if len(self.finished_captures) == self.num_sources:
            raise AllCapturesFinished

        active = [i for i in range(self.num_sources) if i not in self.finished_captures]

        # Request from every active source first so the workers capture concurrently
        for i in active:
            self.connections[i][0].send(True)
        result: List[Tuple[Capture, Dict[str, Any], bool]] = [self.connections[i][1].recv() for i in active]

        # Replies arrive in `active` order, pair each one with its own source index so the
        # frame is read with that source's resolution from that source's shared memory block
        for i, res in zip(active, result):
            width, height = self.resolutions[i][0], self.resolutions[i][1]
            res[0].frame = np.ndarray(shape = (height, width, 3), dtype = np.uint8, buffer = self.shared_mems[i].buf)[:]

        for res in result:
            if res[2]:
                self.finished_captures.append(res[0].stream_id)

        return result

    def cleanup(self) -> None:
        # `False` tells a worker to leave its loop
        for con in self.connections:
            con[0].send(False)
        for worker in self.workers:
            worker.join()
        for shm in self.shared_mems:
            shm.close()
            shm.unlink()
# https://docs.opencv.org/4.2.0/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56
SUPPORTED_IMG_EXTENSIONS = [".bmp", ".jpeg", ".jpg", ".png", ".webp"]

class CaptureWorker(multiprocessing.Process):
    """
    Handles setup of CV2 VideoCapture, frame indexing, runtime tracking, and communication with its parent process

    `connections[0]` receives instructions (`True` to capture a frame, `False` to stop),
    `connections[1]` sends back `(Capture, extension results, capture finished)`.
    Pixel data is written to the shared memory block `pv_worker_<worker_number>`
    """
    connections: Tuple[Any, Any]
    shared_mem: shared_memory.SharedMemory
    extensions: List[PoseVisExtension]
    worker_number: int
    cap_source: Union[int, str]
    resolution: np.ndarray
    images: List[str]
    capture_finished: bool = False
    frame_index: int = 0
    start_time: float = 0.0
    blank_frame: Optional[np.ndarray] = None
    capture: Optional[cv2.VideoCapture] = None
    perf: Optional[PerfUtility] = None

    def __init__(self, connections: Tuple[Any, Any], extensions: List[PoseVisExtension], cap_source: Union[int, str], resolution: np.ndarray, worker_number: int):
        self.connections = connections
        self.extensions = extensions
        self.cap_source = cap_source
        self.resolution = resolution
        self.worker_number = worker_number
        # Per-instance list: a class-level `[]` would be shared by every worker created in this process
        self.images = []
        super().__init__()

    def setup(self) -> None:
        self.perf = PerfUtility()

        self.shared_mem = shared_memory.SharedMemory(name = f"pv_worker_{self.worker_number}")
        self.blank_frame = np.zeros(shape = (self.resolution[1], self.resolution[0], 3), dtype = np.uint8)

        self.open_capture()

        for ext in self.extensions:
            logger.info(f" worker {self.worker_number}: setting up extension {ext.__class__.__name__}")
            ext.setup()
        logger.info(f" worker {self.worker_number}: started")

    def open_capture(self) -> None:
        """
        Opens a `cv2.VideoCapture` for device, file and GStreamer sources, or indexes an image directory
        """
        source = self.cap_source
        if isinstance(source, int) or os.path.isfile(source) or " ! " in source:
            logger.info(f" opening source [{source}]")

            backend = cv2.CAP_MSMF if os.name == "nt" else cv2.CAP_V4L2
            # A string that is not a path is handed to GStreamer
            if isinstance(source, str) and not is_path(source):
                backend = cv2.CAP_GSTREAMER

            self.capture = cv2.VideoCapture(source, backend)

            if self.capture.isOpened():
                if backend != cv2.CAP_GSTREAMER:
                    self.capture.set(cv2.CAP_PROP_FRAME_WIDTH, self.resolution[0])
                    self.capture.set(cv2.CAP_PROP_FRAME_HEIGHT, self.resolution[1])
                    self.capture.set(cv2.CAP_PROP_FPS, self.resolution[2])
                    self.capture.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc("M", "J", "P", "G"))
                    self.capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
                    self.capture.set(cv2.CAP_PROP_AUTOFOCUS, 0)
                logger.info(f" source [{source}] opened with backend: {self.capture.getBackendName()}")
            else:
                logger.warning(f" source [{source}] could not be opened")

        else:
            logger.info(f" opening directory: {source}")
            # `os.listdir` order is arbitrary, sort so images are streamed in a stable order
            for _file in sorted(os.listdir(source)):
                ext = os.path.splitext(_file)[1]
                if ext.lower() in SUPPORTED_IMG_EXTENSIONS:
                    self.images.append(os.path.join(source, _file))
            logger.info(f" found {len(self.images)} image(s)")

    def read_capture(self) -> Tuple[np.ndarray, float]:
        """
        Returns the next frame and the `time.perf_counter()` value at which it was obtained

        A copy of the blank frame is returned when no frame could be produced
        """
        num_images = len(self.images)

        if self.capture is not None and self.capture.isOpened():
            success, frame = self.capture.read()
            timestamp = time.perf_counter()

            if success:
                if isinstance(self.cap_source, str):
                    if frame.shape != (self.resolution[1], self.resolution[0], 3):
                        frame = cv2.resize(frame, (self.resolution[0], self.resolution[1]), interpolation = cv2.INTER_NEAREST)
                frame = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)

                frame.flags.writeable = False
                self.frame_index += 1
                return (frame, timestamp)

            elif isinstance(self.cap_source, str):
                # A string source that stops delivering frames has finished playing
                self.capture_finished = True
            else:
                logger.warning(f" source {self.cap_source} gave unsuccessful grab()")

        elif num_images > 0 and self.frame_index < num_images:
            if self.frame_index + 1 >= num_images:
                self.capture_finished = True

            image_path = self.images[self.frame_index]
            self.frame_index += 1
            frame = cv2.imread(image_path, cv2.IMREAD_COLOR)
            if frame is not None:
                # NOTE(review): images are not resized, the parent reads shared memory using the
                # configured resolution, so presumably image size must match it; confirm
                frame = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
                return (frame, time.perf_counter())
            # `cv2.imread` yields no image for unreadable files, publish a blank frame instead of crashing
            logger.warning(f" image [{image_path}] could not be read")

        # Same clock as every other return path so timestamps stay comparable
        return (self.blank_frame.copy(), time.perf_counter())

    def cleanup(self) -> None:
        if self.capture is not None and self.capture.isOpened():
            self.capture.release()
            logger.info(f" source [{self.cap_source}] released")
        logger.info(f" worker {self.worker_number}: shutting down")
        for ext in self.extensions:
            ext.cleanup()

    def run(self) -> None:
        self.setup()

        while True:
            if self.start_time == 0:
                self.start_time = time.perf_counter()

            self.perf.update_start()
            # Blocks until the parent asks for a frame (`True`) or for shutdown (`False`)
            instruction: bool = self.connections[0].recv()

            if not instruction:
                break

            frame, timestamp = self.read_capture()
            # Pixel data goes through shared memory, only metadata is sent over the pipe
            dst = np.ndarray(shape = frame.shape, dtype = frame.dtype, buffer = self.shared_mem.buf)
            dst[:] = frame[:]
            cap = Capture(
                None,
                self.worker_number,
                self.frame_index,
                timestamp,
                self.perf.delta_time if self.perf.delta_time > 0 else 1 / self.resolution[2],
                time.perf_counter() - self.start_time,
                self.perf.updates_per_second,
                self.resolution[2])

            ext_results = {}
            for ext in self.extensions:
                ext_result = ext.process_frame(frame)
                ext_results[ext.__class__.__name__] = ext_result.data

            self.connections[1].send((cap, ext_results, self.capture_finished))

            self.perf.update_end()

        self.cleanup()
class TerminationHandlerState(lg.State):
    """
    State for TerminationHandler

    Attributes:
        `signal_received`: `bool` set once any `ExitSignal` has arrived
    """
    signal_received: bool = False

class TerminationHandler(lg.Node):
    """
    Ends the graph with `lg.NormalTermination` once an `ExitSignal` arrives

    Topics:
        `INPUT_EXIT_STREAM`: `ExitSignal`
        `INPUT_EXIT_USER`: `ExitSignal`

    Attributes:
        `state`: `TerminationHandlerState`
    """
    INPUT_EXIT_STREAM = lg.Topic(ExitSignal)
    INPUT_EXIT_USER = lg.Topic(ExitSignal)
    state: TerminationHandlerState

    @lg.subscriber(INPUT_EXIT_STREAM)
    async def on_exit_stream(self, _: ExitSignal) -> None:
        self.state.signal_received = True

    @lg.subscriber(INPUT_EXIT_USER)
    async def on_exit_user(self, _: ExitSignal) -> None:
        self.state.signal_received = True

    @lg.main
    def on_main(self) -> None:
        # Poll the flag set by the subscribers above
        while not self.state.signal_received:
            time.sleep(0.1)
        raise lg.NormalTermination
class TestSubscriberConfig(lg.Config):
    """
    Config for `TestSubscriber`

    Attributes:
        `enabled_extensions`: `List[PoseVisExtension]` extensions whose output is validated
    """
    enabled_extensions: List[PoseVisExtension]

class TestSubscriber(lg.Node):
    """
    Validates the extension data of the first source on every `CaptureResult`
    """
    INPUT = lg.Topic(CaptureResult)
    config: TestSubscriberConfig

    @lg.subscriber(INPUT)
    def on_message(self, message: CaptureResult) -> None:
        for extension in self.config.enabled_extensions:
            ext_name = extension.__class__.__name__
            reported = ExtensionResult(message.extensions[0][ext_name])
            assert extension.check_output(reported)
            logger.info(f" {ext_name} reported data is OK")

class TestPoseVis(unittest.TestCase):
    def test_all(self):
        """
        Streams the bundled image directory through `HandsExtension` and checks its output
        """
        extensions = [HandsExtension()]
        log_directory = absolute_path(f"webcam{os.sep}logs")
        image_directory = absolute_path(f"webcam{os.sep}images")

        config = PoseVisConfig(
            extensions=extensions,
            log_directory=log_directory,
            log_name=None,
            enable_logging=False,
            display_framerate=0,
            stats_history_size=0)

        runner_config = SourceStreamRunnerConfig(
            sources=[image_directory],
            resolutions=[(3186, 5184, 30)])

        runner = SourceStreamRunner(config, runner_config)
        runner.build()

        # Attach the validating subscriber to the stream output before running the graph
        PoseVis.add_node(
            name="TEST_SUBSCRIBER",
            _type=TestSubscriber,
            connection=["STREAM", "OUTPUT", "TEST_SUBSCRIBER", "INPUT"],
            config=TestSubscriberConfig(extensions))

        runner.run()

if __name__ == "__main__":
    unittest.main()
def absolute_path(path: str) -> str:
    """
    Returns `path` unchanged when it is already absolute, otherwise resolves it
    against the parent of the current working directory

    The parent directory of the resulting path is created if it does not exist
    """
    _path = path
    # Only relative paths are re-rooted. The previous `not A or not B` test was true
    # for practically every input and only worked because `os.path.join` discards the
    # prefix when its second argument is absolute
    if not is_path(_path):
        _path = os.path.join(os.path.dirname(os.getcwd()), _path)
    Path(os.path.dirname(_path)).mkdir(parents = True, exist_ok = True)
    return _path

def is_path(what: str) -> bool:
    """
    Returns whether `what` looks like an absolute path (leading `/` or a drive letter such as `C:`)
    """
    return what.startswith("/") or re.match(r'[a-zA-Z]:', what) is not None

def relative_latency(cur_device_time: float, cur_receive_time: float, first_device_time: float, first_receive_time: float) -> float:
    """
    Calculate relative latency value for current set of times

    This is the time elapsed on the receiving side minus the time elapsed on the device side,
    both measured from the first sample
    """
    return (cur_receive_time - first_receive_time) - (cur_device_time - first_device_time)

def parse_sources(input: List[str]) -> List[Union[str, int]]:
    """
    Parse a list of string sources into ints or full paths to files or directories
    """
    sources = []
    for arg in input:
        if arg.isdigit():
            # Device index
            sources.append(int(arg))
        elif " ! " in arg:
            # This should be a GStreamer string so we'll just pass it through
            sources.append(arg)
        else:
            sources.append(absolute_path(arg))
    return sources

def parse_resolutions(num_sources: int, resolutions: List[str], default_resolution: Tuple[int, int, int] = (1280, 720, 30)) -> List[Tuple[int, int, int]]:
    """
    Convert a list of strings in format 'id:WxHxFPS' to a list of tuples

    Output will match `num_sources` in length. `default_resolution` will be placed where there is none provided in `resolutions` for that index

    `default_resolution` will be overridden by any entry with `*` as its id
    """
    default_res = None
    output = [None] * num_sources
    for entry in resolutions:
        colon_split = entry.split(":")
        x_split = colon_split[1].split("x")
        stream_id = -1 if colon_split[0] == "*" else int(colon_split[0])
        resolution = (int(x_split[0]), int(x_split[1]), int(x_split[2]))
        if stream_id > -1:
            output[stream_id] = resolution
        else:
            # Wildcard entry, the last one given wins
            default_res = resolution

    if default_res is None:
        default_res = default_resolution
    return [default_res if res is None else res for res in output]
Out of the box, PoseVis streams camera, video file, and image directory sources from the `MSMF` backend in Windows, and `V4L2` backend in Linux, with the MJPEG format ([see OpenCV backends here](https://docs.opencv.org/3.4/d0/da7/videoio_overview.html)). This configuration should be supported by most [UVC](https://en.wikipedia.org/wiki/USB_video_device_class) devices. Further source stream customization can be achieved by installing [GStreamer](https://gstreamer.freedesktop.org/); steps are detailed below. + +## PoseVis General Setup + +Requires Python 3.8 or later. Assuming an installation in your user directory, run `setup.py` to install required packages from PyPi: + +Linux: + + cd ~/labgraph/devices/webcam + python3 setup.py install + +Windows: + + cd %HOMEPATH%\labgraph\devices\webcam + python setup.py install + +See [Using PoseVis](#using-posevis) for usage details. + +## GStreamer Support (Optional) + +[GStreamer](https://gstreamer.freedesktop.org/) is a multimedia framework that allows you to create your own media pipelines with a simple string input. If you need more flexibility than a simple MJPEG stream, you can install GStreamer using the steps below. + +### Example GStreamer Configurations + +PoseVis expects color formats from GStreamer to be in the `BGR` color space, and OpenCV requires the use of [appsink](https://gstreamer.freedesktop.org/documentation/app/appsink.html?gi-language=c). + +Creating a test source: this configuration creates the [videotestsrc](https://gstreamer.freedesktop.org/documentation/videotestsrc/index.html?gi-language=c) element and configures a 720p @ 30Hz stream in BGR. + + python -m pose_vis.pose_vis --sources "videotestsrc ! video/x-raw, width=1280, height=720, framerate=30/1, format=BGR ! 
appsink" + +Creating a device source in Linux: this configuration captures an MJPEG stream at 720p @ 30Hz from a [V4L2 device](https://gstreamer.freedesktop.org/documentation/video4linux2/v4l2src.html?gi-language=c) and [converts](https://gstreamer.freedesktop.org/documentation/videoconvertscale/videoconvert.html?gi-language=c) the image format into raw BGR. + + python -m pose_vis.pose_vis --sources "v4l2src device=/dev/video0 ! image/jpeg, width=1280, height=720, framerate=30/1 ! jpegparse ! jpegdec ! videoconvert ! video/x-raw, format=BGR ! appsink" + +You can also specify [per-camera configurations](https://gstreamer.freedesktop.org/documentation/video4linux2/v4l2src.html?gi-language=c#v4l2src:extra-controls): + + ... --sources "v4l2src device=/dev/video0 extra-controls='c, exposure_auto=1' ... + +### Windows GStreamer Support + +Follow the [Windows GStreamer guide](windows_gstreamer.md). + +### Linux GStreamer Support + +Follow the [Linux GStreamer guide](linux_gstreamer.md). + +## Performance + +Performance is crucial for real time applications. Check the [benchmark notebook](benchmark.ipynb) example for performance metrics, including details of the system used for benchmarking. You can also run the notebook on your system to get an idea of how PoseVis will perform. + +## Using PoseVis + +### Test PoseVis via Command Line + +Check usage details: + + python -m pose_vis.pose_vis --help + +### Using PoseVis in Your Project + +Check the [usage guide](using_posevis.md) for an in-depth overview of the concepts used in PoseVis and how to hook into its LabGraph topics. + +### PoseVis Usage Examples + +#### GestureVis + +GestureVis uses data from the MediaPipe hand and body pose extensions to guess the current gesture based on a list of known gestures and draws the appropriate annotations onto the video stream, both online and offline. Check out the hands version [here](pose_vis/gesture/hand/readme.md). 
+ +#### Logging Example + +The [logging example](logging_example.ipynb) notebook shows a simple way to use HDF5 logging with PoseVis. diff --git a/devices/webcam/setup.py b/devices/webcam/setup.py new file mode 100644 index 00000000..f5614188 --- /dev/null +++ b/devices/webcam/setup.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +# Copyright 2004-present Facebook. All Rights Reserved. + +import os +from setuptools import find_packages, setup + +# Use LabGraph 2.0.0 on Windows due to a package installation error +install_requires = ["labgraph==2.0.0"] if os.name == "nt" else ["labgraph>=2.0.1"] +# More information on MediaPipe installation can be found here: https://google.github.io/mediapipe/getting_started/install.html +install_requires.extend(["opencv-python>=4.6.0", "mediapipe>=0.8.11"]) + +setup( + name = "pose_vis", + version = "1.0.0", + description = "Pose visualization with LabGraph and MediaPipe", + packages = find_packages(), + python_requires = ">=3.8", + install_requires = install_requires) \ No newline at end of file diff --git a/devices/webcam/using_posevis.md b/devices/webcam/using_posevis.md new file mode 100644 index 00000000..770a987b --- /dev/null +++ b/devices/webcam/using_posevis.md @@ -0,0 +1,126 @@ +# Using PoseVis + +## Runners and Extensions + +PoseVis has utility classes called "runners" that handle graph setup and configuration. `SourceStreamRunner` runs both online and offline sources, including GStreamer sources if supported. `ReplayStreamRunner` will replay HDF5 log files, and `BenchmarkRunner` runs configured benchmarks to generate a JSON file with frame timings. + +Each runner requires a general PoseVis configuration class plus its own config. In this example, we'll configure `SourceStreamRunner` and connect to its topics with an example node. 
+ +```python +from pose_vis.runner import PoseVisConfig +from pose_vis.extensions.hands import HandsExtension +from pose_vis.runners.source_runner import SourceStreamRunnerConfig + +config = PoseVisConfig( + # Which extensions to enable. Extension types can be found in pose_vis/extensions + extensions=[HandsExtension()], + # Directory for saving logs (if enabled). This can be relative to the current working directory or a full path + log_directory="./logs", + # Name for the HDF5 log file. Specifying None creates a random name + log_name=None, + # Enables or disables logging to HDF5 + enable_logging=False, + # How quickly to update the stream display windows. Specifying 0 disables this node. The display node calls cv2.imshow() for each stream and updates window titles appropriately + display_framerate=0, + # How many frames to keep track of for displaying performance characteristics. This is built in to the display node, so it's disabled when display_framerate is 0 + stats_history_size=0) + +runner_config = SourceStreamRunnerConfig( + # Sources dictate which streams to provide. The list can contain ints, which will stream a device via MSMF on Windows, and V4L2 on Linux + # If provided a string that points to a file, PoseVis will consider it to be a video and attempt to load that and play it at the configured resolution and framerate + # If provided a directory, PoseVis will look for images and stream those one-by-one at the configured framerate. See SUPPORTED_IMG_EXTENSIONS in pose_vis/streams/utils/capture_worker.py for a list of supported image formats + # For both video and image directory sources, PoseVis will automatically terminate the graph when the source is finished playing + # If provided a GStreamer string, PoseVis will use the GStreamer backend via OpenCV + # Learn more about how PoseVis handles streams at readme.md#installation + sources=[0], + # Resolutions are represented via a list of tuples containing 3 ints, being width, height, and framerate. 
This must be specified per-stream, so the list should always be the same length as the sources list + # For integer, GStreamer, and video file sources, width and height must be specified. If the input source does not match the specified resolution, it will be resized. For image directories, width and height are ignored and the image size is used + # Each stream spawns a subprocess. PoseVis will attempt to keep streams synchronized in time by requesting new frames from each source at the same time and waiting until all sources present a frame before continuing. Because of this, it always runs streams at the lowest configured framerate + resolutions=[(1280, 720, 30)] +) +``` + +## LabGraph Topics + +PoseVis's graph publishes a single topic containing the result of every source stream and its extension results via the `CaptureResult` message: + +```python +# pose_vis/streams/messages.py + +@dataclass +class Capture(): + """ + Represents a single video frame + + Attributes: + `frame`: `np.ndarray` video frame matrix, shape is (H, W, 3), RGB color space, short datatype + `stream_id`: `int` source index for this capture + `frame_index`: `int` total frames since startup + `system_timestamp`: `float` `time.perf_counter()` value for when this frame was created + `proc_delta_time`: `float` time in seconds this frame took to produce + `proc_runtime`: `float` total runtime for this source + `proc_fps`: `int` frames per second at the time this frame was produced + `proc_target_fps`: `int` target frames per second for this source + """ + frame: np.ndarray + stream_id: int + frame_index: int + system_timestamp: float + proc_delta_time: float + proc_runtime: float + proc_fps: int + proc_target_fps: int + +class CaptureResult(lg.Message): + """ + Represents the current frame from every capture source + + Attributes: + `captures`: `List[Capture]` `Capture`s by source index + `extensions`: `List[Dict[str, Any]]` extension data by source index + """ + captures: List[Capture] + extensions:
List[Dict[str, Any]] +``` + +Extension data varies by extension, but all extensions publish their respective MediaPipe results. Multiple extensions can be enabled at the same time, and each result is listed under the extension class name in the dictionary, so in our case we'd have a `HandsExtension` entry that contains the hand screen and world keypoints, plus the handedness data structure. + +```python +message: CaptureResult ... +print(message.extensions[0]["HandsExtension"]["multi_hand_landmarks"]) +``` + +## Running the Graph and Subscribing to Topics + +Building on our earlier code, we can now subscribe to PoseVis's topic and run the graph. We'll just define a simple node to print the extension results and add it to the graph: + + +```python +import labgraph as lg +from pose_vis.pose_vis_graph import PoseVis +from pose_vis.streams.messages import CaptureResult +from pose_vis.runners.source_runner import SourceStreamRunner + +# Define a simple node for subscribing to the CaptureResult topic +class ExampleSubscriber(lg.Node): + INPUT = lg.Topic(CaptureResult) + + @lg.subscriber(INPUT) + def on_message(self, message: CaptureResult) -> None: + print(message.extensions) + +# Create the SourceStreamRunner object with our config +runner = SourceStreamRunner(config, runner_config) +# Build the PoseVis graph +runner.build() + +# Add our example node to the PoseVis graph +# This function adds a node with the name EXAMPLE_SUBSCRIBER and type ExampleSubscriber +# We then specify it should be connected to the STREAM's OUTPUT topic.
All runners create a STREAM node with CaptureResult being in the OUTPUT topic slot +# We can also specify a lg.Config object to use, in this case we don't have one +PoseVis.add_node(name="EXAMPLE_SUBSCRIBER", _type=ExampleSubscriber, connection=["STREAM", "OUTPUT", "EXAMPLE_SUBSCRIBER", "INPUT"], config=None) + +# Finally, we can run the graph +runner.run() +``` + diff --git a/devices/webcam/windows_gstreamer.md b/devices/webcam/windows_gstreamer.md new file mode 100644 index 00000000..92f9e3f5 --- /dev/null +++ b/devices/webcam/windows_gstreamer.md @@ -0,0 +1,88 @@ +# Windows GStreamer Guide + +In this guide we will install GStreamer and build OpenCV with GStreamer support. This guide uses Windows 11 21H2 and assumes an x64 architecture, but it should work on other Windows versions as well. + +## Install GStreamer + +Download and install the [latest development and runtime versions](https://gstreamer.freedesktop.org/download/) of GStreamer. + +## Install CMake + +Download and install the **3.24.3** release of [CMake](https://cmake.org/download/). (**Important:** OpenCV 4.6.0 is not compatible with CMake 3.25.0) + +## Install Visual Studio 2019 Build Tools + +Download and install [Visual Studio 2019 Build Tools](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2019). + +## Build OpenCV + +Create a virtual environment for use with PoseVis: + + cd %HOMEPATH% && python -m venv .venv && call .venv\Scripts\activate + +We'll be building and installing OpenCV and OpenCV Contrib 4.6.0.
Clone the branches: + + git clone -b 4.6.0 --single-branch https://github.com/opencv/opencv.git && git clone -b 4.6.0 --single-branch https://github.com/opencv/opencv_contrib.git + +Create a build directory for OpenCV and `cd` into it: + + md opencv\build && cd opencv\build + +Set some environment variables for Python support: this looks confusing but all it does is define `PYTHON_INCLUDE_PATH`, `PYTHON_PACKAGES_PATH`, and `PYTHON_LIB_PATH` in the current environment + + FOR /F "tokens=* USEBACKQ" %v IN (`python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())"`) do (SET PYTHON_INCLUDE_PATH=%v) && FOR /F "tokens=* USEBACKQ" %v IN (`python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"`) do (SET PYTHON_PACKAGES_PATH=%v) && FOR /F "tokens=* USEBACKQ" %v IN (`python -c "import os; import sys; import sysconfig; pyver = sysconfig.get_config_var('py_version_nodot'); print(os.path.join(sys.path[3].replace('lib', 'libs'), f'python{pyver}.lib'))"`) do (SET PYTHON_LIB_PATH=%v) + +Add GStreamer to the `PATH` variable: + + set PATH=%PATH%;%GSTREAMER_1_0_ROOT_MSVC_X86_64%bin && set PATH=%PATH%;%GSTREAMER_1_0_ROOT_MSVC_X86_64%lib && set PATH=%PATH%;%GSTREAMER_1_0_ROOT_MSVC_X86_64% + +Install Numpy: + + python -m pip install numpy + +Run CMake with the following config: + + cmake -G "Visual Studio 16 2019" -A x64 ../ -D CMAKE_CONFIGURATION_TYPES=Release -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules -D WITH_GSTREAMER=ON -D BUILD_EXAMPLES=OFF -D INSTALL_C_EXAMPLES=OFF -D INSTALL_PYTHON_EXAMPLES=OFF -D BUILD_opencv_python2=OFF -D PYTHON3_INCLUDE_DIR=%PYTHON_INCLUDE_PATH% -D PYTHON3_PACKAGES_PATH=%PYTHON_PACKAGES_PATH% -D PYTHON_LIBRARY=%PYTHON_LIB_PATH% -D BUILD_opencv_python3=ON -D WITH_VTK=OFF + +Check GStreamer and Python3 status: + + ... + + -- Video I/O: + -- GStreamer: YES (1.20.4) + + ... 
+ + -- Python 3: + -- Interpreter: C:/Users/das/.venv/Scripts/python.exe (ver 3.10.8) + -- Libraries: C:/Python310/libs/python310.lib (ver 3.10.8) + -- numpy: C:/Users/das/.venv/lib/site-packages/numpy/core/include (ver 1.23.5) + -- install path: C:/Users/das/.venv/Lib/site-packages/cv2/python-3.10 + + ... + +If the configuration is correct, build and install OpenCV: + + cmake --build ./ --target INSTALL --config Release + +## Install MediaPipe + +Install MediaPipe from PyPI: + + python -m pip install mediapipe + +## Install LabGraph and Test + +Install LabGraph from PyPI: + + python -m pip install labgraph==2.0.0 + +`cd` into your LabGraph installation, assuming you've installed it in your home directory: + + cd %HOMEPATH%\labgraph\devices\webcam + +Make sure PoseVis with GStreamer integration works: + + python -m pose_vis.pose_vis --sources "videotestsrc ! video/x-raw, width=1280, height=720, framerate=30/1, format=BGR ! appsink" + +If all is well, you're now finished. Check [Using PoseVis](readme.md#using-posevis) for more usage examples. Enjoy using PoseVis! \ No newline at end of file