Add BLIP-2 for captioning (replaces the Qwen2-VL vLLM captioning deployment)

mobiusml · Nov 28, 2024 · 76f6f28 · 76f6f28
1 parent a337473
commit 76f6f28
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 105 deletions.
diff --git a/aana_chat_with_video/configs/deployments.py b/aana_chat_with_video/configs/deployments.py
@@ -1,6 +1,7 @@
 from aana.core.models.sampling import SamplingParams
 from aana.core.models.types import Dtype
 from aana.deployments.vad_deployment import VadConfig, VadDeployment
+from aana.deployments.hf_blip2_deployment import HFBlip2Config, HFBlip2Deployment
 from aana.deployments.vllm_deployment import VLLMConfig, VLLMDeployment
 from aana.deployments.whisper_deployment import (
     WhisperComputeType,
@@ -40,19 +41,15 @@
     },
     {
         "name": "captioning_deployment",
-        "instance": VLLMDeployment.options(
+        "instance": HFBlip2Deployment.options(
             num_replicas=1,
+            max_ongoing_requests=1000,
             ray_actor_options={"num_gpus": 0.25},
-            user_config=VLLMConfig(
-                model="Qwen/Qwen2-VL-2B-Instruct",
-                dtype=Dtype.AUTO,
-                gpu_memory_reserved=12000,
-                max_model_len=32768,
-                enforce_eager=True,
-                default_sampling_params=SamplingParams(
-                    temperature=0.0, top_p=1.0, top_k=-1, max_tokens=512
-                ),
-                engine_args={"trust_remote_code": True},
+            user_config=HFBlip2Config(
+                model="Salesforce/blip2-opt-2.7b",
+                dtype=Dtype.FLOAT16,
+                batch_size=2,
+                num_processing_threads=2,
             ).model_dump(mode="json"),
         ),
     },

diff --git a/aana_chat_with_video/configs/settings.py b/aana_chat_with_video/configs/settings.py
@@ -5,7 +5,7 @@ class Settings(AanaSettings):
     """A pydantic model for App settings."""
 
     asr_model_name: str = "whisper_medium"
-    captioning_model_name: str = "qwen2-vl-2b-instruct"
+    captioning_model_name: str = "hf_blip2_opt_2_7b"
     max_video_len: int = 60 * 20  # 20 minutes
 
 

diff --git a/aana_chat_with_video/endpoints/index_video.py b/aana_chat_with_video/endpoints/index_video.py
@@ -162,19 +162,13 @@ async def run(  # noqa: C901
 
                 timestamps.extend(frames_dict["timestamps"])
                 frame_ids.extend(frames_dict["frame_ids"])
-                chat_prompt = "Describe the content of the following image in a single sentence:"
-                dialogs = [
-                    ImageChatDialog.from_prompt(prompt=chat_prompt, images=[frame]) for frame in frames_dict["frames"]
-                ]
-
-                # Collect the tasks to run concurrently and wait for them to finish
-                tasks = [self.captioning_handle.chat(dialog) for dialog in dialogs]
-                captioning_output = await asyncio.gather(*tasks)
-                captioning_output = [caption["message"].content for caption in captioning_output]
-                captions.extend(captioning_output)
+                captioning_output = await self.captioning_handle.generate_batch(
+                    images=frames_dict["frames"]
+                )
+                captions.extend(captioning_output["captions"])
 
                 yield {
-                    "captions": captioning_output,
+                    "captions": captioning_output["captions"],
                     "timestamps": frames_dict["timestamps"],
                 }