Commit
[FEAT] Flow.run() img = None for conditional img inputs, BaseMultiModalModel, and multi-modal swarms of manufacturing agents
kyegomez committed Nov 25, 2023
1 parent f895497 commit a92a6a5
Showing 10 changed files with 189 additions and 28 deletions.
12 changes: 12 additions & 0 deletions example.py
@@ -1,9 +1,21 @@
import os

from dotenv import load_dotenv

# Import the OpenAIChat model and the Flow struct
from swarms.models import OpenAIChat
from swarms.structs import Flow

# Load the environment variables
load_dotenv()

# Get the API key from the environment
api_key = os.environ.get("OPENAI_API_KEY")

# Initialize the language model
llm = OpenAIChat(
temperature=0.5,
openai_api_key=api_key,
)


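Note: the rest of example.py is collapsed in this view. As a rough, illustrative sketch (not the collapsed content itself), the initialized llm is typically handed to a Flow and run with a plain text task:

flow = Flow(
    llm=llm,
    max_loops=1,  # illustrative; the real file may use a different value
)

out = flow.run("Summarize the benefits of multi-modal agents in one paragraph.")
print(out)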
4 changes: 4 additions & 0 deletions multi_modal_auto_agent.py
@@ -1,5 +1,8 @@
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI
from swarms.prompts.multi_modal_autonomous_instruction_prompt import (
MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
)


llm = GPT4VisionAPI()
@@ -10,6 +13,7 @@
## Initialize the workflow
flow = Flow(
llm=llm,
sop=MULTI_MODAL_AUTO_AGENT_SYSTEM_PROMPT_1,
max_loops="auto",
)

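According to the commit title, Flow.run() now accepts img = None, so the same flow can be called with or without an image. A minimal sketch of that usage, with a hypothetical task and image path:

task = "What is the color of the object in this image?"
img = "images/swarms.jpeg"  # hypothetical path

flow.run(task, img)  # multi-modal call: the image is passed through to GPT4VisionAPI
flow.run(task)       # text-only call: img falls back to None and is skipped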
7 changes: 7 additions & 0 deletions playground/demos/idea_2_img/main.py
@@ -0,0 +1,7 @@
"""
Idea 2 img: idea-to-image demo
task -> GPT-4 text -> DALL-E 3 image -> GPT-4 Vision analyzes image + text -> DALL-E 3 image -> loop
"""
from swarms.models.gpt4_vision_api import GPT4VisionAPI
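Only the import is shown here; the loop described in the docstring would look roughly like the sketch below. generate_image() is a hypothetical stand-in for a DALL-E 3 call, the initial GPT-4 prompt-refinement step is omitted for brevity, and the iteration count is arbitrary:

def generate_image(prompt: str) -> str:
    """Hypothetical DALL-E 3 wrapper; returns a path to the generated image."""
    return "generated.png"


llm = GPT4VisionAPI()
idea = "a self-assembling factory floor"

for _ in range(3):  # bounded loop instead of an open-ended one
    img_path = generate_image(idea)                                   # text -> image
    critique = llm.run(f"Critique this render of: {idea}", img_path)  # image + text -> analysis
    idea = f"{idea}. Address this critique: {critique}"               # feed the analysis back in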
15 changes: 15 additions & 0 deletions playground/demos/swarm_of_mma_manufacturing/main.py
@@ -0,0 +1,15 @@
"""
Swarm of multi modal autonomous agents for manufacturing!
---------------------------------------------------------
Health Security agent: Agent that monitors the health of working conditions: input image of factory output: health safety index 0.0 - 1.0 being the highest
Quality Control agent: Agent that monitors the quality of the product: input image of product output: quality index 0.0 - 1.0 being the highest
Productivity agent: Agent that monitors the productivity of the factory: input image of factory output: productivity index 0.0 - 1.0 being the highest
Safety agent: Agent that monitors the safety of the factory: input image of factory output: safety index 0.0 - 1.0 being the highest
Security agent: Agent that monitors the security of the factory: input image of factory output: security index 0.0 - 1.0 being the highest
Sustainability agent: Agent that monitors the sustainability of the factory: input image of factory output: sustainability index 0.0 - 1.0 being the highest
Efficiency agent: Agent that monitors the efficiency of the factory: input image of factory output: efficiency index 0.0 - 1.0 being the highest
Flow:
health security agent -> quality control agent -> productivity agent -> safety agent -> security agent -> sustainability agent -> efficiency agent
"""
46 changes: 27 additions & 19 deletions swarms/models/base_multimodal_model.py
@@ -1,3 +1,4 @@
from abc import abstractmethod
import asyncio
import base64
import concurrent.futures
@@ -7,8 +8,8 @@
from typing import List, Optional, Tuple

import requests
from ABC import abstractmethod
from PIL import Image
from termcolor import colored


class BaseMultiModalModel:
@@ -37,7 +38,6 @@ def __init__(
self.retries = retries
self.chat_history = []


@abstractmethod
def __call__(self, text: str, img: str):
"""Run the model"""
@@ -61,17 +61,17 @@ def get_img_from_web(self, img: str):
except requests.RequestException as error:
print(f"Error fetching image from {img} and error: {error}")
return None

def encode_img(self, img: str):
"""Encode the image to base64"""
with open(img, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")

def get_img(self, img: str):
"""Get the image from the path"""
image_pil = Image.open(img)
return image_pil

def clear_chat_history(self):
"""Clear the chat history"""
self.chat_history = []
@@ -87,11 +87,11 @@ def run_many(
Args:
tasks (List[str]): List of tasks
imgs (List[str]): List of image paths
Returns:
List[str]: List of responses
"""
# Instantiate the thread pool executor
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
@@ -101,7 +101,6 @@
for result in results:
print(result)


def run_batch(self, tasks_images: List[Tuple[str, str]]) -> List[str]:
"""Process a batch of tasks and images"""
with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -133,11 +132,11 @@ async def run_batch_async_with_retries(
for task, img in tasks_images
]
return await asyncio.gather(*futures)

def unique_chat_history(self):
"""Get the unique chat history"""
return list(set(self.chat_history))

def run_with_retries(self, task: str, img: str):
"""Run the model with retries"""
for i in range(self.retries):
@@ -146,7 +145,7 @@ def run_with_retries(self, task: str, img: str):
except Exception as error:
print(f"Error with the request {error}")
continue

def run_batch_with_retries(self, tasks_images: List[Tuple[str, str]]):
"""Run the model with retries"""
for i in range(self.retries):
@@ -188,28 +187,37 @@ def get_generation_time(self) -> float:
if self.start_time and self.end_time:
return self.end_time - self.start_time
return 0

def get_chat_history(self):
"""Get the chat history"""
return self.chat_history

def get_unique_chat_history(self):
"""Get the unique chat history"""
return list(set(self.chat_history))

def get_chat_history_length(self):
"""Get the chat history length"""
return len(self.chat_history)

def get_unique_chat_history_length(self):
"""Get the unique chat history length"""
return len(list(set(self.chat_history)))

def get_chat_history_tokens(self):
"""Get the chat history tokens"""
return self._num_tokens()

def print_beautiful(self, content: str, color: str = "cyan"):
"""Print Beautifully with termcolor"""
content = colored(content, color)
print(content)

def stream(self, content: str):
"""Stream the output
Args:
content (str): The content to stream, printed chunk by chunk
"""
for chunk in content:
print(chunk)
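BaseMultiModalModel leaves __call__ abstract and supplies the shared helpers (image encoding, chat history, batch runners). A toy subclass, assuming the base constructor's arguments all have defaults, might look like the sketch below; the echo behavior is purely illustrative:

from swarms.models.base_multimodal_model import BaseMultiModalModel


class EchoMultiModal(BaseMultiModalModel):
    """Toy model that echoes the task plus the size of the encoded image."""

    def __call__(self, text: str, img: str) -> str:
        encoded = self.encode_img(img)      # base64 helper from the base class
        response = f"{text} -> image with {len(encoded)} base64 characters"
        self.chat_history.append(response)  # reuse the shared chat history
        return response


model = EchoMultiModal()
# model.run_many(["Describe the scene"], ["photo.jpg"])  # batch helper from the base class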
108 changes: 106 additions & 2 deletions swarms/models/gpt4_vision_api.py
@@ -1,6 +1,7 @@
import logging
import asyncio
import base64
from typing import Optional
import concurrent.futures
from termcolor import colored
import json
@@ -12,6 +13,13 @@
import requests
from dotenv import load_dotenv


try:
import cv2
except ImportError:
print("OpenCV not installed. Please install OpenCV to use this model.")
raise ImportError

# Load environment variables
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
@@ -59,7 +67,8 @@ def __init__(
max_workers: int = 10,
max_tokens: int = 300,
openai_proxy: str = "https://api.openai.com/v1/chat/completions",
beautify: bool = False
beautify: bool = False,
streaming_enabled: Optional[bool] = False,
):
super().__init__()
self.openai_api_key = openai_api_key
@@ -69,6 +78,7 @@ def __init__(
self.max_tokens = max_tokens
self.openai_proxy = openai_proxy
self.beautify = beautify
self.streaming_enabled = streaming_enabled

if self.logging_enabled:
logging.basicConfig(level=logging.DEBUG)
@@ -123,14 +133,101 @@ def run(self, task: str, img: str):
out = response.json()
content = out["choices"][0]["message"]["content"]

if self.streaming_enabled:
# stream_response prints chunks as they arrive and returns nothing,
# so do not overwrite content with its return value
self.stream_response(content)

if self.beautify:
content = colored(content, "cyan")
print(content)
else:
print(content)

except Exception as error:
print(f"Error with the request: {error}")
raise error

def video_prompt(self, frames):
"""
SystemPrompt is a class that generates a prompt for the user to respond to.
The prompt is generated based on the current state of the system.
Parameters
----------
frames : list
A list of base64 frames
Returns
-------
PROMPT : str
The system prompt
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
>>> prompt = llm.video_prompt(base64_frames)
>>> print(prompt)
"""
PROMPT = f"""
These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
{frames}
"""
return PROMPT

def stream_response(self, content: str):
"""Stream the response of the output
Args:
content (str): The content to stream, printed chunk by chunk
"""
for chunk in content:
print(chunk)

def process_video(self, video: str):
"""
Process a video into a list of base64 frames
Parameters
----------
video : str
The path to the video file
Returns
-------
base64_frames : list
A list of base64 frames
Examples
--------
>>> from swarms.models import GPT4VisionAPI
>>> llm = GPT4VisionAPI()
>>> video = "video.mp4"
>>> base64_frames = llm.process_video(video)
"""
video = cv2.VideoCapture(video)

base64_frames = []
while video.isOpened():
success, frame = video.read()
if not success:
break
_, buffer = cv2.imencode(".jpg", frame)
base64_frames.append(base64.b64encode(buffer).decode("utf-8"))

video.release()
print(len(base64_frames), "frames read.")

# Sanity-check that every frame decodes cleanly, then return the frames
for img in base64_frames:
base64.b64decode(img.encode("utf-8"))
return base64_frames

def __call__(self, task: str, img: str):
"""Run the model."""
try:
@@ -168,10 +265,17 @@ def __call__(self, task: str, img: str):
out = response.json()
content = out["choices"][0]["message"]["content"]

if self.streaming_enabled:
# stream_response prints chunks as they arrive and returns nothing,
# so do not overwrite content with its return value
self.stream_response(content)

if self.beautify:
content = colored(content, "cyan")
print(content)
else:
print(content)

except Exception as error:
print(f"Error with the request: {error}")
raise error
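A short usage sketch tying the new video helpers together (the file path is a placeholder, frame subsampling is just to keep the prompt small, and it assumes the constructor defaults to the OPENAI_API_KEY loaded at module import):

from swarms.models.gpt4_vision_api import GPT4VisionAPI

llm = GPT4VisionAPI()  # assumes OPENAI_API_KEY is available in the environment

frames = llm.process_video("factory_tour.mp4")  # list of base64-encoded JPEG frames
prompt = llm.video_prompt(frames[::30])         # subsample frames before building the prompt
print(prompt[:500])                             # preview the generated upload description prompt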
2 changes: 1 addition & 1 deletion swarms/models/kosmos_two.py
@@ -24,7 +24,7 @@ class Kosmos:
----------
model_name : str
Path to the pretrained model
Examples
--------
>>> kosmos = Kosmos()
4 changes: 3 additions & 1 deletion swarms/models/whisperx_model.py
@@ -99,7 +99,9 @@ def transcribe_youtube_video(self):
print("The key 'segments' is not found in the result.")

def transcribe(self, audio_file):
model = whisperx_model.load_model("large-v2", self.device, self.compute_type)
model = whisperx_model.load_model(
"large-v2", self.device, self.compute_type
)
audio = whisperx_model.load_audio(audio_file)
result = model.transcribe(audio, batch_size=self.batch_size)
