From 0b030a7b0ec56bd00d1176dc1cc26fb6d34ffdbf Mon Sep 17 00:00:00 2001 From: Max Novich Date: Tue, 29 Oct 2024 20:36:25 -0700 Subject: [PATCH] minor file management improvements --- src/goose/toolkit/io.py | 77 +++++++++++--------------- src/goose/toolkit/prompts/io.jinja | 89 ++++++++++++++++++++++++------ 2 files changed, 105 insertions(+), 61 deletions(-) diff --git a/src/goose/toolkit/io.py b/src/goose/toolkit/io.py index 4be39b0b..2c6a5b04 100644 --- a/src/goose/toolkit/io.py +++ b/src/goose/toolkit/io.py @@ -1,6 +1,6 @@ import os -import subprocess import uuid +import shutil from goose.toolkit.base import Toolkit, tool from exchange import Message @@ -14,6 +14,18 @@ def __init__(self, *args: object, **kwargs: dict[str, object]) -> None: super().__init__(*args, **kwargs) self.pyautogui = pyautogui self.screen_width, self.screen_height = self.get_screen_info().values() + self.session_dir = os.path.expanduser(".goose/screenshots") + if not os.path.exists(self.session_dir): + os.makedirs(self.session_dir) + + def __del__(self): + # Remove the entire screenshot directory + if os.path.exists(self.session_dir): + try: + shutil.rmtree(self.session_dir) + self.notifier.log(f"Removed browsing session directory: {self.session_dir}") + except OSError as e: + self.notifier.log(f"Error removing session directory: {str(e)}") @tool def get_screen_info(self): @@ -118,41 +130,6 @@ def scroll(self, clicks: int, x: int = None, y: int = None) -> str: self.pyautogui.scroll(clicks, x, y) return f"Scrolled {clicks} clicks at ({x}, {y})" - @tool - def locate_on_screen(self, image: str) -> str: - """ - Locate an image on the screen. - - Args: - image (str): The file path to the image to locate. - - Return: - (str) A message indicating whether the image was found and its position. 
- """ - location = self.pyautogui.locateOnScreen(image) - if location: - return f"Image found at {location}" - else: - return "Image not found on screen" - - @tool - def locate_all_on_screen(self, image: str) -> str: - """ - Locate all instances of an image on the screen. - - Args: - image (str): The file path to the image to locate. - - Return: - (str) A message indicating the positions of all instances found. - """ - locations = self.pyautogui.locateAllOnScreen(image) - locations_list = list(locations) - if locations_list: - return f"Image found at {locations_list}" - else: - return "No instances of the image found on screen" - - @tool def scale_to_resolution(self, x: int, y: int, resolution: tuple[int, int]) -> tuple[int, int]: """ @@ -172,6 +149,19 @@ def scale_to_resolution(self, x: int, y: int, resolution: tuple[int, int]) -> tu new_y = int(y * scale_y) return new_x, new_y + @tool + def view_image(self, image_path: str) -> str: + """ + View any image file. + + Args: + image_path (str): The file path to the image to view. + + Return: + (str) The string "image:" followed by the given image path. + """ + return f"image:{image_path}" + @tool def take_and_resize_screenshot(self, max_size_mb: int = 5) -> str: """ @@ -181,12 +171,8 @@ def take_and_resize_screenshot(self, max_size_mb: int = 5) -> str: Args: max_size_mb (int): Maximum size of the screenshot in MB. Default is 5MB. 
""" - # Determine the path to save the screenshot - goose_dir = os.path.expanduser("~/goose_screenshots") - if not os.path.exists(goose_dir): - os.makedirs(goose_dir) - filename = os.path.join(goose_dir, f"goose_screenshot_{uuid.uuid4().hex}.jpg") + filename = os.path.join(self.session_dir, f"goose_screenshot_{uuid.uuid4().hex}.jpg") # Take a screenshot and convert to RGB screenshot = self.pyautogui.screenshot() @@ -213,7 +199,7 @@ def take_and_resize_screenshot(self, max_size_mb: int = 5) -> str: return f"image:{filename}" @tool - def take_screenshot_and_crop(self, area_of_interest, save_path, max_size_mb=5, image_path=None): + def take_screenshot_and_crop(self, area_of_interest, save_name, max_size_mb=5, image_path=None): """ Take a screenshot (or use an existing image), crop a specified area, and return it along with the pixel coordinates of the cropped area @@ -221,13 +207,12 @@ def take_screenshot_and_crop(self, area_of_interest, save_path, max_size_mb=5, i Args: area_of_interest (tuple): A tuple (left, upper, right, lower) indicating the area to crop. - save_path (str): Path to save the cropped area image. + save_name (str): The name of the file to save the cropped image. max_size_mb (int): The maximum acceptable size of the cropped image in megabytes. image_path (str, optional): Path to an existing image file to be cropped. Returns: - tuple: Returns the cropped image path and a tuple with pixel coordinates - of the cropped area (left, upper, right, lower) relative to the full screenshot or provided image. + (tuple): Returns the cropped image path and a tuple with pixel coordinates of the cropped area (left, upper, right, lower) relative to the full screenshot or provided image. Raises: Exception: If the cropped image exceeds the specified maximum size. 
@@ -239,6 +224,8 @@ def take_screenshot_and_crop(self, area_of_interest, save_path, max_size_mb=5, i # Take a new screenshot of the entire screen full_screenshot = self.pyautogui.screenshot() + save_path = os.path.join(self.session_dir, save_name) + # Crop the specified area of interest and convert to RGB cropped_img = full_screenshot.crop(area_of_interest) cropped_img = cropped_img.convert('RGB') diff --git a/src/goose/toolkit/prompts/io.jinja b/src/goose/toolkit/prompts/io.jinja index 87426ac0..709ecb40 100644 --- a/src/goose/toolkit/prompts/io.jinja +++ b/src/goose/toolkit/prompts/io.jinja @@ -1,17 +1,74 @@ +General Instructions + You can move the mouse, click, right-click, type text, send keypresses, scroll, -Utilize these tools to perform actions on the screen and interact with the GUI of any application -When the user wants you to help debug, or work on a visual design by looking at their screen, IDE or browser, call the take_and_resize_screenshot or take_screenshot_and_crop and send the output from the user. -Make sure to take a screenshot before and after every action you take, even mouse movements. -Please use screenshot to check every step of the way. -Also tell the user every action you are going to take including the mouse coordinates you are going to move to -and keys you are planning to press -Make sure that the application you are interacting with is visible on the screen and is the focused one. -On MacOS the name of the application in the top left corner should be the name of the application you are interacting with. -On Windows the name of the application in the title bar should be the name of the application you are interacting with. -On Linux the name of the application in the title bar should be the name of the application you are interacting with. -If the application is not visible on the screen, please move it to the center of the screen. -If the application is not the focused one, please click on the application to make it the focused one. 
-If the application is not running, please start the application. -On macOs use Spotlight to search for the application and open it. -On Windows use the search bar to search for the application and open it. -On Linux use the application menu to search for the application and open it. +Utilize these tools to perform actions on the screen and interact with the GUI of any application. +When the user wants you to help debug, or work on a visual design by looking at their screen, IDE, or browser, call the take_and_resize_screenshot or take_screenshot_and_crop and send the output to the user. +Be sure to take a screenshot before and after every action, including mouse movements. + +Tool Descriptions + +get_screen_info +# Get Screen Info +Use the `get_screen_info` tool to obtain the current screen's dimensions. +Outputs the width and height as a dictionary. + +move_mouse +# Move Mouse +The `move_mouse` tool moves the cursor to the specified (x, y) coordinates. +Ensure the target location is visible and the application is focused. + +click_mouse +# Click Mouse +Employ the `click_mouse` tool to perform a click action at the cursor's current position. +Verify the click target is interactable. + +right_click_mouse +# Right Click Mouse +Utilize `right_click_mouse` for a right-click action at the cursor's position. +Confirm the context menu or action associated with the right-click is desired. + +type_text +# Type Text +The `type_text` tool types the provided text using the keyboard. +Ensure the input field or application context is correct before execution. + +press +# Press Key +Press a specified key with `press`. +Check the application’s focus and the expected behavior upon pressing the key. + +press_while_holding +# Press While Holding +Use `press_while_holding` to press keys while holding another, like shortcuts. +Ensure all keys are correctly assigned and the application supports this input. 
+ +scroll +# Scroll +Scroll the view with `scroll`, specifying the direction and magnitude of the scroll. +Verify where within the application the scroll should occur. + +view_image +# View Image +Open an image with `view_image` to inspect screenshots or images. + +scale_to_resolution +# Scale to Resolution +Transform coordinates with `scale_to_resolution` to adapt to your display setup. +Ensure scaling is correctly computed—essential for responsive UI actions. + +take_and_resize_screenshot +# Take and Resize Screenshot +Capture screen content with `take_and_resize_screenshot` and maintain size constraints. + +take_screenshot_and_crop +# Take Screenshot and Crop +Capture a screen area specified by coordinates, or crop an existing image instead. +Check the output's size limits and ensure the area captures the desired UI components. + +Execution and Focus Assurance + +Make sure the application you interact with is visible on the screen and is the focused one. +On macOS, the app name in the top left corner should match your target. +On Windows and Linux, the app name in the title bar should match your target. + +If the application is not visible or focused, please take the necessary steps to adjust its position and focus. If the application is not running, start it using the appropriate method (Spotlight on macOS, search bar on Windows, or application menu on Linux).