Skip to content

Commit

Permalink
minor file management improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
Kvadratni committed Oct 30, 2024
1 parent 8ffdddc commit 0b030a7
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 61 deletions.
77 changes: 32 additions & 45 deletions src/goose/toolkit/io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import subprocess
import uuid
import shutil

from goose.toolkit.base import Toolkit, tool
from exchange import Message
Expand All @@ -14,6 +14,18 @@ def __init__(self, *args: object, **kwargs: dict[str, object]) -> None:
super().__init__(*args, **kwargs)
self.pyautogui = pyautogui
self.screen_width, self.screen_height = self.get_screen_info().values()
self.session_dir = os.path.expanduser(".goose/screenshots")
if not os.path.exists(self.session_dir):
os.makedirs(self.session_dir)

def __del__(self):
    """
    Remove the screenshot session directory when the toolkit is garbage collected.
    Cleanup is best-effort: a missing directory is ignored, and any other
    OSError raised while deleting is reported through the notifier rather
    than propagated.
    """
    # __del__ also runs for instances whose __init__ raised part-way through,
    # so neither attribute is guaranteed to exist.
    session_dir = getattr(self, "session_dir", None)
    if not session_dir:
        return
    notifier = getattr(self, "notifier", None)

    # Attempt the removal directly instead of checking for existence first;
    # this avoids a race between the check and the delete.
    try:
        shutil.rmtree(session_dir)
    except FileNotFoundError:
        # Nothing to clean up.
        return
    except OSError as e:
        message = f"Error removing session directory: {str(e)}"
    else:
        message = f"Removed browsing session directory: {session_dir}"

    if notifier is not None:
        notifier.log(message)

@tool
def get_screen_info(self):
Expand Down Expand Up @@ -118,41 +130,6 @@ def scroll(self, clicks: int, x: int = None, y: int = None) -> str:
self.pyautogui.scroll(clicks, x, y)
return f"Scrolled {clicks} clicks at ({x}, {y})"

@tool
def locate_on_screen(self, image: str) -> str:
    """
    Locate an image on the screen.
    Args:
        image (str): The file path to the image to locate.
    Return:
        (str) A message indicating whether the image was found and its position.
    """
    # Recent PyAutoGUI releases raise ImageNotFoundException on a miss, while
    # older ones return None. Resolve the exception class dynamically so both
    # behaviours yield the same "not found" message; the empty tuple makes the
    # except clause match nothing when the class is unavailable.
    not_found_error = getattr(self.pyautogui, "ImageNotFoundException", ())
    try:
        location = self.pyautogui.locateOnScreen(image)
    except not_found_error:
        location = None

    if location:
        return f"Image found at {location}"
    return "Image not found on screen"

@tool
def locate_all_on_screen(self, image: str) -> str:
    """
    Find every occurrence of an image on the screen.
    Args:
        image (str): The file path to the image to locate.
    Return:
        (str) A message listing the positions of all matches, or stating that none were found.
    """
    # Collect the results into a list so they can be tested for emptiness
    # and rendered in the returned message.
    matches = [*self.pyautogui.locateAllOnScreen(image)]
    if not matches:
        return "No instances of the image found on screen"
    return f"Image found at {matches}"

@tool
def scale_to_resolution(self, x: int, y: int, resolution: tuple[int, int]) -> tuple[int, int]:
"""
Expand All @@ -172,6 +149,19 @@ def scale_to_resolution(self, x: int, y: int, resolution: tuple[int, int]) -> tu
new_y = int(y * scale_y)
return new_x, new_y

@tool
def view_image(self, image_path: str) -> str:
    """
    Hand an image back so that it can be viewed.
    Args:
        image_path (str): The file path to the image to open.
    Return:
        (str) The image path prefixed with "image:".
    """
    # The method does no file access; it only builds the prefixed reference.
    image_reference = f"image:{image_path}"
    return image_reference

@tool
def take_and_resize_screenshot(self, max_size_mb: int = 5) -> str:
"""
Expand All @@ -181,12 +171,8 @@ def take_and_resize_screenshot(self, max_size_mb: int = 5) -> str:
Args:
max_size_mb (int): Maximum size of the screenshot in MB. Default is 5MB.
"""
# Determine the path to save the screenshot
goose_dir = os.path.expanduser("~/goose_screenshots")
if not os.path.exists(goose_dir):
os.makedirs(goose_dir)

filename = os.path.join(goose_dir, f"goose_screenshot_{uuid.uuid4().hex}.jpg")
filename = os.path.join(self.session_dir, f"goose_screenshot_{uuid.uuid4().hex}.jpg")

# Take a screenshot and convert to RGB
screenshot = self.pyautogui.screenshot()
Expand All @@ -213,21 +199,20 @@ def take_and_resize_screenshot(self, max_size_mb: int = 5) -> str:
return f"image:{filename}"

@tool
def take_screenshot_and_crop(self, area_of_interest, save_path, max_size_mb=5, image_path=None):
def take_screenshot_and_crop(self, area_of_interest, save_name, max_size_mb=5, image_path=None):
"""
Take a screenshot (or use an existing image), crop a specified area,
and return it along with the pixel coordinates of the cropped area
in the original screen size.
Args:
area_of_interest (tuple): A tuple (left, upper, right, lower) indicating the area to crop.
save_path (str): Path to save the cropped area image.
save_name (str): The name of the file to save the cropped image.
max_size_mb (int): The maximum acceptable size of the cropped image in megabytes.
image_path (str, optional): Path to an existing image file to be cropped.
Returns:
tuple: Returns the cropped image path and a tuple with pixel coordinates
of the cropped area (left, upper, right, lower) relative to the full screenshot or provided image.
(tuple): Returns the cropped image path and a tuple with pixel coordinates of the cropped area (left, upper, right, lower) relative to the full screenshot or provided image.
Raises:
Exception: If the cropped image exceeds the specified maximum size.
Expand All @@ -239,6 +224,8 @@ def take_screenshot_and_crop(self, area_of_interest, save_path, max_size_mb=5, i
# Take a new screenshot of the entire screen
full_screenshot = self.pyautogui.screenshot()

save_path = os.path.join(self.session_dir, save_name)

# Crop the specified area of interest and convert to RGB
cropped_img = full_screenshot.crop(area_of_interest)
cropped_img = cropped_img.convert('RGB')
Expand Down
89 changes: 73 additions & 16 deletions src/goose/toolkit/prompts/io.jinja
Original file line number Diff line number Diff line change
@@ -1,17 +1,74 @@
General Instructions

You can move the mouse, click, right-click, type text, send keypresses, and scroll.
Utilize these tools to perform actions on the screen and interact with the GUI of any application
When the user wants you to help debug, or work on a visual design by looking at their screen, IDE or browser, call the take_and_resize_screenshot or take_screenshot_and_crop and send the output to the user.
Make sure to take a screenshot before and after every action you take, even mouse movements.
Please use screenshot to check every step of the way.
Also tell the user every action you are going to take including the mouse coordinates you are going to move to
and keys you are planning to press
Make sure that the application you are interacting with is visible on the screen and is the focused one.
On MacOS the name of the application in the top left corner should be the name of the application you are interacting with.
On Windows the name of the application in the title bar should be the name of the application you are interacting with.
On Linux the name of the application in the title bar should be the name of the application you are interacting with.
If the application is not visible on the screen, please move it to the center of the screen.
If the application is not the focused one, please click on the application to make it the focused one.
If the application is not running, please start the application.
On macOS use Spotlight to search for the application and open it.
On Windows use the search bar to search for the application and open it.
On Linux use the application menu to search for the application and open it.
Utilize these tools to perform actions on the screen and interact with the GUI of any application.
When the user wants you to help debug, or work on a visual design by looking at their screen, IDE, or browser, call the take_and_resize_screenshot or take_screenshot_and_crop and send the output to the user.
Be sure to take a screenshot before and after every action, including mouse movements.

Tool Descriptions

get_screen_info
# Get Screen Info
Use the `get_screen_info` tool to obtain the current screen's dimensions.
Outputs the width and height as a dictionary.

move_mouse
# Move Mouse
The `move_mouse` tool moves the cursor to specified (x, y) coordinates.
Ensure the target location is visible and the application is focused.

click_mouse
# Click Mouse
Employ the `click_mouse` tool to perform a click action at the cursor's current position.
Verify the click target is interactable.

right_click_mouse
# Right Click Mouse
Utilize `right_click_mouse` for a right-click action at the cursor's position.
Confirm that the context menu or action associated with the right-click is the one you want.

type_text
# Type Text
The `type_text` tool types provided text using the keyboard.
Ensure the input field or application context is correct before execution.

press
# Press Key
Press a specified key with `press`.
Check the application’s focus and the expected behavior upon pressing the key.

press_while_holding
# Press While Holding
Use `press_while_holding` to press keys while holding another, like shortcuts.
Ensure all keys are correctly assigned and the application supports this input.

scroll
# Scroll
Scroll the view with `scroll`, specifying the direction and magnitude of the scroll.
Verify where within the application the scroll should occur.

view_image
# View Image
Open an image with `view_image` to inspect screenshots or images.

scale_to_resolution
# Scale to Resolution
Transform coordinates with `scale_to_resolution` to adapt to your display setup.
Ensure scaling is correctly computed—essential for responsive UI actions.

take_and_resize_screenshot
# Take and Resize Screenshot
Capture screen content with `take_and_resize_screenshot` and maintain size constraints.

take_screenshot_and_crop
# Take Screenshot and Crop
Capture a screen area specified by coordinates with `take_screenshot_and_crop`, or crop an existing image instead.
Check the output's size limits and ensure the area captures desired UI components.

Execution and Focus Assurance

Make sure the application you interact with is visible on the screen and is the focused one.
On macOS, the app name in the top left corner should match your target.
On Windows and Linux, the app name in the title bar should match your target.

If the application is not visible or focused, please take the necessary steps to adjust its position and focus. If the application is not running, initiate it using the appropriate methods (Spotlight on macOS, search bar on Windows, or application menu on Linux).

0 comments on commit 0b030a7

Please sign in to comment.