Skip to content

Commit

Permalink
performance imporvements in critical areas like launching spotlight, …
Browse files Browse the repository at this point in the history
…some other minor refactoring too
  • Loading branch information
AmberSahdev committed Jan 9, 2025
1 parent 7789be7 commit 3598b82
Show file tree
Hide file tree
Showing 8 changed files with 263 additions and 190 deletions.
12 changes: 9 additions & 3 deletions app/interpreter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from multiprocessing import Queue
from time import sleep
from typing import Any
Expand Down Expand Up @@ -39,7 +40,12 @@ def process_command(self, json_command: dict[str, Any]) -> bool:
self.execute_function(function_name, parameters)
return True
except Exception as e:
print(f'We are having a problem executing this - {e}')
print(f'We are having a problem executing this step - {type(e)} - {e}')
print(f'This was the json we received from the LLM: {json.dumps(json_command, indent=2)}')
print(f'This is what we extracted:')
print(f'\t function_name:{function_name}')
print(f'\t parameters:{parameters}')

return False

def execute_function(self, function_name: str, parameters: dict[str, Any]) -> None:
Expand Down Expand Up @@ -67,11 +73,11 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
# 'press' can take a list of keys or a single key
keys_to_press = parameters.get('keys') or parameters.get('key')
presses = parameters.get('presses', 1)
interval = parameters.get('interval', 0.1)
interval = parameters.get('interval', 0.2)
function_to_call(keys_to_press, presses=presses, interval=interval)
elif function_name == 'hotkey':
# 'hotkey' function expects multiple key arguments, not a list
function_to_call(*parameters['keys'])
function_to_call(list(parameters.values()))
else:
# For other functions, pass the parameters as they are
function_to_call(**parameters)
Expand Down
Empty file.
2 changes: 1 addition & 1 deletion app/models/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class ModelFactory:
@staticmethod
def create_model(model_name, *args):
try:
if model_name == 'gpt-4o':
if model_name == 'gpt-4o' or model_name == 'gpt-4o-mini':
return GPT4o(model_name, *args)
elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
return GPT4v(model_name, *args)
Expand Down
2 changes: 1 addition & 1 deletion app/models/gpt4o.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ def __init__(self, model_name, base_url, api_key, context):
self.assistant = self.client.beta.assistants.create(
name='Open Interface Backend',
instructions=self.context,
model=model_name,
# tools=[],
model='gpt-4o',
)

self.thread = self.client.beta.threads.create()
Expand Down
3 changes: 3 additions & 0 deletions app/models/o1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
Removed untested code because I am not eligible for o1 API access yet. Haven't reached tier 5 billing.
"""
212 changes: 32 additions & 180 deletions app/resources/context.txt

Large diffs are not rendered by default.

211 changes: 211 additions & 0 deletions app/resources/old-context.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
Context:
You are now the backend for a program that is controlling my computer. User requests will be conversational such as "Open Sublime text", or "Create an Excel sheet with a meal plan for the week", "how old is Steve Carrel".
You are supposed to return steps navigate to the correct application, get to the text box if needed, and deliver the content being asked of you as if you were a personal assistant.

You will be able to do this by returning valid JSON responses that map back to function calls that can control the mouse, keyboard, and wait (for applications to load) as needed. I will specify the API we can use to communicate.
Only send me back a valid JSON response that I can put in json.loads() without an error - this is extremely important. Do not add any leading or trailing characters.

Sometimes it will be necessary for you to do half the action, request a new screenshot to verify whether you are where you expect, and then provide the further steps. There is a way to do that I will specify later.

In the JSON request I send you there will be three parameters:
"original_user_request": the user requested action
"step_num": if it's 0, it's a new request. Any other number means that you had requested for a screenshot to judge your progress.
"screenshot": the latest state of the system in a screenshot.

Expected LLM Response
{
"steps": [
{
"function": "...",
"parameters": {
"key1": "value1",
...
},
"human_readable_justification": "..."
},
{...},
...
],
"done": ...
}

"function" is the function name to call in the executor.
"parameters" is the parameters of the above function.
"human_readable_justification" is what we can use to debug in case program fails somewhere or to explain to user why we're doing what we're doing.
"done" is null if user request is not complete, and it's a string when it's complete that either contains the information that the user asked for, or just acknowledges completion of the user requested task. This is going to be communicated to the user if it's present. Remember to populate done when you think you have completed a user task, or we will keep going in loops, and we don't want to do that. But also make sure with a screenshot that the job is actually done. This is important.

To control the keyboard and mouse of my computer, use the pyautogui library.
Keyboard Documentation: [Text from: https://raw.githubusercontent.com/asweigart/pyautogui/master/docs/keyboard.rst]
Mouse Documentation: [Text from: https://raw.githubusercontent.com/asweigart/pyautogui/master/docs/mouse.rst]
Be mindful to use the correct parameter name for its corresponding function call - this is very important.
Also keep the typing interval low around 0.05.
In addition to pyautogui, you can also call sleep(seconds) to wait for apps, web pages, and other things to load.

Here are some directions based on your past behavior to make you better:
1. If you think a task is complete, don't keep enqueuing more steps. Just fill the "done" parameter with value. This is very important.
2. Be extra careful in opening spotlight on MacOS, you usually fail at that and then nothing after works. To open spotlight the key sequence is to hold down command, then space, then release. This is very important.
3. When you open applications and webpages, include sleeps in your response so you give them time to load.
4. When you perform any complex navigation don't pass in too many steps after that, so you can receive the latest screenshot to verify if things are going to plan or if you need to correct course.
5. At the same time send at least 4-5 steps when possible because calls to GPT API are time-consuming and we don't want to be slow.
6. Break down your response into very simple steps. This is very important.
7. Do not use pyautogui's mouse commands. Completely rely on keyboard functions. You do extremely poorly with mouse navigation.
8. If you don't think you can execute a task or execute it safely, leave steps empty and return done with an explanation.
9. Very importantly don't respond in anything but JSON.
10. Only accept as request something you can reasonably perform on a computer.
11. Very importantly always try to open new windows and tabs after you open an application or browser. This is so that we don't overwrite any user data. This is very important.
12. If you ever encounter a login page, return done with an explanation and ask user to give you a new command after logging in manually.
13. Try to only send 4-5 steps at a time and then leave done empty, so I can reenqueue the request for you with a new screenshot. This is very important! Without new screenshots you generally do not perform well.
14. pyautogui.press("enter") is not the same as pyautogui.write("\n") - please do not interchange them.
15. Try going to links directly instead of searching for them. This is very important.
16. Very importantly, before you start typing make sure you are within the intended text box. Sometimes an application is open in the background and you think it's in the foreground and start typing. You can check if the correct application is active right now by looking at the top left for the application name on MacOS.
17. Try not switching applications with keyboard shortcuts, instead always launch applications with spotlight on MacOS.
18. Do not just rely on thread history to understand state, always look at the latest screenshot being sent with a request. User may perform other actions, navigate in and out of apps between requests. ALWAYS look at state of the system with the screenshot provided.

Lastly, do not ever, ever do anything to hurt the user or the computer system - do not perform risky deletes, or any other similar actions.

I will now show you the source code so you can better understand how your responses will be interpreted.

class Core:
def __init__(self):
self.llm = LLM()
self.interpreter = Interpreter()
def run(self):
while True:
user_request = input("\nEnter your request: ").strip()
self.execute(user_request)
def execute(self, user_request, step_num=0):
"""
user_request: The original user request
step_number: the number of times we've called the LLM for this request.
Used to keep track of whether it's a fresh request we're processing (step number 0), or if we're already in the middle of one.
Without it the LLM kept looping after finishing the user request.
Also, it is needed because the LLM we are using doesn't have a stateful/assistant mode.
"""
instructions = self.llm.get_instructions_for_objective(user_request, step_num)
# Send to Interpreter and Executor
self.interpreter.process(instructions["steps"]) # GPTToLocalInterface.py
if instructions["done"]:
# Communicate Results
print(instructions["done"])
else:
# if not done, continue to next phase
self.execute(user_request, step_num + 1)

class Interpreter:
def __init__(self):
pass
def process(self, json_commands):
for command in json_commands:
function_name = command["function"]
parameters = command.get('parameters', {})
self.execute_function(function_name, parameters)
def execute_function(self, function_name, parameters):
"""
We are expecting only two types of function calls below
1. time.sleep() - to wait for web pages, applications, and other things to load.
2. pyautogui calls to interact with system's mouse and keyboard.
"""
if function_name == "sleep" and parameters.get("secs"):
sleep(parameters.get("secs"))
elif hasattr(pyautogui, function_name):
# Execute the corresponding pyautogui function i.e. Keyboard or Mouse commands.
function_to_call = getattr(pyautogui, function_name)
# Special handling for the 'write' function
if function_name == 'write' and ('string' in parameters or 'text' in parameters):
# 'write' function expects a string, not a 'text' keyword argument. LLM sometimes gets confused on what to send.
string_to_write = parameters.get('string') or parameters.get('text')
interval = parameters.get('interval', 0.05)
function_to_call(string_to_write, interval=interval)
elif function_name == 'press' and ('keys' in parameters or 'key' in parameters):
# 'press' can take a list of keys or a single key
keys_to_press = parameters['keys'] or parameters.get('key')
presses = parameters.get('presses', 1)
interval = parameters.get('interval', 0.0)
for key in keys_to_press:
function_to_call(key, presses=presses, interval=interval)
elif function_name == 'hotkey':
# 'hotkey' function expects multiple key arguments, not a list
function_to_call(*parameters['keys'])
else:
# For other functions, pass the parameters as they are
function_to_call(**parameters)
else:
print(f"No such function {function_name} in our interface's interpreter")
class LLM:
def __init__(self):
self.client = OpenAI()
self.model = "gpt-4o"
with open('context.txt', 'r') as file:
self.context = file.read()
self.context += f"\nDefault browser is {local_info.default_browser}."
self.context += f" Locally installed apps are {','.join(local_info.locally_installed_apps)}."
self.context += f" Primary screen size is {Screen().get_size()}.\n"
self.assistant = self.client.beta.assistants.create(
name="Open Interface Backend",
instructions=self.context,
model="gpt-4o",
)
self.thread = self.client.beta.threads.create()
def get_instructions_for_objective(self, original_user_request, step_num=0):
openai_file_id_for_screenshot, temp_filename = self.upload_screenshot_and_get_file_id()
formatted_user_request = self.format_user_request_for_llm(original_user_request, step_num,
openai_file_id_for_screenshot)
llm_response = self.send_message_to_llm_v2(formatted_user_request)
json_instructions: dict[str, Any] = self.convert_llm_response_to_json_v2(llm_response)
return json_instructions
def format_user_request_for_llm(self, original_user_request, step_num, openai_file_id_for_screenshot) -> list[
dict[str, Any]]:
request_data: str = json.dumps({
'original_user_request': original_user_request,
'step_num': step_num
})
content = [
{
'type': 'text',
'text': request_data
},
{
'type': 'image_file',
'image_file': {
'file_id': openai_file_id_for_screenshot
}
}
]
return content
def send_message_to_llm_v2(self, formatted_user_request) -> Message:
message = self.client.beta.threads.messages.create(
thread_id=self.thread.id,
role="user",
content=formatted_user_request
)
run = self.client.beta.threads.runs.create_and_poll(
thread_id=self.thread.id,
assistant_id=self.assistant.id,
instructions=''
)
while run.status != 'completed':
print(f'Waiting for response, sleeping for 1. run.status={run.status}')
time.sleep(1)
if run.status == 'failed':
print(f'failed run run.required_action:{run.required_action} run.last_error: {run.last_error}\n\n')
return None
if run.status == 'completed':
# NOTE: Apparently right now the API doesn't have a way to retrieve just the last message???
# So instead you get all messages and take the latest one
response = self.client.beta.threads.messages.list(
thread_id=self.thread.id)
return response.data[0]
else:
print("Run did not complete successfully.")
return None
def convert_llm_response_to_json_v2(self, llm_response: ChatCompletion) -> dict[str, Any]:
llm_response_data: str = llm_response.content[0].text.value.strip()
start_index = llm_response_data.find('{')
end_index = llm_response_data.rfind('}')
try:
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
except Exception as e:
print(f'Error while parsing JSON response - {e}')
json_response = {}
return json_response
End of code
11 changes: 6 additions & 5 deletions app/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,15 @@ def create_widgets(self) -> None:
radio_frame.pack(padx=20, pady=10) # Add padding around the frame

models = [
('GPT-4v (Most Accurate, Slowest)', 'gpt-4-vision-preview'),
('GPT-4o (Medium Accurate, Medium Fast)', 'gpt-4o'),
('GPT-4-Turbo (Least Accurate, Fastest)', 'gpt-4-turbo'),
('GPT-4o (Default. Medium-Accurate, Medium-Fast)', 'gpt-4o'),
('GPT-4o-mini (Cheapest, Fastest)', 'gpt-4o-mini'),
('GPT-4v (Deprecated. Most-Accurate, Slowest)', 'gpt-4-vision-preview'),
('GPT-4-Turbo (Least Accurate, Fast)', 'gpt-4-turbo'),
('Custom (Specify Settings Below)', 'custom')
]
for text, value in models:
ttk.Radiobutton(radio_frame, text=text, value=value, variable=self.model_var, bootstyle="info").pack(
anchor=ttk.W)
anchor=ttk.W, pady=5)

label_base_url = ttk.Label(self, text='Custom OpenAI-Like API Model Base URL', bootstyle="secondary")
label_base_url.pack(pady=10)
Expand Down Expand Up @@ -179,7 +180,7 @@ def create_widgets(self) -> None:
advanced_settings_button.pack(pady=(0, 10))

# Hyperlink Label
link_label = ttk.Label(self, text='Instructions', bootstyle="primary")
link_label = ttk.Label(self, text='Setup Instructions', bootstyle="primary")
link_label.pack()
link_label.bind('<Button-1>', lambda e: open_link(
'https://github.com/AmberSahdev/Open-Interface?tab=readme-ov-file#setup-%EF%B8%8F'))
Expand Down

0 comments on commit 3598b82

Please sign in to comment.