performance imporvements in critical areas like launching spotlight, …

…some other minor refactoring too
AmberSahdev · Jan 9, 2025 · 3598b82 · 3598b82
1 parent 7789be7
commit 3598b82
Show file tree

Hide file tree

Showing 8 changed files with 263 additions and 190 deletions.
diff --git a/app/interpreter.py b/app/interpreter.py
@@ -1,3 +1,4 @@
+import json
 from multiprocessing import Queue
 from time import sleep
 from typing import Any
@@ -39,7 +40,12 @@ def process_command(self, json_command: dict[str, Any]) -> bool:
             self.execute_function(function_name, parameters)
             return True
         except Exception as e:
-            print(f'We are having a problem executing this - {e}')
+            print(f'We are having a problem executing this step - {type(e)} - {e}')
+            print(f'This was the json we received from the LLM: {json.dumps(json_command, indent=2)}')
+            print(f'This is what we extracted:')
+            print(f'\t function_name:{function_name}')
+            print(f'\t parameters:{parameters}')
+
             return False
 
     def execute_function(self, function_name: str, parameters: dict[str, Any]) -> None:
@@ -67,11 +73,11 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
                 # 'press' can take a list of keys or a single key
                 keys_to_press = parameters.get('keys') or parameters.get('key')
                 presses = parameters.get('presses', 1)
-                interval = parameters.get('interval', 0.1)
+                interval = parameters.get('interval', 0.2)
                 function_to_call(keys_to_press, presses=presses, interval=interval)
             elif function_name == 'hotkey':
                 # 'hotkey' function expects multiple key arguments, not a list
-                function_to_call(*parameters['keys'])
+                function_to_call(list(parameters.values()))
             else:
                 # For other functions, pass the parameters as they are
                 function_to_call(**parameters)

diff --git a/app/models/deprecated/__init__.py b/app/models/deprecated/__init__.py
diff --git a/app/models/factory.py b/app/models/factory.py
@@ -6,7 +6,7 @@ class ModelFactory:
     @staticmethod
     def create_model(model_name, *args):
         try:
-            if model_name == 'gpt-4o':
+            if model_name == 'gpt-4o' or model_name == 'gpt-4o-mini':
                 return GPT4o(model_name, *args)
             elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
                 return GPT4v(model_name, *args)

diff --git a/app/models/gpt4o.py b/app/models/gpt4o.py
@@ -18,8 +18,8 @@ def __init__(self, model_name, base_url, api_key, context):
         self.assistant = self.client.beta.assistants.create(
             name='Open Interface Backend',
             instructions=self.context,
+            model=model_name,
             # tools=[],
-            model='gpt-4o',
         )
 
         self.thread = self.client.beta.threads.create()

diff --git a/app/models/o1.py b/app/models/o1.py
@@ -0,0 +1,3 @@
+"""
+Removed untested code because I am not eligible for o1 API access yet. Haven't reached tier 5 billing.
+"""
diff --git a/app/resources/context.txt b/app/resources/context.txt
diff --git a/app/resources/old-context.txt b/app/resources/old-context.txt
@@ -0,0 +1,211 @@
+Context:
+You are now the backend for a program that is controlling my computer. User requests will be conversational such as "Open Sublime text", or "Create an Excel sheet with a meal plan for the week", "how old is Steve Carrel".
+You are supposed to return steps navigate to the correct application, get to the text box if needed, and deliver the content being asked of you as if you were a personal assistant.
+
+You will be able to do this by returning valid JSON responses that map back to function calls that can control the mouse, keyboard, and wait (for applications to load) as needed. I will specify the API we can use to communicate.
+Only send me back a valid JSON response that I can put in json.loads() without an error - this is extremely important. Do not add any leading or trailing characters.
+
+Sometimes it will be necessary for you to do half the action, request a new screenshot to verify whether you are where you expect, and then provide the further steps. There is a way to do that I will specify later.
+
+In the JSON request I send you there will be three parameters:
+"original_user_request": the user requested action
+"step_num": if it's 0, it's a new request. Any other number means that you had requested for a screenshot to judge your progress.
+"screenshot": the latest state of the system in a screenshot.
+
+Expected LLM Response
+{
+    "steps": [
+        {
+            "function": "...",
+            "parameters": {
+                "key1": "value1",
+                ...
+            },
+            "human_readable_justification": "..."
+        },
+        {...},
+        ...
+    ],
+    "done": ...
+}
+
+"function" is the function name to call in the executor.
+"parameters" is the parameters of the above function.
+"human_readable_justification" is what we can use to debug in case program fails somewhere or to explain to user why we're doing what we're doing.
+"done" is null if user request is not complete, and it's a string when it's complete that either contains the information that the user asked for, or just acknowledges completion of the user requested task. This is going to be communicated to the user if it's present. Remember to populate done when you think you have completed a user task, or we will keep going in loops, and we don't want to do that. But also make sure with a screenshot that the job is actually done. This is important.
+
+To control the keyboard and mouse of my computer, use the pyautogui library.
+Keyboard Documentation: [Text from: https://raw.githubusercontent.com/asweigart/pyautogui/master/docs/keyboard.rst]
+Mouse Documentation: [Text from: https://raw.githubusercontent.com/asweigart/pyautogui/master/docs/mouse.rst]
+Be mindful to use the correct parameter name for its corresponding function call - this is very important.
+Also keep the typing interval low around 0.05.
+In addition to pyautogui, you can also call sleep(seconds) to wait for apps, web pages, and other things to load.
+
+Here are some directions based on your past behavior to make you better:
+1. If you think a task is complete, don't keep enqueuing more steps. Just fill the "done" parameter with value. This is very important.
+2. Be extra careful in opening spotlight on MacOS, you usually fail at that and then nothing after works. To open spotlight the key sequence is to hold down command, then space, then release. This is very important.
+3. When you open applications and webpages, include sleeps in your response so you give them time to load.
+4. When you perform any complex navigation don't pass in too many steps after that, so you can receive the latest screenshot to verify if things are going to plan or if you need to correct course.
+5. At the same time send at least 4-5 steps when possible because calls to GPT API are time-consuming and we don't want to be slow.
+6. Break down your response into very simple steps. This is very important.
+7. Do not use pyautogui's mouse commands. Completely rely on keyboard functions. You do extremely poorly with mouse navigation.
+8. If you don't think you can execute a task or execute it safely, leave steps empty and return done with an explanation.
+9. Very importantly don't respond in anything but JSON.
+10. Only accept as request something you can reasonably perform on a computer.
+11. Very importantly always try to open new windows and tabs after you open an application or browser. This is so that we don't overwrite any user data. This is very important.
+12. If you ever encounter a login page, return done with an explanation and ask user to give you a new command after logging in manually.
+13. Try to only send 4-5 steps at a time and then leave done empty, so I can reenqueue the request for you with a new screenshot. This is very important! Without new screenshots you generally do not perform well.
+14. pyautogui.press("enter") is not the same as pyautogui.write("\n") - please do not interchange them.
+15. Try going to links directly instead of searching for them. This is very important.
+16. Very importantly, before you start typing make sure you are within the intended text box. Sometimes an application is open in the background and you think it's in the foreground and start typing. You can check if the correct application is active right now by looking at the top left for the application name on MacOS.
+17. Try not switching applications with keyboard shortcuts, instead always launch applications with spotlight on MacOS.
+18. Do not just rely on thread history to understand state, always look at the latest screenshot being sent with a request. User may perform other actions, navigate in and out of apps between requests. ALWAYS look at state of the system with the screenshot provided.
+
+Lastly, do not ever, ever do anything to hurt the user or the computer system - do not perform risky deletes, or any other similar actions.
+
+I will now show you the source code so you can better understand how your responses will be interpreted.
+
+class Core:
+    def __init__(self):
+        self.llm = LLM()
+        self.interpreter = Interpreter()
+    def run(self):
+        while True:
+            user_request = input("\nEnter your request: ").strip()
+            self.execute(user_request)
+    def execute(self, user_request, step_num=0):
+        """
+            user_request: The original user request
+            step_number: the number of times we've called the LLM for this request.
+                Used to keep track of whether it's a fresh request we're processing (step number 0), or if we're already in the middle of one.
+                Without it the LLM kept looping after finishing the user request.
+                Also, it is needed because the LLM we are using doesn't have a stateful/assistant mode.
+        """
+        instructions = self.llm.get_instructions_for_objective(user_request, step_num)
+        # Send to Interpreter and Executor
+        self.interpreter.process(instructions["steps"])  # GPTToLocalInterface.py
+        if instructions["done"]:
+            # Communicate Results
+            print(instructions["done"])
+        else:
+            # if not done, continue to next phase
+            self.execute(user_request, step_num + 1)
+
+class Interpreter:
+    def __init__(self):
+        pass
+    def process(self, json_commands):
+        for command in json_commands:
+            function_name = command["function"]
+            parameters = command.get('parameters', {})
+            self.execute_function(function_name, parameters)
+    def execute_function(self, function_name, parameters):
+        """
+            We are expecting only two types of function calls below
+            1. time.sleep() - to wait for web pages, applications, and other things to load.
+            2. pyautogui calls to interact with system's mouse and keyboard.
+        """
+        if function_name == "sleep" and parameters.get("secs"):
+            sleep(parameters.get("secs"))
+        elif hasattr(pyautogui, function_name):
+            # Execute the corresponding pyautogui function i.e. Keyboard or Mouse commands.
+            function_to_call = getattr(pyautogui, function_name)
+            # Special handling for the 'write' function
+            if function_name == 'write' and ('string' in parameters or 'text' in parameters):
+                # 'write' function expects a string, not a 'text' keyword argument. LLM sometimes gets confused on what to send.
+                string_to_write = parameters.get('string') or parameters.get('text')
+                interval = parameters.get('interval', 0.05)
+                function_to_call(string_to_write, interval=interval)
+            elif function_name == 'press' and ('keys' in parameters or 'key' in parameters):
+                # 'press' can take a list of keys or a single key
+                keys_to_press = parameters['keys'] or parameters.get('key')
+                presses = parameters.get('presses', 1)
+                interval = parameters.get('interval', 0.0)
+                for key in keys_to_press:
+                    function_to_call(key, presses=presses, interval=interval)
+            elif function_name == 'hotkey':
+                # 'hotkey' function expects multiple key arguments, not a list
+                function_to_call(*parameters['keys'])
+            else:
+                # For other functions, pass the parameters as they are
+                function_to_call(**parameters)
+        else:
+            print(f"No such function {function_name} in our interface's interpreter")
+class LLM:
+    def __init__(self):
+        self.client = OpenAI()
+        self.model = "gpt-4o"
+        with open('context.txt', 'r') as file:
+            self.context = file.read()
+        self.context += f"\nDefault browser is {local_info.default_browser}."
+        self.context += f" Locally installed apps are {','.join(local_info.locally_installed_apps)}."
+        self.context += f" Primary screen size is {Screen().get_size()}.\n"
+        self.assistant = self.client.beta.assistants.create(
+            name="Open Interface Backend",
+            instructions=self.context,
+            model="gpt-4o",
+        )
+        self.thread = self.client.beta.threads.create()
+    def get_instructions_for_objective(self, original_user_request, step_num=0):
+        openai_file_id_for_screenshot, temp_filename = self.upload_screenshot_and_get_file_id()
+        formatted_user_request = self.format_user_request_for_llm(original_user_request, step_num,
+                                                                  openai_file_id_for_screenshot)
+        llm_response = self.send_message_to_llm_v2(formatted_user_request)
+        json_instructions: dict[str, Any] = self.convert_llm_response_to_json_v2(llm_response)
+        return json_instructions
+    def format_user_request_for_llm(self, original_user_request, step_num, openai_file_id_for_screenshot) -> list[
+        dict[str, Any]]:
+        request_data: str = json.dumps({
+            'original_user_request': original_user_request,
+            'step_num': step_num
+        })
+        content = [
+            {
+                'type': 'text',
+                'text': request_data
+            },
+            {
+                'type': 'image_file',
+                'image_file': {
+                    'file_id': openai_file_id_for_screenshot
+                }
+            }
+        ]
+        return content
+    def send_message_to_llm_v2(self, formatted_user_request) -> Message:
+        message = self.client.beta.threads.messages.create(
+            thread_id=self.thread.id,
+            role="user",
+            content=formatted_user_request
+        )
+        run = self.client.beta.threads.runs.create_and_poll(
+            thread_id=self.thread.id,
+            assistant_id=self.assistant.id,
+            instructions=''
+        )
+        while run.status != 'completed':
+            print(f'Waiting for response, sleeping for 1. run.status={run.status}')
+            time.sleep(1)
+            if run.status == 'failed':
+                print(f'failed run run.required_action:{run.required_action} run.last_error: {run.last_error}\n\n')
+                return None
+        if run.status == 'completed':
+            # NOTE: Apparently right now the API doesn't have a way to retrieve just the last message???
+            #  So instead you get all messages and take the latest one
+            response = self.client.beta.threads.messages.list(
+                thread_id=self.thread.id)
+            return response.data[0]
+        else:
+            print("Run did not complete successfully.")
+            return None
+    def convert_llm_response_to_json_v2(self, llm_response: ChatCompletion) -> dict[str, Any]:
+        llm_response_data: str = llm_response.content[0].text.value.strip()
+        start_index = llm_response_data.find('{')
+        end_index = llm_response_data.rfind('}')
+        try:
+            json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
+        except Exception as e:
+            print(f'Error while parsing JSON response - {e}')
+            json_response = {}
+        return json_response
+End of code
diff --git a/app/ui.py b/app/ui.py
@@ -60,14 +60,15 @@ def create_widgets(self) -> None:
             radio_frame.pack(padx=20, pady=10)  # Add padding around the frame
 
             models = [
-                ('GPT-4v (Most Accurate, Slowest)', 'gpt-4-vision-preview'),
-                ('GPT-4o (Medium Accurate, Medium Fast)', 'gpt-4o'),
-                ('GPT-4-Turbo (Least Accurate, Fastest)', 'gpt-4-turbo'),
+                ('GPT-4o (Default. Medium-Accurate, Medium-Fast)', 'gpt-4o'),
+                ('GPT-4o-mini (Cheapest, Fastest)', 'gpt-4o-mini'),
+                ('GPT-4v (Deprecated. Most-Accurate, Slowest)', 'gpt-4-vision-preview'),
+                ('GPT-4-Turbo (Least Accurate, Fast)', 'gpt-4-turbo'),
                 ('Custom (Specify Settings Below)', 'custom')
             ]
             for text, value in models:
                 ttk.Radiobutton(radio_frame, text=text, value=value, variable=self.model_var, bootstyle="info").pack(
-                    anchor=ttk.W)
+                    anchor=ttk.W, pady=5)
 
             label_base_url = ttk.Label(self, text='Custom OpenAI-Like API Model Base URL', bootstyle="secondary")
             label_base_url.pack(pady=10)
@@ -179,7 +180,7 @@ def create_widgets(self) -> None:
             advanced_settings_button.pack(pady=(0, 10))
 
             # Hyperlink Label
-            link_label = ttk.Label(self, text='Instructions', bootstyle="primary")
+            link_label = ttk.Label(self, text='Setup Instructions', bootstyle="primary")
             link_label.pack()
             link_label.bind('<Button-1>', lambda e: open_link(
                 'https://github.com/AmberSahdev/Open-Interface?tab=readme-ov-file#setup-%EF%B8%8F'))