Improved debugging process and enabled splitting of app development into tasks and then into steps

- split step implementation into different functions
- standardized the return value in the implementation process - { "success": True }
- added propagation of errors back to the recursion level 0 with TooDeepRecursionError and TokenLimitError
- created new class Debugger and moved debugging into it
2026-01-23 10:28:53 +01:00 · 2023-09-12 21:32:56 +02:00
parent 19ac692509
commit 151aa051e2
5 changed files with 205 additions and 154 deletions
--- a/pilot/helpers/agents/Developer.py
+++ b/pilot/helpers/agents/Developer.py
@@ -3,6 +3,8 @@ import uuid
 from termcolor import colored
 from helpers.exceptions.TokenLimitError import TokenLimitError
 from const.code_execution import MAX_COMMAND_DEBUG_TRIES
+from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
+from helpers.Debugger import Debugger
 from utils.questionary import styled_text
 from helpers.files import update_file
 from utils.utils import step_already_finished
@@ -11,7 +13,7 @@ from logger.logger import logger
 from helpers.Agent import Agent
 from helpers.AgentConvo import AgentConvo
 from utils.utils import execute_step, array_of_objects_to_string, generate_app_data
-from helpers.cli import build_directory_tree, run_command_until_success, execute_command_and_check_cli_response, debug
+from helpers.cli import build_directory_tree, run_command_until_success, execute_command_and_check_cli_response
 from const.function_calls import FILTER_OS_TECHNOLOGIES, DEVELOPMENT_PLAN, EXECUTE_COMMANDS, GET_TEST_TYPE, DEV_TASKS_BREAKDOWN, IMPLEMENT_TASK
 from database.database import save_progress, get_progress_steps, save_file_description
 from utils.utils import get_os_info
@@ -21,6 +23,7 @@ class Developer(Agent):
    def __init__(self, project):
        super().__init__('full_stack_developer', project)
        self.run_command = None
+        self.debugger = Debugger(self)

    def start_coding(self):
        self.project.current_step = 'coding'
@@ -59,98 +62,70 @@ class Developer(Agent):

        task_steps = convo_dev_task.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK)
        convo_dev_task.remove_last_x_messages(2)
-        self.execute_task(convo_dev_task, task_steps, development_task=development_task, continue_development=True)
+        self.execute_task(convo_dev_task, task_steps, development_task=development_task, continue_development=True, is_root_task=True)

-    def execute_task(self, convo, task_steps, test_command=None, reset_convo=True, test_after_code_changes=True, continue_development=False, development_task=None):
-        function_uuid = str(uuid.uuid4())
-        convo.save_branch(function_uuid)
+    def step_code_change(self, convo, step, i, test_after_code_changes):
+        if step['type'] == 'code_change' and 'code_change_description' in step:
+            # TODO this should be refactored so it always uses the same function call
+            print(f'Implementing code changes for `{step["code_change_description"]}`')
+            code_monkey = CodeMonkey(self.project, self)
+            updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i)
+            if test_after_code_changes:
+                return self.test_code_changes(code_monkey, updated_convo)
+            else:
+                return { "success": True }

-        for (i, step) in enumerate(task_steps):
+        elif step['type'] == 'code_change':
+            # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
+            if 'code_change' not in step:
+                data = step
+            else:
+                data = step['code_change']
+            self.project.save_file(data)
+            # TODO end

-            tries = 0
-            max_retry_times = 1
+    def step_command_run(self, convo, step, i):
+        # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
+        if isinstance(step['command'], str):
+            data = step
+        else:
+            data = step['command']
+        # TODO END
+        additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n'
+        return run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message)

-            step_uuid = str(uuid.uuid4())
-            convo.save_branch(step_uuid)
+    def step_human_intervention(self, convo, step):
+        while True:
+            human_intervention_description = step['human_intervention_description'] + colored('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) if self.run_command is not None else step['human_intervention_description']
+            response = self.project.ask_for_human_intervention('I need human intervention:',
+                human_intervention_description,
+                cbs={ 'r': lambda: run_command_until_success(self.run_command, None, convo, force=True, return_cli_response=True) })

-            while tries < max_retry_times:
-                tries += 1
-                try:
-                    if reset_convo:
-                        convo.load_branch(function_uuid)
+            if 'user_input' not in response:
+                continue

-                    if max_retry_times > 1:
-                        # this means that we are retrying the entire development step
-                        convo.load_branch(step_uuid)
+            if response['user_input'] != 'continue':
+                return_value = self.debugger.debug(convo, user_input=response['user_input'], issue_description=step['human_intervention_description'])
+                return_value['user_input'] = response['user_input']
+                return return_value
+            else:
+                return response

-                    if step['type'] == 'command':
-                        # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
-                        if isinstance(step['command'], str):
-                            data = step
-                        else:
-                            data = step['command']
-                        # TODO END
-                        additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n'
-                        run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message)
+    def step_test(self, convo, test_command):
+        should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt',
+            test_command)
+        if should_rerun_command == 'NO':
+            return { "success": True }
+        elif should_rerun_command == 'YES':
+            cli_response, llm_response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo)
+            if llm_response == 'NEEDS_DEBUGGING':
+                print(colored(f'Got incorrect CLI response:', 'red'))
+                print(cli_response)
+                print(colored('-------------------', 'red'))

-                    elif step['type'] == 'code_change' and 'code_change_description' in step:
-                        # TODO this should be refactored so it always uses the same function call
-                        print(f'Implementing code changes for `{step["code_change_description"]}`')
-                        code_monkey = CodeMonkey(self.project, self)
-                        updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i)
-                        if test_after_code_changes:
-                            self.test_code_changes(code_monkey, updated_convo)
-
-                    elif step['type'] == 'code_change':
-                        # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
-                        if 'code_change' not in step:
-                            data = step
-                        else:
-                            data = step['code_change']
-                        self.project.save_file(data)
-                        # TODO end
-
-                    elif step['type'] == 'human_intervention':
-                        human_intervention_description = step['human_intervention_description'] + colored('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) if self.run_command is not None else step['human_intervention_description']
-                        user_feedback = self.project.ask_for_human_intervention('I need human intervention:',
-                            human_intervention_description,
-                            cbs={ 'r': lambda: run_command_until_success(self.run_command, None, convo, force=True) })
-
-                        if user_feedback is not None and user_feedback != 'continue':
-                            debug(convo, user_input=user_feedback, issue_description=step['human_intervention_description'])
-
-                    if test_command is not None and ('check_if_fixed' not in step or step['check_if_fixed']):
-                        should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt',
-                            test_command)
-                        if should_rerun_command == 'NO':
-                            return True
-                        elif should_rerun_command == 'YES':
-                            cli_response, llm_response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo)
-                            if llm_response == 'NEEDS_DEBUGGING':
-                                print(colored(f'Got incorrect CLI response:', 'red'))
-                                print(cli_response)
-                                print(colored('-------------------', 'red'))
-                            if llm_response == 'DONE':
-                                return True
-                except TokenLimitError as e:
-                    if max_retry_times >= MAX_COMMAND_DEBUG_TRIES:
-                        print(colored('I can\'t figure this out - sorry! Closing...'))
-                        exit(0)
-
-                    print(colored(f'\n--------- LLM Reached Token Limit ----------', 'red', attrs=['bold']))
-                    print(colored(f'Can I retry implementing the entire development step?', 'red', attrs=['bold']))
-
-                    answer = styled_text(
-                        self.project,
-                        'Type y/n'
-                    )
-
-                    if answer == 'y':
-                        max_retry_times += 1
-                    else:
-                        print(colored('Ok - exiting...', 'red', attrs=['bold']))
-                        exit(0)
+            return { "success": llm_response == 'DONE', "cli_response": cli_response, "llm_response": llm_response }

+    def task_postprocessing(self, convo, development_task, continue_development, task_result):
        self.run_command = convo.send_message('development/get_run_command.prompt', {})
        if self.run_command.startswith('`'):
            self.run_command = self.run_command[1:]
@@ -160,22 +135,130 @@ class Developer(Agent):
        if development_task is not None:
            convo.remove_last_x_messages(2)
            detailed_user_review_goal = convo.send_message('development/define_user_review_goal.prompt', {})
+            convo.remove_last_x_messages(2)

-        if continue_development:
-            continue_description = detailed_user_review_goal if detailed_user_review_goal is not None else None
-            self.continue_development(convo, continue_description)
+        try:
+            if continue_development:
+                continue_description = detailed_user_review_goal if detailed_user_review_goal is not None else None
+                return self.continue_development(convo, continue_description)
+        except TooDeepRecursionError as e:
+            return self.dev_help_needed(e.message)
+
+        return task_result
+
+    def should_retry_step_implementation(self, step, step_implementation_try):
+        if step_implementation_try >= MAX_COMMAND_DEBUG_TRIES:
+            self.dev_help_needed(step)
+
+        print(colored(f'\n--------- LLM Reached Token Limit ----------', 'red', attrs=['bold']))
+        print(colored(f'Can I retry implementing the entire development step?', 'red', attrs=['bold']))
+
+        answer = ''
+        while answer != 'y':
+            answer = styled_text(
+                self.project,
+                'Type y/n'
+            )
+
+            if answer == 'n':
+                return self.dev_help_needed(step)
+
+        return { "success": False, "retry": True }
+
+    def dev_help_needed(self, description):
+
+        # TODO remove this
+        def extract_substring(s):
+            start_idx = s.find('```')
+            end_idx = s.find('```', start_idx + 3)
+
+            if start_idx != -1 and end_idx != -1:
+                return s[start_idx + 3:end_idx]
+            else:
+                return s
+        # TODO end
+
+        answer = ''
+        while answer != 'continue':
+            print(colored(f'\n----------------------------- I need your help ------------------------------', 'red', attrs=['bold']))
+            print(colored(f'Please implement the following task', 'red', attrs=['bold']))
+
+            print(colored(extract_substring(description), 'red'))
+            print(colored(f'\n-----------------------------------------------------------------------------', 'red', attrs=['bold']))
+            answer = styled_text(
+                self.project,
+                'Once you\'re done, type "continue"?'
+            )
+
+        return { "success": True, "user_input": answer }
+
+    def execute_task(self, convo, task_steps, test_command=None, reset_convo=True,
+                     test_after_code_changes=True, continue_development=False,
+                     development_task=None, is_root_task=False):
+        function_uuid = str(uuid.uuid4())
+        convo.save_branch(function_uuid)
+
+        for (i, step) in enumerate(task_steps):
+
+            result = None
+            step_implementation_try = 0
+
+            while True:
+                try:
+                    if reset_convo:
+                        convo.load_branch(function_uuid)
+
+                    if step['type'] == 'command':
+                        result = self.step_command_run(convo, step, i)
+
+                    elif step['type'] == 'code_change':
+                        result = self.step_code_change(convo, step, i, test_after_code_changes)
+
+                    elif step['type'] == 'human_intervention':
+                        result = self.step_human_intervention(convo, step)
+
+                    if test_command is not None and ('check_if_fixed' not in step or step['check_if_fixed']):
+                        is_fixed = self.step_test(convo, test_command)
+                        if is_fixed['success']:
+                            return is_fixed
+                        else:
+                            result = is_fixed
+
+                    break
+                except TokenLimitError as e:
+                    if is_root_task:
+                        response = self.should_retry_step_implementation(step, step_implementation_try)
+                        if 'retry' in response:
+                            # TODO we can rewind this convo even more
+                            convo.load_branch(function_uuid)
+                            continue
+                        elif 'success' in response:
+                            result = response
+                            break
+                    else:
+                        raise e
+                except TooDeepRecursionError as e:
+                    if is_root_task:
+                        result = self.dev_help_needed(step)
+                        break
+                    else:
+                        raise e
+
+        convo.load_branch(function_uuid)
+        return self.task_postprocessing(convo, development_task, continue_development, result)

    def continue_development(self, iteration_convo, continue_description=''):
        while True:
            user_description = ('Here is a description of what should be working: \n\n' + colored(continue_description, 'blue', attrs=['bold']) + '\n') if continue_description != '' else ''
            user_description = 'Can you check if the app works please? ' + user_description + '\nIf you want to run the app, ' + colored('just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold'])
-            continue_description = ''
-            user_feedback = self.project.ask_for_human_intervention(
+            # continue_description = ''
+            response = self.project.ask_for_human_intervention(
                user_description,
-                cbs={ 'r': lambda: run_command_until_success(self.run_command, None, iteration_convo, force=True) })
+                cbs={ 'r': lambda: run_command_until_success(self.run_command, None, iteration_convo, force=True, return_cli_response=True, is_root_task=True) })

+            user_feedback = response['user_input'] if 'user_input' in response else None
            if user_feedback == 'continue':
-                return True
+                return { "success": True, "user_input": user_feedback }

            if user_feedback is not None:
                iteration_convo = AgentConvo(self)
@@ -192,11 +275,12 @@ class Developer(Agent):
                    "user_input": user_feedback,
                })

-                # debug(iteration_convo, user_input=user_feedback)
+                # self.debugger.debug(iteration_convo, user_input=user_feedback)

                task_steps = iteration_convo.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK)
                iteration_convo.remove_last_x_messages(2)
-                self.execute_task(iteration_convo, task_steps, continue_development=False)
+
+                self.execute_task(iteration_convo, task_steps, is_root_task=True)


    def set_up_environment(self):
@@ -274,17 +358,24 @@ class Developer(Agent):
            GET_TEST_TYPE)

        if test_type == 'command_test':
-            run_command_until_success(command['command'], command['timeout'], convo)
+            return run_command_until_success(command['command'], command['timeout'], convo)
        elif test_type == 'automated_test':
-            code_monkey.implement_code_changes(convo, automated_test_description, 0)
+            # TODO get code monkey to implement the automated test
+            pass
        elif test_type == 'manual_test':
            # TODO make the message better
-            user_feedback = self.project.ask_for_human_intervention(
+            response = self.project.ask_for_human_intervention(
                'Message from Pilot: I need your help. Can you please test if this was successful?',
                manual_test_description
            )
-            if user_feedback is not None:
-                debug(convo, user_input=user_feedback, issue_description=manual_test_description)
+
+            user_feedback = response['user_input']
+            if user_feedback is not None and user_feedback != 'continue':
+                return_value = self.debugger.debug(convo, user_input=user_feedback, issue_description=manual_test_description)
+                return_value['user_input'] = user_feedback
+                return return_value
+            else:
+                return { "success": True, "user_input": user_feedback }

    def implement_step(self, convo, step_index, type, description):
        # TODO remove hardcoded folder path