From 151aa051e260e5f7eb99ed5a3e04ca36089b2ded Mon Sep 17 00:00:00 2001 From: Zvonimir Sabljic Date: Tue, 12 Sep 2023 21:32:56 +0200 Subject: [PATCH] Improved debugging process and enabled splitting of app development into tasks and then into steps - split step implementation into different functions - standardized the return value in the implementation process - { "success": True } - added propagation of errors back to the recursion level 0 with TooDeepRecursionError and TokenLimitError - created new class Debugger and moved debugging in it --- pilot/const/code_execution.py | 1 + pilot/helpers/Project.py | 2 +- pilot/helpers/agents/Developer.py | 287 ++++++++++++++++++++---------- pilot/helpers/cli.py | 65 ++----- pilot/utils/llm_connection.py | 4 +- 5 files changed, 205 insertions(+), 154 deletions(-) diff --git a/pilot/const/code_execution.py b/pilot/const/code_execution.py index f4f3f4e..3ebdf22 100644 --- a/pilot/const/code_execution.py +++ b/pilot/const/code_execution.py @@ -1,4 +1,5 @@ MAX_COMMAND_DEBUG_TRIES = 3 +MAX_RECUSION_LAYER = 3 MIN_COMMAND_RUN_TIME = 2000 MAX_COMMAND_RUN_TIME = 30000 MAX_COMMAND_OUTPUT_LENGTH = 2000 diff --git a/pilot/helpers/Project.py b/pilot/helpers/Project.py index cff06e6..b89681b 100644 --- a/pilot/helpers/Project.py +++ b/pilot/helpers/Project.py @@ -290,4 +290,4 @@ class Project: if answer in cbs: return cbs[answer]() elif answer != '': - return answer + return { 'user_input': answer } diff --git a/pilot/helpers/agents/Developer.py b/pilot/helpers/agents/Developer.py index 66e6fbf..6792c14 100644 --- a/pilot/helpers/agents/Developer.py +++ b/pilot/helpers/agents/Developer.py @@ -3,6 +3,8 @@ import uuid from termcolor import colored from helpers.exceptions.TokenLimitError import TokenLimitError from const.code_execution import MAX_COMMAND_DEBUG_TRIES +from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError +from helpers.Debugger import Debugger from utils.questionary import styled_text from helpers.files import update_file from utils.utils import step_already_finished @@ -11,7 +13,7 @@ from logger.logger import logger from helpers.Agent import Agent from helpers.AgentConvo import AgentConvo from utils.utils import execute_step, array_of_objects_to_string, generate_app_data -from helpers.cli import build_directory_tree, run_command_until_success, execute_command_and_check_cli_response, debug +from helpers.cli import build_directory_tree, run_command_until_success, execute_command_and_check_cli_response from const.function_calls import FILTER_OS_TECHNOLOGIES, DEVELOPMENT_PLAN, EXECUTE_COMMANDS, GET_TEST_TYPE, DEV_TASKS_BREAKDOWN, IMPLEMENT_TASK from database.database import save_progress, get_progress_steps, save_file_description from utils.utils import get_os_info @@ -21,6 +23,7 @@ class Developer(Agent): def __init__(self, project): super().__init__('full_stack_developer', project) self.run_command = None + self.debugger = Debugger(self) def start_coding(self): self.project.current_step = 'coding' @@ -59,98 +62,70 @@ class Developer(Agent): task_steps = convo_dev_task.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK) convo_dev_task.remove_last_x_messages(2) - self.execute_task(convo_dev_task, task_steps, development_task=development_task, continue_development=True) + self.execute_task(convo_dev_task, task_steps, development_task=development_task, continue_development=True, is_root_task=True) - def execute_task(self, convo, task_steps, test_command=None, reset_convo=True, test_after_code_changes=True, continue_development=False, development_task=None): - function_uuid = str(uuid.uuid4()) - convo.save_branch(function_uuid) + def step_code_change(self, convo, step, i, test_after_code_changes): + if step['type'] == 'code_change' and 'code_change_description' in step: + # TODO this should be refactored so it always uses the same function call + print(f'Implementing code changes for `{step["code_change_description"]}`') + code_monkey = CodeMonkey(self.project, self) + updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i) + if test_after_code_changes: + return self.test_code_changes(code_monkey, updated_convo) + else: + return { "success": True } - for (i, step) in enumerate(task_steps): + elif step['type'] == 'code_change': + # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure + if 'code_change' not in step: + data = step + else: + data = step['code_change'] + self.project.save_file(data) + # TODO end - tries = 0 - max_retry_times = 1 + def step_command_run(self, convo, step, i): + # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure + if isinstance(step['command'], str): + data = step + else: + data = step['command'] + # TODO END + additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n' + return run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message) - step_uuid = str(uuid.uuid4()) - convo.save_branch(step_uuid) + def step_human_intervention(self, convo, step): + while True: + human_intervention_description = step['human_intervention_description'] + colored('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) if self.run_command is not None else step['human_intervention_description'] + response = self.project.ask_for_human_intervention('I need human intervention:', + human_intervention_description, + cbs={ 'r': lambda: run_command_until_success(self.run_command, None, convo, force=True, return_cli_response=True) }) - while tries < max_retry_times: - tries += 1 - try: - if reset_convo: - convo.load_branch(function_uuid) + if 'user_input' not in response: + continue - if max_retry_times > 1: - # this means that we are retrying the entire development step - convo.load_branch(step_uuid) + if response['user_input'] != 'continue': + return_value = self.debugger.debug(convo, user_input=response['user_input'], issue_description=step['human_intervention_description']) + return_value['user_input'] = response['user_input'] + return return_value + else: + return response - if step['type'] == 'command': - # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure - if isinstance(step['command'], str): - data = step - else: - data = step['command'] - # TODO END - additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n' - run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message) + def step_test(self, convo, test_command): + should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt', + test_command) + if should_rerun_command == 'NO': + return { "success": True } + elif should_rerun_command == 'YES': + cli_response, llm_response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo) + if llm_response == 'NEEDS_DEBUGGING': + print(colored(f'Got incorrect CLI response:', 'red')) + print(cli_response) + print(colored('-------------------', 'red')) - elif step['type'] == 'code_change' and 'code_change_description' in step: - # TODO this should be refactored so it always uses the same function call - print(f'Implementing code changes for `{step["code_change_description"]}`') - code_monkey = CodeMonkey(self.project, self) - updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i) - if test_after_code_changes: - self.test_code_changes(code_monkey, updated_convo) - - elif step['type'] == 'code_change': - # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure - if 'code_change' not in step: - data = step - else: - data = step['code_change'] - self.project.save_file(data) - # TODO end - - elif step['type'] == 'human_intervention': - human_intervention_description = step['human_intervention_description'] + colored('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) if self.run_command is not None else step['human_intervention_description'] - user_feedback = self.project.ask_for_human_intervention('I need human intervention:', - human_intervention_description, - cbs={ 'r': lambda: run_command_until_success(self.run_command, None, convo, force=True) }) - - if user_feedback is not None and user_feedback != 'continue': - debug(convo, user_input=user_feedback, issue_description=step['human_intervention_description']) - - if test_command is not None and ('check_if_fixed' not in step or step['check_if_fixed']): - should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt', - test_command) - if should_rerun_command == 'NO': - return True - elif should_rerun_command == 'YES': - cli_response, llm_response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo) - if llm_response == 'NEEDS_DEBUGGING': - print(colored(f'Got incorrect CLI response:', 'red')) - print(cli_response) - print(colored('-------------------', 'red')) - if llm_response == 'DONE': - return True - except TokenLimitError as e: - if max_retry_times >= MAX_COMMAND_DEBUG_TRIES: - print(colored('I can\'t figure this out - sorry! Closing...')) - exit(0) - - print(colored(f'\n--------- LLM Reached Token Limit ----------', 'red', attrs=['bold'])) - print(colored(f'Can I retry implementing the entire development step?', 'red', attrs=['bold'])) - - answer = styled_text( - self.project, - 'Type y/n' - ) - - if answer == 'y': - max_retry_times += 1 - else: - print(colored('Ok - exiting...', 'red', attrs=['bold'])) - exit(0) + return { "success": llm_response == 'DONE', "cli_response": cli_response, "llm_response": llm_response } + def task_postprocessing(self, convo, development_task, continue_development, task_result): self.run_command = convo.send_message('development/get_run_command.prompt', {}) if self.run_command.startswith('`'): self.run_command = self.run_command[1:] @@ -160,22 +135,130 @@ class Developer(Agent): if development_task is not None: convo.remove_last_x_messages(2) detailed_user_review_goal = convo.send_message('development/define_user_review_goal.prompt', {}) + convo.remove_last_x_messages(2) - if continue_development: - continue_description = detailed_user_review_goal if detailed_user_review_goal is not None else None - self.continue_development(convo, continue_description) + try: + if continue_development: + continue_description = detailed_user_review_goal if detailed_user_review_goal is not None else None + return self.continue_development(convo, continue_description) + except TooDeepRecursionError as e: + return self.dev_help_needed(e.message) + + return task_result + + def should_retry_step_implementation(self, step, step_implementation_try): + if step_implementation_try >= MAX_COMMAND_DEBUG_TRIES: + self.dev_help_needed(step) + + print(colored(f'\n--------- LLM Reached Token Limit ----------', 'red', attrs=['bold'])) + print(colored(f'Can I retry implementing the entire development step?', 'red', attrs=['bold'])) + + answer = '' + while answer != 'y': + answer = styled_text( + self.project, + 'Type y/n' + ) + + if answer == 'n': + return self.dev_help_needed(step) + + return { "success": False, "retry": True } + + def dev_help_needed(self, description): + + # TODO remove this + def extract_substring(s): + start_idx = s.find('```') + end_idx = s.find('```', start_idx + 3) + + if start_idx != -1 and end_idx != -1: + return s[start_idx + 3:end_idx] + else: + return s + # TODO end + + answer = '' + while answer != 'continue': + print(colored(f'\n----------------------------- I need your help ------------------------------', 'red', attrs=['bold'])) + print(colored(f'Please implement the following task', 'red', attrs=['bold'])) + + print(colored(extract_substring(description), 'red')) + print(colored(f'\n-----------------------------------------------------------------------------', 'red', attrs=['bold'])) + answer = styled_text( + self.project, + 'Once you\'re done, type "continue"?' + ) + + return { "success": True, "user_input": answer } + + def execute_task(self, convo, task_steps, test_command=None, reset_convo=True, + test_after_code_changes=True, continue_development=False, + development_task=None, is_root_task=False): + function_uuid = str(uuid.uuid4()) + convo.save_branch(function_uuid) + + for (i, step) in enumerate(task_steps): + + result = None + step_implementation_try = 0 + + while True: + try: + if reset_convo: + convo.load_branch(function_uuid) + + if step['type'] == 'command': + result = self.step_command_run(convo, step, i) + + elif step['type'] == 'code_change': + result = self.step_code_change(convo, step, i, test_after_code_changes) + + elif step['type'] == 'human_intervention': + result = self.step_human_intervention(convo, step) + + if test_command is not None and ('check_if_fixed' not in step or step['check_if_fixed']): + is_fixed = self.step_test(convo, test_command) + if is_fixed['success']: + return is_fixed + else: + result = is_fixed + + break + except TokenLimitError as e: + if is_root_task: + response = self.should_retry_step_implementation(step, step_implementation_try) + if 'retry' in response: + # TODO we can rewind this convo even more + convo.load_branch(function_uuid) + continue + elif 'success' in response: + result = response + break + else: + raise e + except TooDeepRecursionError as e: + if is_root_task: + result = self.dev_help_needed(step) + break + else: + raise e + + convo.load_branch(function_uuid) + return self.task_postprocessing(convo, development_task, continue_development, result) def continue_development(self, iteration_convo, continue_description=''): while True: user_description = ('Here is a description of what should be working: \n\n' + colored(continue_description, 'blue', attrs=['bold']) + '\n') if continue_description != '' else '' user_description = 'Can you check if the app works please? ' + user_description + '\nIf you want to run the app, ' + colored('just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) - continue_description = '' - user_feedback = self.project.ask_for_human_intervention( + # continue_description = '' + response = self.project.ask_for_human_intervention( user_description, - cbs={ 'r': lambda: run_command_until_success(self.run_command, None, iteration_convo, force=True) }) + cbs={ 'r': lambda: run_command_until_success(self.run_command, None, iteration_convo, force=True, return_cli_response=True, is_root_task=True) }) + user_feedback = response['user_input'] if 'user_input' in response else None if user_feedback == 'continue': - return True + return { "success": True, "user_input": user_feedback } if user_feedback is not None: iteration_convo = AgentConvo(self) @@ -192,11 +275,12 @@ class Developer(Agent): "user_input": user_feedback, }) - # debug(iteration_convo, user_input=user_feedback) + # self.debugger.debug(iteration_convo, user_input=user_feedback) task_steps = iteration_convo.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK) iteration_convo.remove_last_x_messages(2) - self.execute_task(iteration_convo, task_steps, continue_development=False) + + self.execute_task(iteration_convo, task_steps, is_root_task=True) def set_up_environment(self): @@ -274,17 +358,24 @@ class Developer(Agent): GET_TEST_TYPE) if test_type == 'command_test': - run_command_until_success(command['command'], command['timeout'], convo) + return run_command_until_success(command['command'], command['timeout'], convo) elif test_type == 'automated_test': - code_monkey.implement_code_changes(convo, automated_test_description, 0) + # TODO get code monkey to implement the automated test + pass elif test_type == 'manual_test': # TODO make the message better - user_feedback = self.project.ask_for_human_intervention( + response = self.project.ask_for_human_intervention( 'Message from Pilot: I need your help. Can you please test if this was successful?', manual_test_description ) - if user_feedback is not None: - debug(convo, user_input=user_feedback, issue_description=manual_test_description) + + user_feedback = response['user_input'] + if user_feedback is not None and user_feedback != 'continue': + return_value = self.debugger.debug(convo, user_input=user_feedback, issue_description=manual_test_description) + return_value['user_input'] = user_feedback + return return_value + else: + return { "success": True, "user_input": user_feedback } def implement_step(self, convo, step_index, type, description): # TODO remove hardcoded folder path diff --git a/pilot/helpers/cli.py b/pilot/helpers/cli.py index 3cee61e..b6ed889 100644 --- a/pilot/helpers/cli.py +++ b/pilot/helpers/cli.py @@ -10,6 +10,7 @@ import platform from termcolor import colored from database.database import get_command_run_from_hash_id, save_command_run from const.function_calls import DEBUG_STEPS_BREAKDOWN +from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError from utils.questionary import styled_text from const.code_execution import MAX_COMMAND_DEBUG_TRIES, MIN_COMMAND_RUN_TIME, MAX_COMMAND_RUN_TIME, MAX_COMMAND_OUTPUT_LENGTH @@ -251,7 +252,7 @@ def execute_command_and_check_cli_response(command, timeout, convo): { 'cli_response': cli_response, 'command': command }) return cli_response, response -def run_command_until_success(command, timeout, convo, additional_message=None, force=False): +def run_command_until_success(command, timeout, convo, additional_message=None, force=False, return_cli_response=False, is_root_task=False): """ Run a command until it succeeds or reaches a timeout. @@ -271,55 +272,13 @@ def run_command_until_success(command, timeout, convo, additional_message=None, print(cli_response) print(colored('-------------------', 'red')) - debug(convo, {'command': command, 'timeout': timeout}) - - - -def debug(convo, command=None, user_input=None, issue_description=None): - """ - Debug a conversation. - - Args: - convo (AgentConvo): The conversation object. - command (dict, optional): The command to debug. Default is None. - user_input (str, optional): User input for debugging. Default is None. - issue_description (str, optional): Description of the issue to debug. Default is None. - - Returns: - bool: True if debugging was successful, False otherwise. - """ - function_uuid = str(uuid.uuid4()) - convo.save_branch(function_uuid) - success = False - - for i in range(MAX_COMMAND_DEBUG_TRIES): - if success: - break - - convo.load_branch(function_uuid) - - debugging_plan = convo.send_message('dev_ops/debug.prompt', - { 'command': command['command'] if command is not None else None, 'user_input': user_input, 'issue_description': issue_description }, - DEBUG_STEPS_BREAKDOWN) - - # TODO refactor to nicely get the developer agent - success = convo.agent.project.developer.execute_task( - convo, - debugging_plan, - command, - False, - False) - - - if not success: - # TODO explain better how should the user approach debugging - # we can copy the entire convo to clipboard so they can paste it in the playground - user_input = convo.agent.project.ask_for_human_intervention( - 'It seems like I cannot debug this problem by myself. Can you please help me and try debugging it yourself?' if user_input is None else f'Can you check this again:\n{issue_description}?', - command - ) - - if user_input == 'continue': - success = True - - return success + try: + # This catch is necessary to return the correct value (cli_response) to continue development function so + # the developer can debug the appropriate issue + # this snippet represents the first entry point into debugging recursion because of return_cli_response + return convo.agent.debugger.debug(convo, {'command': command, 'timeout': timeout}, is_root_task=is_root_task) + except TooDeepRecursionError as e: + # this is only to put appropriate message in the response after TooDeepRecursionError is raised + raise TooDeepRecursionError(cli_response) if return_cli_response else e + else: + return { 'success': True, 'cli_response': cli_response } diff --git a/pilot/utils/llm_connection.py b/pilot/utils/llm_connection.py index 9c1e845..a14c1bb 100644 --- a/pilot/utils/llm_connection.py +++ b/pilot/utils/llm_connection.py @@ -123,7 +123,7 @@ def create_gpt_chat_completion(messages: List[dict], req_type, min_tokens=MIN_TO # Check if the error message is related to token limit if "context_length_exceeded" in error_message.lower(): - raise Exception(f'Too many tokens in the request. Please try to continue the project with some previous development step.') + raise TokenLimitError(tokens_in_messages + min_tokens, MAX_GPT_MODEL_TOKENS) else: print('The request to OpenAI API failed. Here is the error message:') print(e) @@ -153,7 +153,7 @@ def retry_on_exception(func): # If the specific error "context_length_exceeded" is present, simply return without retry if "context_length_exceeded" in err_str: - raise Exception("context_length_exceeded") + raise TokenLimitError(tokens_in_messages + min_tokens, MAX_GPT_MODEL_TOKENS) print(colored(f'There was a problem with request to openai API:', 'red')) print(err_str)