Improved debugging process and enabled splitting of app development into tasks and then into steps

- split step implementation into different functions
- standardized the return value in the implementation process - { "success": True }
- added propagation of errors back to the recursion level 0 with TooDeepRecursionError and TokenLimitError
- created a new Debugger class and moved the debugging logic into it
This commit is contained in:
Zvonimir Sabljic
2023-09-12 21:32:56 +02:00
parent 19ac692509
commit 151aa051e2
5 changed files with 205 additions and 154 deletions

View File

@@ -1,4 +1,5 @@
MAX_COMMAND_DEBUG_TRIES = 3
MAX_RECUSION_LAYER = 3
MIN_COMMAND_RUN_TIME = 2000
MAX_COMMAND_RUN_TIME = 30000
MAX_COMMAND_OUTPUT_LENGTH = 2000

View File

@@ -290,4 +290,4 @@ class Project:
if answer in cbs:
return cbs[answer]()
elif answer != '':
return answer
return { 'user_input': answer }

View File

@@ -3,6 +3,8 @@ import uuid
from termcolor import colored
from helpers.exceptions.TokenLimitError import TokenLimitError
from const.code_execution import MAX_COMMAND_DEBUG_TRIES
from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
from helpers.Debugger import Debugger
from utils.questionary import styled_text
from helpers.files import update_file
from utils.utils import step_already_finished
@@ -11,7 +13,7 @@ from logger.logger import logger
from helpers.Agent import Agent
from helpers.AgentConvo import AgentConvo
from utils.utils import execute_step, array_of_objects_to_string, generate_app_data
from helpers.cli import build_directory_tree, run_command_until_success, execute_command_and_check_cli_response, debug
from helpers.cli import build_directory_tree, run_command_until_success, execute_command_and_check_cli_response
from const.function_calls import FILTER_OS_TECHNOLOGIES, DEVELOPMENT_PLAN, EXECUTE_COMMANDS, GET_TEST_TYPE, DEV_TASKS_BREAKDOWN, IMPLEMENT_TASK
from database.database import save_progress, get_progress_steps, save_file_description
from utils.utils import get_os_info
@@ -21,6 +23,7 @@ class Developer(Agent):
def __init__(self, project):
super().__init__('full_stack_developer', project)
self.run_command = None
self.debugger = Debugger(self)
def start_coding(self):
self.project.current_step = 'coding'
@@ -59,98 +62,70 @@ class Developer(Agent):
task_steps = convo_dev_task.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK)
convo_dev_task.remove_last_x_messages(2)
self.execute_task(convo_dev_task, task_steps, development_task=development_task, continue_development=True)
self.execute_task(convo_dev_task, task_steps, development_task=development_task, continue_development=True, is_root_task=True)
def execute_task(self, convo, task_steps, test_command=None, reset_convo=True, test_after_code_changes=True, continue_development=False, development_task=None):
function_uuid = str(uuid.uuid4())
convo.save_branch(function_uuid)
def step_code_change(self, convo, step, i, test_after_code_changes):
if step['type'] == 'code_change' and 'code_change_description' in step:
# TODO this should be refactored so it always uses the same function call
print(f'Implementing code changes for `{step["code_change_description"]}`')
code_monkey = CodeMonkey(self.project, self)
updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i)
if test_after_code_changes:
return self.test_code_changes(code_monkey, updated_convo)
else:
return { "success": True }
for (i, step) in enumerate(task_steps):
elif step['type'] == 'code_change':
# TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
if 'code_change' not in step:
data = step
else:
data = step['code_change']
self.project.save_file(data)
# TODO end
tries = 0
max_retry_times = 1
def step_command_run(self, convo, step, i):
    """
    Execute a step of type `command` through `run_command_until_success`.

    Args:
        convo: The conversation the command is executed in.
        step (dict): The step to run. The command data is either nested under
            `step['command']` or, when the LLM flattened the structure, stored
            directly on the step (then `step['command']` is the command string).
        i (int): Zero-based index of the step within the task.

    Returns:
        Whatever `run_command_until_success` returns for this command.
    """
    # TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
    if isinstance(step['command'], str):
        data = step
    else:
        data = step['command']
    # TODO END

    if i == 0:
        additional_message = 'Let\'s start with the step #0:\n\n'
    else:
        # Steps are numbered from 0, so once #0..#(i-1) are finished the next one is #i
        # (the previous `#{i + 1}` skipped a number).
        finished_steps = ', '.join(f'#{j}' for j in range(i))
        additional_message = f'So far, steps { finished_steps } are finished so let\'s do step #{i} now.\n\n'

    return run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message)
step_uuid = str(uuid.uuid4())
convo.save_branch(step_uuid)
def step_human_intervention(self, convo, step):
while True:
human_intervention_description = step['human_intervention_description'] + colored('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) if self.run_command is not None else step['human_intervention_description']
response = self.project.ask_for_human_intervention('I need human intervention:',
human_intervention_description,
cbs={ 'r': lambda: run_command_until_success(self.run_command, None, convo, force=True, return_cli_response=True) })
while tries < max_retry_times:
tries += 1
try:
if reset_convo:
convo.load_branch(function_uuid)
if 'user_input' not in response:
continue
if max_retry_times > 1:
# this means that we are retrying the entire development step
convo.load_branch(step_uuid)
if response['user_input'] != 'continue':
return_value = self.debugger.debug(convo, user_input=response['user_input'], issue_description=step['human_intervention_description'])
return_value['user_input'] = response['user_input']
return return_value
else:
return response
if step['type'] == 'command':
# TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
if isinstance(step['command'], str):
data = step
else:
data = step['command']
# TODO END
additional_message = 'Let\'s start with the step #0:\n\n' if i == 0 else f'So far, steps { ", ".join(f"#{j}" for j in range(i)) } are finished so let\'s do step #{i + 1} now.\n\n'
run_command_until_success(data['command'], data['timeout'], convo, additional_message=additional_message)
def step_test(self, convo, test_command):
should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt',
test_command)
if should_rerun_command == 'NO':
return { "success": True }
elif should_rerun_command == 'YES':
cli_response, llm_response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo)
if llm_response == 'NEEDS_DEBUGGING':
print(colored(f'Got incorrect CLI response:', 'red'))
print(cli_response)
print(colored('-------------------', 'red'))
elif step['type'] == 'code_change' and 'code_change_description' in step:
# TODO this should be refactored so it always uses the same function call
print(f'Implementing code changes for `{step["code_change_description"]}`')
code_monkey = CodeMonkey(self.project, self)
updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i)
if test_after_code_changes:
self.test_code_changes(code_monkey, updated_convo)
elif step['type'] == 'code_change':
# TODO fix this - the problem is in GPT response that sometimes doesn't return the correct JSON structure
if 'code_change' not in step:
data = step
else:
data = step['code_change']
self.project.save_file(data)
# TODO end
elif step['type'] == 'human_intervention':
human_intervention_description = step['human_intervention_description'] + colored('\n\nIf you want to run the app, just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold']) if self.run_command is not None else step['human_intervention_description']
user_feedback = self.project.ask_for_human_intervention('I need human intervention:',
human_intervention_description,
cbs={ 'r': lambda: run_command_until_success(self.run_command, None, convo, force=True) })
if user_feedback is not None and user_feedback != 'continue':
debug(convo, user_input=user_feedback, issue_description=step['human_intervention_description'])
if test_command is not None and ('check_if_fixed' not in step or step['check_if_fixed']):
should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt',
test_command)
if should_rerun_command == 'NO':
return True
elif should_rerun_command == 'YES':
cli_response, llm_response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo)
if llm_response == 'NEEDS_DEBUGGING':
print(colored(f'Got incorrect CLI response:', 'red'))
print(cli_response)
print(colored('-------------------', 'red'))
if llm_response == 'DONE':
return True
except TokenLimitError as e:
if max_retry_times >= MAX_COMMAND_DEBUG_TRIES:
print(colored('I can\'t figure this out - sorry! Closing...'))
exit(0)
print(colored(f'\n--------- LLM Reached Token Limit ----------', 'red', attrs=['bold']))
print(colored(f'Can I retry implementing the entire development step?', 'red', attrs=['bold']))
answer = styled_text(
self.project,
'Type y/n'
)
if answer == 'y':
max_retry_times += 1
else:
print(colored('Ok - exiting...', 'red', attrs=['bold']))
exit(0)
return { "success": llm_response == 'DONE', "cli_response": cli_response, "llm_response": llm_response }
def task_postprocessing(self, convo, development_task, continue_development, task_result):
self.run_command = convo.send_message('development/get_run_command.prompt', {})
if self.run_command.startswith('`'):
self.run_command = self.run_command[1:]
@@ -160,22 +135,130 @@ class Developer(Agent):
if development_task is not None:
convo.remove_last_x_messages(2)
detailed_user_review_goal = convo.send_message('development/define_user_review_goal.prompt', {})
convo.remove_last_x_messages(2)
if continue_development:
continue_description = detailed_user_review_goal if detailed_user_review_goal is not None else None
self.continue_development(convo, continue_description)
try:
if continue_development:
continue_description = detailed_user_review_goal if detailed_user_review_goal is not None else None
return self.continue_development(convo, continue_description)
except TooDeepRecursionError as e:
return self.dev_help_needed(e.message)
return task_result
def should_retry_step_implementation(self, step, step_implementation_try):
    """
    Decide how to proceed after implementing a step hit the LLM token limit.

    Args:
        step: The step whose implementation failed; forwarded to
            `dev_help_needed` when the human has to take over.
        step_implementation_try (int): How many retries were already made for
            this step.

    Returns:
        dict: `{ "success": False, "retry": True }` when the user agrees to a
        retry, otherwise the result of `dev_help_needed`.
    """
    if step_implementation_try >= MAX_COMMAND_DEBUG_TRIES:
        # The retry budget is used up: hand the step to the human and stop here
        # instead of falling through to the retry prompt below.
        return self.dev_help_needed(step)

    print(colored('\n--------- LLM Reached Token Limit ----------', 'red', attrs=['bold']))
    print(colored('Can I retry implementing the entire development step?', 'red', attrs=['bold']))

    answer = ''
    while answer != 'y':
        answer = styled_text(
            self.project,
            'Type y/n'
        )

        if answer == 'n':
            return self.dev_help_needed(step)

    return { "success": False, "retry": True }

def dev_help_needed(self, description):
    """
    Ask the human developer to implement something manually and wait until
    they type "continue".

    Args:
        description: What has to be implemented. Either a string or any other
            object (the step dict is passed by some callers); non-string values
            are rendered as indented JSON before being printed.

    Returns:
        dict: `{ "success": True, "user_input": <last answer> }`, where the last
        answer is always "continue" because the prompt repeats until then.
    """
    import json

    # TODO remove this
    def extract_substring(s):
        start_idx = s.find('```')
        end_idx = s.find('```', start_idx + 3)

        if start_idx != -1 and end_idx != -1:
            return s[start_idx + 3:end_idx]
        else:
            return s
    # TODO end

    if not isinstance(description, str):
        # Callers pass the raw step dict here; `.find` only exists on strings.
        description = json.dumps(description, indent=2, default=str)

    answer = ''
    while answer != 'continue':
        print(colored('\n----------------------------- I need your help ------------------------------', 'red', attrs=['bold']))
        print(colored('Please implement the following task', 'red', attrs=['bold']))
        print(colored(extract_substring(description), 'red'))
        print(colored('\n-----------------------------------------------------------------------------', 'red', attrs=['bold']))
        answer = styled_text(
            self.project,
            'Once you\'re done, type "continue"?'
        )

    return { "success": True, "user_input": answer }

def execute_task(self, convo, task_steps, test_command=None, reset_convo=True,
                 test_after_code_changes=True, continue_development=False,
                 development_task=None, is_root_task=False):
    """
    Execute the steps of a task one by one and then run task postprocessing.

    Args:
        convo: Conversation used for all steps; its state on entry is saved as
            a branch so it can be restored.
        task_steps (list): Steps to execute; each has a `type` of `command`,
            `code_change` or `human_intervention`.
        test_command (dict, optional): Command (`command`, `timeout`) checked
            through `step_test` after a step, unless the step sets
            `check_if_fixed` to a falsy value.
        reset_convo (bool): Restore the saved branch before every attempt.
        test_after_code_changes (bool): Forwarded to `step_code_change`.
        continue_development (bool): Forwarded to `task_postprocessing`.
        development_task: Forwarded to `task_postprocessing`.
        is_root_task (bool): When True, `TokenLimitError` and
            `TooDeepRecursionError` are handled here (retry or ask the human);
            otherwise they are re-raised to the caller.

    Returns:
        dict: The `step_test` result as soon as the test command succeeds,
        otherwise the result of `task_postprocessing`.
    """
    function_uuid = str(uuid.uuid4())
    convo.save_branch(function_uuid)

    # Bound before the loop so an empty `task_steps` still reaches postprocessing.
    result = None

    for (i, step) in enumerate(task_steps):
        result = None
        step_implementation_try = 0

        while True:
            try:
                if reset_convo:
                    convo.load_branch(function_uuid)

                if step['type'] == 'command':
                    result = self.step_command_run(convo, step, i)

                elif step['type'] == 'code_change':
                    result = self.step_code_change(convo, step, i, test_after_code_changes)

                elif step['type'] == 'human_intervention':
                    result = self.step_human_intervention(convo, step)

                if test_command is not None and ('check_if_fixed' not in step or step['check_if_fixed']):
                    is_fixed = self.step_test(convo, test_command)
                    if is_fixed['success']:
                        return is_fixed
                    else:
                        result = is_fixed

                break
            except TokenLimitError:
                if not is_root_task:
                    raise

                response = self.should_retry_step_implementation(step, step_implementation_try)
                if 'retry' in response:
                    # Count the retry so the MAX_COMMAND_DEBUG_TRIES cap can actually trigger.
                    step_implementation_try += 1
                    # TODO we can rewind this convo even more
                    convo.load_branch(function_uuid)
                    continue
                elif 'success' in response:
                    result = response
                    break
            except TooDeepRecursionError:
                if not is_root_task:
                    raise

                result = self.dev_help_needed(step)
                break

    convo.load_branch(function_uuid)
    return self.task_postprocessing(convo, development_task, continue_development, result)
def continue_development(self, iteration_convo, continue_description=''):
while True:
user_description = ('Here is a description of what should be working: \n\n' + colored(continue_description, 'blue', attrs=['bold']) + '\n') if continue_description != '' else ''
user_description = 'Can you check if the app works please? ' + user_description + '\nIf you want to run the app, ' + colored('just type "r" and press ENTER and that will run `' + self.run_command + '`', 'yellow', attrs=['bold'])
continue_description = ''
user_feedback = self.project.ask_for_human_intervention(
# continue_description = ''
response = self.project.ask_for_human_intervention(
user_description,
cbs={ 'r': lambda: run_command_until_success(self.run_command, None, iteration_convo, force=True) })
cbs={ 'r': lambda: run_command_until_success(self.run_command, None, iteration_convo, force=True, return_cli_response=True, is_root_task=True) })
user_feedback = response['user_input'] if 'user_input' in response else None
if user_feedback == 'continue':
return True
return { "success": True, "user_input": user_feedback }
if user_feedback is not None:
iteration_convo = AgentConvo(self)
@@ -192,11 +275,12 @@ class Developer(Agent):
"user_input": user_feedback,
})
# debug(iteration_convo, user_input=user_feedback)
# self.debugger.debug(iteration_convo, user_input=user_feedback)
task_steps = iteration_convo.send_message('development/parse_task.prompt', {}, IMPLEMENT_TASK)
iteration_convo.remove_last_x_messages(2)
self.execute_task(iteration_convo, task_steps, continue_development=False)
self.execute_task(iteration_convo, task_steps, is_root_task=True)
def set_up_environment(self):
@@ -274,17 +358,24 @@ class Developer(Agent):
GET_TEST_TYPE)
if test_type == 'command_test':
run_command_until_success(command['command'], command['timeout'], convo)
return run_command_until_success(command['command'], command['timeout'], convo)
elif test_type == 'automated_test':
code_monkey.implement_code_changes(convo, automated_test_description, 0)
# TODO get code monkey to implement the automated test
pass
elif test_type == 'manual_test':
# TODO make the message better
user_feedback = self.project.ask_for_human_intervention(
response = self.project.ask_for_human_intervention(
'Message from Pilot: I need your help. Can you please test if this was successful?',
manual_test_description
)
if user_feedback is not None:
debug(convo, user_input=user_feedback, issue_description=manual_test_description)
user_feedback = response['user_input']
if user_feedback is not None and user_feedback != 'continue':
return_value = self.debugger.debug(convo, user_input=user_feedback, issue_description=manual_test_description)
return_value['user_input'] = user_feedback
return return_value
else:
return { "success": True, "user_input": user_feedback }
def implement_step(self, convo, step_index, type, description):
# TODO remove hardcoded folder path

View File

@@ -10,6 +10,7 @@ import platform
from termcolor import colored
from database.database import get_command_run_from_hash_id, save_command_run
from const.function_calls import DEBUG_STEPS_BREAKDOWN
from helpers.exceptions.TooDeepRecursionError import TooDeepRecursionError
from utils.questionary import styled_text
from const.code_execution import MAX_COMMAND_DEBUG_TRIES, MIN_COMMAND_RUN_TIME, MAX_COMMAND_RUN_TIME, MAX_COMMAND_OUTPUT_LENGTH
@@ -251,7 +252,7 @@ def execute_command_and_check_cli_response(command, timeout, convo):
{ 'cli_response': cli_response, 'command': command })
return cli_response, response
def run_command_until_success(command, timeout, convo, additional_message=None, force=False):
def run_command_until_success(command, timeout, convo, additional_message=None, force=False, return_cli_response=False, is_root_task=False):
"""
Run a command until it succeeds or reaches a timeout.
@@ -271,55 +272,13 @@ def run_command_until_success(command, timeout, convo, additional_message=None,
print(cli_response)
print(colored('-------------------', 'red'))
debug(convo, {'command': command, 'timeout': timeout})
def debug(convo, command=None, user_input=None, issue_description=None):
"""
Debug a conversation.
Args:
convo (AgentConvo): The conversation object.
command (dict, optional): The command to debug. Default is None.
user_input (str, optional): User input for debugging. Default is None.
issue_description (str, optional): Description of the issue to debug. Default is None.
Returns:
bool: True if debugging was successful, False otherwise.
"""
function_uuid = str(uuid.uuid4())
convo.save_branch(function_uuid)
success = False
for i in range(MAX_COMMAND_DEBUG_TRIES):
if success:
break
convo.load_branch(function_uuid)
debugging_plan = convo.send_message('dev_ops/debug.prompt',
{ 'command': command['command'] if command is not None else None, 'user_input': user_input, 'issue_description': issue_description },
DEBUG_STEPS_BREAKDOWN)
# TODO refactor to nicely get the developer agent
success = convo.agent.project.developer.execute_task(
convo,
debugging_plan,
command,
False,
False)
if not success:
# TODO explain better how should the user approach debugging
# we can copy the entire convo to clipboard so they can paste it in the playground
user_input = convo.agent.project.ask_for_human_intervention(
'It seems like I cannot debug this problem by myself. Can you please help me and try debugging it yourself?' if user_input is None else f'Can you check this again:\n{issue_description}?',
command
)
if user_input == 'continue':
success = True
return success
try:
# This catch is necessary to return the correct value (cli_response) to continue development function so
# the developer can debug the appropriate issue
# this snippet represents the first entry point into debugging recursion because of return_cli_response
return convo.agent.debugger.debug(convo, {'command': command, 'timeout': timeout}, is_root_task=is_root_task)
except TooDeepRecursionError as e:
# this is only to put appropriate message in the response after TooDeepRecursionError is raised
raise TooDeepRecursionError(cli_response) if return_cli_response else e
else:
return { 'success': True, 'cli_response': cli_response }

View File

@@ -123,7 +123,7 @@ def create_gpt_chat_completion(messages: List[dict], req_type, min_tokens=MIN_TO
# Check if the error message is related to token limit
if "context_length_exceeded" in error_message.lower():
raise Exception(f'Too many tokens in the request. Please try to continue the project with some previous development step.')
raise TokenLimitError(tokens_in_messages + min_tokens, MAX_GPT_MODEL_TOKENS)
else:
print('The request to OpenAI API failed. Here is the error message:')
print(e)
@@ -153,7 +153,7 @@ def retry_on_exception(func):
# If the specific error "context_length_exceeded" is present, simply return without retry
if "context_length_exceeded" in err_str:
raise Exception("context_length_exceeded")
raise TokenLimitError(tokens_in_messages + min_tokens, MAX_GPT_MODEL_TOKENS)
print(colored(f'There was a problem with request to openai API:', 'red'))
print(err_str)