LeonOstrez
2023-08-09 10:42:53 +02:00
20 changed files with 198 additions and 90 deletions

View File

@@ -26,4 +26,6 @@ IGNORE_FOLDERS = [
'venv',
'dist',
'build',
]
]
PROMPT_DATA_TO_IGNORE = {'directory_tree', 'name'}
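For context, this set is used later in `database.py` to filter prompt data before hashing; a minimal sketch of that filtering (the sample `prompt_data` dict is hypothetical):
```
# Sketch: keys in PROMPT_DATA_TO_IGNORE are dropped before prompt data is
# hashed, so volatile fields like the directory tree don't change the hash.
PROMPT_DATA_TO_IGNORE = {'directory_tree', 'name'}

prompt_data = {'name': 'my_app', 'directory_tree': '|-- src/', 'step': 3}  # hypothetical values
filtered = {k: v for k, v in prompt_data.items() if k not in PROMPT_DATA_TO_IGNORE}
assert filtered == {'step': 3}
```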

View File

@@ -114,7 +114,7 @@ DEV_TASKS_BREAKDOWN = {
},
'command_timeout': {
'type': 'number',
'description': 'Timeout in milliseconds that represents the approximate time the command takes to finish. This should be used only if the task is of a type "command". If you need to run a command that doesn\'t finish by itself (e.g. a command to run an app), set the timeout to 3000 milliseconds.',
'description': 'Timeout in milliseconds that represents the approximate time the command takes to finish. This should be used only if the task is of a type "command". If you need to run a command that doesn\'t finish by itself (e.g. a command to run an app), set the timeout to 3000 milliseconds. Remember, this is not in seconds but in milliseconds, so it likely always needs to be greater than 1000.',
},
'code_change_description': {
'type': 'string',
@@ -254,7 +254,7 @@ CODE_CHANGES = {
},
'command_timeout': {
'type': 'number',
'description': 'Timeout in milliseconds that represents the approximate time the command takes to finish. This should be used only if the task is of a type "command". If you need to run a command that doesn\'t finish by itself (e.g. a command to run an app), set the timeout to 3000 milliseconds.',
'description': 'Timeout in milliseconds that represents the approximate time the command takes to finish. This should be used only if the task is of a type "command". If you need to run a command that doesn\'t finish by itself (e.g. a command to run an app), set the timeout to 3000 milliseconds. Remember, this is not in seconds but in milliseconds, so it likely always needs to be greater than 1000.',
},
'code_change_description': {
'type': 'string',
@@ -373,25 +373,34 @@ GET_FILES = {
IMPLEMENT_CHANGES = {
'definitions': [{
'name': 'save_files',
'description': f'Iterates over the files passed to this function and saves them on the disk.',
'description': 'Iterates over the files passed to this function and saves them on the disk.',
'parameters': {
'type': 'object',
'properties': {
'files': {
'type': 'array',
'description': f'List of files that need to be analyzed to implement the required changes.',
'description': 'List of files that need to be saved.',
'items': {
'type': 'object',
'properties': {
'name': {
'type': 'string',
'description': f'Name of the file that needs to be saved on the disk.',
'description': 'Name of the file that needs to be saved on the disk.',
},
'path': {
'type': 'string',
'description': 'Path of the file that needs to be saved on the disk.',
},
'content': {
'type': 'string',
'description': f'Full content of the file that needs to be saved on the disk.',
}
}
'description': 'Full content of the file that needs to be saved on the disk.',
},
'description': {
'type': 'string',
'description': 'Description of the file that needs to be saved on the disk. This description doesn\'t need to explain what is being done currently in this task but rather the idea behind this file - what we want to put in it in the future. Write the description ONLY if this is the first time this file is being saved. If this file already exists on the disk, leave this field empty.',
},
},
'required': ['name', 'path', 'content'],
}
}
},
@@ -400,7 +409,8 @@ IMPLEMENT_CHANGES = {
}],
'functions': {
'save_files': lambda files: files
}
},
'to_message': lambda files: [f'File `{file["name"]}` saved to the disk and currently looks like this:\n```\n{file["content"]}\n```' for file in files]
}
GET_TEST_TYPE = {
@@ -412,8 +422,8 @@ GET_TEST_TYPE = {
'properties': {
'type': {
'type': 'string',
'description': f'Type of a test that needs to be run.',
'enum': ['automated_test', 'command_test', 'manual_test']
'description': f'Type of a test that needs to be run. If this is just an intermediate step in getting a task done, put `no_test` as the type and we\'ll just go on to the next task without testing.',
'enum': ['automated_test', 'command_test', 'manual_test', 'no_test']
},
'command': {
'type': 'object',
@@ -467,12 +477,19 @@ DEBUG_STEPS_BREAKDOWN = {
'description': 'Type of the step that needs to be done to debug this issue.',
},
'command': {
'type': 'string',
'description': 'Command that needs to be run to complete this step in debugging. This should be used only if the task is of a type "command".',
},
'command_timeout': {
'type': 'number',
'description': 'Timeout in milliseconds that represents the approximate time the command takes to finish. This should be used only if the task is of a type "command". If you need to run a command that doesn\'t finish by itself (e.g. a command to run an app), set the timeout to 3000 milliseconds.',
'type': 'object',
'description': 'Command that needs to be run to debug this issue. This should be used only if the step is of a type "command".',
'properties': {
'command': {
'type': 'string',
'description': 'Command that needs to be run to debug this issue.',
},
'timeout': {
'type': 'number',
'description': 'Timeout in milliseconds that represents the approximate time this command takes to finish. If you need to run a command that doesn\'t finish by itself (e.g. a command to run an app), set the timeout to 3000 milliseconds.',
}
},
'required': ['command', 'timeout'],
},
'code_change_description': {
'type': 'string',
@@ -484,7 +501,7 @@ DEBUG_STEPS_BREAKDOWN = {
},
"check_if_fixed": {
'type': 'boolean',
'description': 'Flag that indicates if the original command that triggered the error that\'s being debugged should be tried after this step to check if the error is fixed. If this step is just one step that can\'t fix the error by itself, then `check_if_fixed` should be FALSE. If this step can fix the error by itself, then `check_if_fixed` should be TRUE.',
'description': 'Flag that indicates if the original command that triggered the error that\'s being debugged should be tried after this step to check if the error is fixed. If you think that the original command `delete node_modules/ && delete package-lock.json` will pass after this step, then this flag should be set to TRUE and if you think that the original command will still fail after this step, then this flag should be set to FALSE.',
}
},
'required': ['type', 'check_if_fixed'],
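To illustrate the new `to_message` hook on `IMPLEMENT_CHANGES`, here is a minimal sketch of how a `save_files` function-call result could be rendered back into a chat message (the `files` list below is a hypothetical example; the import path follows the one used elsewhere in this commit):
```
from const.function_calls import IMPLEMENT_CHANGES

# Hypothetical result of the save_files function call
files = [
    {'name': 'server.js', 'path': './server.js',
     'content': "console.log('hello');", 'description': 'App entry point.'},
]

# 'save_files' just passes the list through; 'to_message' renders each saved
# file as a message that can be fed back into the conversation.
saved = IMPLEMENT_CHANGES['functions']['save_files'](files)
for message in IMPLEMENT_CHANGES['to_message'](saved):
    print(message)
```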

View File

@@ -3,6 +3,7 @@ from peewee import *
from termcolor import colored
from functools import reduce
import operator
from const.common import PROMPT_DATA_TO_IGNORE
from utils.utils import hash_data
from database.models.components.base_models import database
@@ -19,6 +20,7 @@ from database.models.development import Development
from database.models.file_snapshot import FileSnapshot
from database.models.command_runs import CommandRuns
from database.models.user_inputs import UserInputs
from database.models.files import File
def save_user(user_id, email, password):
@@ -152,20 +154,20 @@ def get_progress_steps(app_id, step=None):
return steps
def save_development_step(app_id, prompt_path, prompt_data, llm_req_num, messages, response):
def save_development_step(app_id, prompt_path, prompt_data, llm_req_num, messages, response, previous_step=None):
app = get_app(app_id)
hash_id = hash_data({
'prompt_path': prompt_path,
'prompt_data': {k: v for k, v in (prompt_data.items() if prompt_data is not None else {}) if k not in {"directory_tree"}},
'prompt_data': {k: v for k, v in (prompt_data.items() if prompt_data is not None else {}) if k not in PROMPT_DATA_TO_IGNORE},
'llm_req_num': llm_req_num
})
try:
inserted_id = (DevelopmentSteps
.insert(app=app, hash_id=hash_id, messages=messages, llm_response=response)
.on_conflict(conflict_target=[DevelopmentSteps.app, DevelopmentSteps.hash_id],
preserve=[DevelopmentSteps.messages, DevelopmentSteps.llm_response],
update={})
.execute())
.insert(app=app, hash_id=hash_id, messages=messages, llm_response=response, previous_dev_step=previous_step)
.on_conflict(conflict_target=[DevelopmentSteps.app, DevelopmentSteps.hash_id],
preserve=[DevelopmentSteps.messages, DevelopmentSteps.llm_response],
update={})
.execute())
dev_step = DevelopmentSteps.get_by_id(inserted_id)
print(colored(f"Saved DEV step => {dev_step.id}", "yellow"))
@@ -199,8 +201,8 @@ def hash_and_save_step(Model, app_id, hash_data_args, data_fields, message):
inserted_id = (Model
.insert(**data_to_insert)
.on_conflict(conflict_target=[Model.app, Model.hash_id],
preserve=[field for field in data_fields.keys()],
update={})
preserve=[],
update=data_fields)
.execute())
record = Model.get_by_id(inserted_id)
@@ -219,8 +221,11 @@ def save_command_run(project, command, cli_response):
data_fields = {
'command': command,
'cli_response': cli_response,
'previous_command_run': project.checkpoints['last_command_run'],
}
return hash_and_save_step(CommandRuns, project.args['app_id'], hash_data_args, data_fields, "Saved Command Run")
command_run = hash_and_save_step(CommandRuns, project.args['app_id'], hash_data_args, data_fields, "Saved Command Run")
project.checkpoints['last_command_run'] = command_run
return command_run
def get_command_run_from_hash_id(project, command):
@@ -228,7 +233,8 @@ def get_command_run_from_hash_id(project, command):
'command': command,
'command_runs_count': project.command_runs_count
}
return get_db_model_from_hash_id(data_to_hash, CommandRuns, project.args['app_id'])
command_run = get_db_model_from_hash_id(data_to_hash, CommandRuns, project.args['app_id'])
return command_run
def save_user_input(project, query, user_input):
hash_data_args = {
@@ -238,22 +244,26 @@ def save_user_input(project, query, user_input):
data_fields = {
'query': query,
'user_input': user_input,
'previous_user_input': project.checkpoints['last_user_input'],
}
return hash_and_save_step(UserInputs, project.args['app_id'], hash_data_args, data_fields, "Saved User Input")
user_input = hash_and_save_step(UserInputs, project.args['app_id'], hash_data_args, data_fields, "Saved User Input")
project.checkpoints['last_user_input'] = user_input
return user_input
def get_user_input_from_hash_id(project, query):
data_to_hash = {
'query': query,
'user_inputs_count': project.user_inputs_count
}
return get_db_model_from_hash_id(data_to_hash, UserInputs, project.args['app_id'])
user_input = get_db_model_from_hash_id(data_to_hash, UserInputs, project.args['app_id'])
return user_input
def get_development_step_from_hash_id(app_id, prompt_path, prompt_data, llm_req_num):
if prompt_data is None:
prompt_data_dict = {}
else:
prompt_data_dict = {k: v for k, v in prompt_data.items() if k not in {"directory_tree"}}
prompt_data_dict = {k: v for k, v in prompt_data.items() if k not in PROMPT_DATA_TO_IGNORE}
hash_id = hash_data({
'prompt_path': prompt_path,
@@ -285,6 +295,7 @@ def create_tables():
FileSnapshot,
CommandRuns,
UserInputs,
File,
])
@@ -304,6 +315,7 @@ def drop_tables():
FileSnapshot,
CommandRuns,
UserInputs,
File,
]:
database.execute_sql(f'DROP TABLE IF EXISTS "{table._meta.table_name}" CASCADE')
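A sketch of the upsert behaviour `hash_and_save_step` has after swapping `preserve=[...]` for `update=data_fields` (the helper name `upsert_command_run` and its arguments are illustrative; the model import matches the one added above):
```
from database.models.command_runs import CommandRuns

def upsert_command_run(app, hash_id, data_fields):
    # On a (app, hash_id) conflict the existing row is now overwritten with
    # the freshly computed data_fields via update=..., instead of relying on
    # preserve=[...] to carry columns over from the attempted insert.
    inserted_id = (CommandRuns
                   .insert(app=app, hash_id=hash_id, **data_fields)
                   .on_conflict(conflict_target=[CommandRuns.app, CommandRuns.hash_id],
                                preserve=[],
                                update=data_fields)
                   .execute())
    return CommandRuns.get_by_id(inserted_id)
```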

View File

@@ -10,6 +10,7 @@ class CommandRuns(BaseModel):
hash_id = CharField(null=False)
command = TextField(null=True)
cli_response = TextField(null=True)
previous_command_run = ForeignKeyField('self', null=True, column_name='previous_command_run')
class Meta:
db_table = 'command_runs'
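The new self-referential `previous_command_run` column effectively turns command runs into a linked list; a small sketch of walking it back into chronological order (the helper name is hypothetical):
```
def command_run_chain(last_run):
    # Follow previous_command_run links backwards, then reverse so the
    # oldest run comes first.
    chain = []
    run = last_run
    while run is not None:
        chain.append(run)
        run = run.previous_command_run
    return list(reversed(chain))
```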

View File

@@ -12,6 +12,7 @@ class DevelopmentSteps(BaseModel):
hash_id = CharField(null=False)
messages = BinaryJSONField(null=True)
llm_response = BinaryJSONField(null=False)
previous_dev_step = ForeignKeyField('self', null=True, column_name='previous_dev_step')
class Meta:
db_table = 'development_steps'

View File

@@ -0,0 +1,16 @@
from peewee import *
from database.models.components.base_models import BaseModel
from database.models.development_steps import DevelopmentSteps
from database.models.app import App
class File(BaseModel):
app = ForeignKeyField(App)
name = CharField()
path = CharField()
description = TextField()
class Meta:
indexes = (
(('app', 'name', 'path'), True),
)
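As used later by `Project.get_directory_tree(with_descriptions=True)`, the stored descriptions can be looked up per app and keyed by file name; a minimal sketch (the helper name is illustrative, and the query mirrors the one in `Project.py` below):
```
from database.models.files import File

def file_descriptions_for_app(app_id):
    # One row per (app, name, path) thanks to the unique index, so a
    # name-keyed dict is enough for annotating the directory tree output.
    files = File.select().where(File.app_id == app_id)
    return {file.name: file.description for file in files}
```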

View File

@@ -10,6 +10,7 @@ class UserInputs(BaseModel):
hash_id = CharField(null=False)
query = TextField(null=True)
user_input = TextField(null=True)
previous_user_input = ForeignKeyField('self', null=True, column_name='previous_user_input')
class Meta:
db_table = 'user_inputs'

View File

@@ -36,13 +36,15 @@ class AgentConvo:
if self.agent.project.skip_until_dev_step and str(development_step.id) == self.agent.project.skip_until_dev_step:
self.agent.project.skip_steps = False
print(colored(f'Restoring development step with id {development_step.id}', 'yellow'))
self.agent.project.checkpoints['last_development_step'] = development_step
self.agent.project.restore_files(development_step.id)
response = development_step.llm_response
self.messages = development_step.messages
else:
# if we don't, get the response from LLM
response = create_gpt_chat_completion(self.messages, self.high_level_step, function_calls=function_calls)
development_step = save_development_step(self.agent.project.args['app_id'], prompt_path, prompt_data, self.agent.project.llm_req_num, self.messages, response)
development_step = save_development_step(self.agent.project.args['app_id'], prompt_path, prompt_data, self.agent.project.llm_req_num, self.messages, response, self.agent.project.checkpoints['last_development_step'])
self.agent.project.checkpoints['last_development_step'] = development_step
self.agent.project.save_files_snapshot(development_step.id)
# TODO handle errors from OpenAI
@@ -54,7 +56,9 @@ class AgentConvo:
# TODO remove this once the database is set up properly
message_content = response[0] if type(response) == tuple else response
if isinstance(message_content, list):
if len(message_content) > 0 and isinstance(message_content[0], dict):
if 'to_message' in function_calls:
string_response = function_calls['to_message'](message_content)
elif len(message_content) > 0 and isinstance(message_content[0], dict):
string_response = [
f'#{i}\n' + array_of_objects_to_string(d)
for i, d in enumerate(message_content)

View File

@@ -2,6 +2,8 @@ import os
from termcolor import colored
from const.common import IGNORE_FOLDERS
from database.models.app import App
from database.database import get_app
from utils.questionary import styled_text
from helpers.files import get_files_content, clear_directory
from helpers.cli import build_directory_tree
@@ -12,6 +14,7 @@ from helpers.agents.ProductOwner import ProductOwner
from database.models.development_steps import DevelopmentSteps
from database.models.file_snapshot import FileSnapshot
from database.models.files import File
from utils.files import get_parent_folder
@@ -22,6 +25,11 @@ class Project:
self.llm_req_num = 0
self.command_runs_count = 0
self.user_inputs_count = 0
self.checkpoints = {
'last_user_input': None,
'last_command_run': None,
'last_development_step': None,
}
self.skip_steps = False if ('skip_until_dev_step' in args and args['skip_until_dev_step'] == '0') else True
self.skip_until_dev_step = args['skip_until_dev_step'] if 'skip_until_dev_step' in args else None
# TODO make flexible
@@ -29,6 +37,9 @@ class Project:
self.root_path = ''
# self.restore_files({dev_step_id_to_start_from})
if 'app_id' in args:
self.app = get_app(args['app_id'])
if current_step is not None:
self.current_step = current_step
if name is not None:
@@ -61,8 +72,12 @@ class Project:
self.developer.start_coding()
def get_directory_tree(self):
return build_directory_tree(self.root_path + '/', ignore=IGNORE_FOLDERS)
def get_directory_tree(self, with_descriptions=False):
files = {}
if with_descriptions:
files = File.select().where(File.app_id == self.args['app_id'])
files = {snapshot.name: snapshot for snapshot in files}
return build_directory_tree(self.root_path + '/', ignore=IGNORE_FOLDERS, files=files, add_descriptions=True)
def get_test_directory_tree(self):
# TODO remove hardcoded path
@@ -73,12 +88,15 @@ class Project:
for file in files:
files_with_content.append({
"path": file,
"content": open(self.get_full_file_path(file), 'r').read()
"content": open(self.get_full_file_path('', file), 'r').read()
})
return files_with_content
def get_full_file_path(self, file_name):
return self.root_path + '/' + file_name
def get_full_file_path(self, file_path, file_name):
file_path = file_path.replace('./', '', 1).rstrip(file_name)
if not file_path.endswith('/'):
file_path = file_path + '/'
return self.root_path + file_path + file_name
def save_files_snapshot(self, development_step_id):
files = get_files_content(self.root_path, ignore=IGNORE_FOLDERS)
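A worked example of the new two-argument `get_full_file_path` (values are illustrative; note that `rstrip(file_name)` strips trailing characters that occur in `file_name` rather than the literal suffix, which works for inputs like these):
```
root_path = '/workspace/my_app'   # illustrative project root

# Case 1: file at the project root
file_path, file_name = './server.js', 'server.js'
file_path = file_path.replace('./', '', 1).rstrip(file_name)   # -> ''
if not file_path.endswith('/'):
    file_path = file_path + '/'
print(root_path + file_path + file_name)   # /workspace/my_app/server.js

# Case 2: nested file given with a leading slash
file_path, file_name = '/routes/index.js', 'index.js'
file_path = file_path.replace('./', '', 1).rstrip(file_name)   # -> '/routes/'
if not file_path.endswith('/'):
    file_path = file_path + '/'
print(root_path + file_path + file_name)   # /workspace/my_app/routes/index.js
```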

View File

@@ -1,7 +1,6 @@
from const.function_calls import GET_FILES, DEV_STEPS, IMPLEMENT_CHANGES, CODE_CHANGES
from database.models.files import File
from helpers.files import update_file
from helpers.cli import run_command_until_success
from helpers.cli import build_directory_tree
from helpers.AgentConvo import AgentConvo
from helpers.Agent import Agent
@@ -10,13 +9,13 @@ class CodeMonkey(Agent):
super().__init__('code_monkey', project)
self.developer = developer
def implement_code_changes(self, convo, code_changes_description, step_index):
def implement_code_changes(self, convo, code_changes_description, step_index=0):
if convo == None:
convo = AgentConvo(self)
files_needed = convo.send_message('development/task/request_files_for_code_changes.prompt', {
"step_description": code_changes_description,
"directory_tree": self.project.get_directory_tree(),
"directory_tree": self.project.get_directory_tree(True),
"step_index": step_index,
"finished_steps": ', '.join(f"#{j}" for j in range(step_index))
}, GET_FILES)
@@ -24,11 +23,21 @@ class CodeMonkey(Agent):
changes = convo.send_message('development/implement_changes.prompt', {
"instructions": code_changes_description,
"directory_tree": self.project.get_directory_tree(),
"directory_tree": self.project.get_directory_tree(True),
"files": self.project.get_files(files_needed),
}, IMPLEMENT_CHANGES)
}, IMPLEMENT_CHANGES, True)
for file_data in changes:
update_file(self.project.get_full_file_path(file_data['name']), file_data['content'])
file_data['full_path'] = self.project.get_full_file_path(file_data['path'], file_data['name'])
if file_data['description'] != '':
(File.insert(app=self.project.app, path=file_data['path'], name=file_data['name'], description=file_data['description'])
.on_conflict(
conflict_target=[File.app, File.name, File.path],
preserve=[],
update={'description': file_data['description']})
.execute())
update_file(file_data['full_path'], file_data['content'])
return convo

View File

@@ -1,4 +1,5 @@
import json
import uuid
from termcolor import colored
from utils.utils import step_already_finished
from helpers.agents.CodeMonkey import CodeMonkey
@@ -33,8 +34,9 @@ class Developer(Agent):
def implement_task(self, sibling_tasks, current_task_index, parent_task=None):
print(colored('-------------------------', 'green'))
print(colored(f"Implementing task {current_task_index + 1}...\n", "green"))
print(sibling_tasks[current_task_index]['description'])
print(colored(sibling_tasks[current_task_index]['description'], 'green'))
print(colored('-------------------------', 'green'))
convo_dev_task = AgentConvo(self)
task_steps = convo_dev_task.send_message('development/task/breakdown.prompt', {
"name": self.project.args['name'],
@@ -44,7 +46,7 @@ class Developer(Agent):
"user_tasks": self.project.user_tasks,
"technologies": self.project.architecture,
"array_of_objects_to_string": array_of_objects_to_string,
"directory_tree": self.project.get_directory_tree(),
"directory_tree": self.project.get_directory_tree(True),
"current_task_index": current_task_index,
"sibling_tasks": sibling_tasks,
"parent_task": parent_task,
@@ -52,32 +54,40 @@ class Developer(Agent):
self.execute_task(convo_dev_task, task_steps)
def execute_task(self, convo, task_steps, test_command=None, reset_convo=True):
convo.save_branch('after_task_breakdown')
def execute_task(self, convo, task_steps, test_command=None, reset_convo=True, test_after_code_changes=True):
function_uuid = str(uuid.uuid4())
convo.save_branch(function_uuid)
for (i, step) in enumerate(task_steps):
if reset_convo:
convo.load_branch('after_task_breakdown')
convo.load_branch(function_uuid)
if step['type'] == 'command':
run_command_until_success(step['command'], step['command_timeout'], convo)
run_command_until_success(step['command']['command'], step['command']['timeout'], convo)
elif step['type'] == 'code_change':
print(f'Implementing code changes for `{step["code_change_description"]}`')
code_monkey = CodeMonkey(self.project, self)
updated_convo = code_monkey.implement_code_changes(convo, step['code_change_description'], i)
self.test_code_changes(code_monkey, updated_convo)
if test_after_code_changes:
self.test_code_changes(code_monkey, updated_convo)
elif step['type'] == 'human_intervention':
self.project.ask_for_human_intervention('I need your help! Can you try debugging this yourself and let me take over afterwards? Here are the details about the issue:', step['human_intervention_description'])
else:
raise Exception('Step type must be either run_command or code_change.')
if test_command is not None and step['check_if_fixed']:
response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo)
if response == 'DONE':
should_rerun_command = convo.send_message('dev_ops/should_rerun_command.prompt',
test_command)
if should_rerun_command == 'NO':
return True
elif should_rerun_command == 'YES':
response = execute_command_and_check_cli_response(test_command['command'], test_command['timeout'], convo)
if response == 'NEEDS_DEBUGGING':
print(colored(f'Got incorrect CLI response:', 'red'))
print(response)
print(colored('-------------------', 'red'))
if response == 'DONE':
return True
def set_up_environment(self):
self.project.current_step = 'environment_setup'
@@ -140,7 +150,10 @@ class Developer(Agent):
# ENVIRONMENT SETUP END
def test_code_changes(self, code_monkey, convo):
(test_type, command, automated_test_description, manual_test_description) = convo.send_message('development/task/step_check.prompt', {}, GET_TEST_TYPE)
(test_type, command, automated_test_description, manual_test_description) = convo.send_message(
'development/task/step_check.prompt',
{},
GET_TEST_TYPE)
if test_type == 'command_test':
run_command_until_success(command['command'], command['timeout'], convo)
@@ -148,14 +161,16 @@ class Developer(Agent):
code_monkey.implement_code_changes(convo, automated_test_description, 0)
elif test_type == 'manual_test':
# TODO make the message better
self.project.ask_for_human_intervention(
response = self.project.ask_for_human_intervention(
'Message from Euclid: I need your help. Can you please test if this was successful?',
manual_test_description
)
if response is not None and response != 'DONE':
self.test_code_changes(code_monkey, convo)
def implement_step(self, convo, step_index, type, description):
# TODO remove hardcoded folder path
directory_tree = self.project.get_directory_tree()
directory_tree = self.project.get_directory_tree(True)
step_details = convo.send_message('development/task/next_step.prompt', {
'finished_steps': [],
'step_description': description,
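One detail of the `execute_task` change above worth spelling out: the branch snapshot name moved from a fixed string to a per-call UUID so that nested calls (e.g. when `run_command_until_success` re-enters `execute_task` with a debugging plan) don't overwrite each other's snapshot. A minimal sketch of the pattern, assuming `save_branch`/`load_branch` snapshot and restore the conversation messages:
```
import uuid

def execute_task_sketch(convo, task_steps):
    function_uuid = str(uuid.uuid4())   # unique per invocation
    convo.save_branch(function_uuid)
    for step in task_steps:
        convo.load_branch(function_uuid)  # reset the conversation before each step
        # ... run the step; a nested execute_task call uses its own UUID,
        # so this snapshot stays intact.
```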

View File

@@ -4,6 +4,7 @@ import signal
import threading
import queue
import time
import uuid
from termcolor import colored
from database.database import get_command_run_from_hash_id, save_command_run
@@ -39,12 +40,15 @@ def run_command(command, root_path, q_stdout, q_stderr, pid_container):
def execute_command(project, command, timeout=5000):
# check if we already have the command run saved
if timeout < 1000:
timeout *= 1000
timeout = max(timeout, 2000)
print(colored(f'Can I execute the command: `{command}` with {timeout}ms timeout?', 'white', attrs=['bold']))
project.command_runs_count += 1
command_run = get_command_run_from_hash_id(project, command)
if command_run is not None and project.skip_steps:
# if we do, use it
project.checkpoints['last_command_run'] = command_run
print(colored(f'Restoring command run response id {command_run.id}:\n```\n{command_run.cli_response}```', 'yellow'))
return command_run.cli_response
@@ -67,6 +71,7 @@ def execute_command(project, command, timeout=5000):
while True and return_value is None:
elapsed_time = time.time() - start_time
print(colored(f'\rt: {round(elapsed_time * 1000)}ms', 'white', attrs=['bold']), end='', flush=True)
# Check if process has finished
if process.poll() is not None:
# Get remaining lines from the queue
@@ -106,7 +111,7 @@ def execute_command(project, command, timeout=5000):
return return_value
def build_directory_tree(path, prefix="", ignore=None, is_last=False):
def build_directory_tree(path, prefix="", ignore=None, is_last=False, files=None, add_descriptions=False):
"""Build the directory tree structure in tree-like format.
Args:
@@ -129,17 +134,17 @@ def build_directory_tree(path, prefix="", ignore=None, is_last=False):
if os.path.isdir(path):
# It's a directory, add its name to the output and then recurse into it
output += prefix + "|-- " + os.path.basename(path) + "/\n"
output += prefix + "|-- " + os.path.basename(path) + ((' - ' + files[os.path.basename(path)].description + ' ' if files and os.path.basename(path) in files and add_descriptions else '')) + "/\n"
# List items in the directory
items = os.listdir(path)
for index, item in enumerate(items):
item_path = os.path.join(path, item)
output += build_directory_tree(item_path, prefix + indent, ignore, index == len(items) - 1)
output += build_directory_tree(item_path, prefix + indent, ignore, index == len(items) - 1, files, add_descriptions)
else:
# It's a file, add its name to the output
output += prefix + "|-- " + os.path.basename(path) + "\n"
output += prefix + "|-- " + os.path.basename(path) + ((' - ' + files[os.path.basename(path)].description + ' ' if files and os.path.basename(path) in files and add_descriptions else '')) + "\n"
return output
@@ -150,16 +155,18 @@ def execute_command_and_check_cli_response(command, timeout, convo):
return response
def run_command_until_success(command, timeout, convo):
command_executed = False
for i in range(MAX_COMMAND_DEBUG_TRIES):
cli_response = execute_command(convo.agent.project, command, timeout)
response = convo.send_message('dev_ops/ran_command.prompt',
{'cli_response': cli_response, 'command': command})
cli_response = execute_command(convo.agent.project, command, timeout)
response = convo.send_message('dev_ops/ran_command.prompt',
{'cli_response': cli_response, 'command': command})
command_successfully_executed = response == 'DONE'
command_executed = response == 'DONE'
if command_executed:
function_uuid = str(uuid.uuid4())
convo.save_branch(function_uuid)
for i in range(MAX_COMMAND_DEBUG_TRIES):
if command_successfully_executed:
break
convo.load_branch(function_uuid)
print(colored(f'Got incorrect CLI response:', 'red'))
print(cli_response)
print(colored('-------------------', 'red'))
@@ -168,13 +175,15 @@ def run_command_until_success(command, timeout, convo):
DEBUG_STEPS_BREAKDOWN)
# TODO refactor to nicely get the developer agent
convo.agent.project.developer.execute_task(
command_successfully_executed = convo.agent.project.developer.execute_task(
convo,
debugging_plan,
{'command': command, 'timeout': timeout},
False,
False)
if not command_executed:
if not command_successfully_executed:
# TODO explain better how should the user approach debugging
# we can copy the entire convo to clipboard so they can paste it in the playground
convo.agent.project.ask_for_human_intervention(
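The timeout normalization added at the top of `execute_command` can be summarized with a few worked values (a sketch that mirrors the guard added above; the wrapper function name is just for illustration):
```
def normalize_timeout(timeout):
    # Mirrors the new guard in execute_command: values that look like seconds
    # are scaled to milliseconds, then clamped to a 2000 ms floor.
    if timeout < 1000:
        timeout *= 1000
    return max(timeout, 2000)

assert normalize_timeout(3) == 3000       # likely passed in seconds, scaled up
assert normalize_timeout(500) == 500000   # anything under 1000 gets scaled, too
assert normalize_timeout(1500) == 2000    # already ms, raised to the 2000 ms floor
assert normalize_timeout(5000) == 5000    # left untouched
```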

View File

@@ -1,9 +1,6 @@
from termcolor import colored
import os
from database.models.development_steps import DevelopmentSteps
from database.models.file_snapshot import FileSnapshot
def update_file(path, new_content):
# Ensure the directory exists; if not, create it

View File

@@ -1,5 +1,7 @@
{% if debugging_try_num == 0 %}
Ok, we need to debug this issue so we can execute `{{ command }}` successfully. In case you cannot debug this by running any command and need human assistance, respond with `NEED_HUMAN`. Write a step-by-step explanation of what needs to be done to debug this issue
{% else %}
I've tried all you suggested but it's still not working. Can you suggest other things I can try to debug this issue?
{% endif %}
Ok, we need to debug this issue so we can execute `{{ command }}` successfully. I want you to debug this issue by yourself and I will give you 2 functions that you can use - `run_command` and `implement_code_changes`.
`run_command` function will run a command on the machine and will return the CLI output to you so you can see what to do next.
`implement_code_changes` function will change the code; you just need to thoroughly describe what needs to be implemented, and I will implement the requested changes and let you know.
After this, you need to decide what to do next. You can rerun the command `{{ command }}` to check if the problem is fixed, run another command with `run_command`, or change more code with `implement_code_changes`.

View File

@@ -0,0 +1 @@
Should I rerun the command `{{ command }}` or is this task done? If I should rerun `{{ command }}`, respond only with YES. If I don't need to rerun the command but should continue fixing the problem, respond with NEEDS_DEBUGGING, and if I don't need to rerun the command and the original problem is fixed, respond with NO.

View File

@@ -67,6 +67,11 @@ You can get the list of files by calling `get_files` function.
{% else %}
#}
Here are all the files that have been written so far, in a file tree format:
```
{{ directory_tree }}
```
First, just make a list of steps we need to do to fulfill this task. It should be in a JSON array. Every step must NOT contain both a command that needs to be run and the code that needs to be changed. It can be either a command (or multiple commands) that needs to be run or a change in the code.
{#
Each step must start with a keyword `command` in case the step consists of commands that need to be run or `code_change` in case it consists of changes in the code. After the keyword, write a description of what will be done in that step. Do not write what needs to be done for each step but only list them in an array.

View File

@@ -1,9 +1,7 @@
{% if step_index != 0 %}
So far, steps {{ finished_steps }} are finished so let's do
{% else %}
Let's start with the
{% endif %}
step #{{ step_index }}:
Let's start with the{% endif %} step #{{ step_index }}:
```
{{ step_description }}
```

View File

@@ -1,8 +1,8 @@
Now, we need to verify if this change was successfully implemented. We can do that in 3 ways:
1. By writing an automated test or by running a previously written test - this is the preferred way since we can then test if this functionality works in the future. You write automated tests in Jest and you always try finding a way to test a functionality with an automated test. Even if changes seem visual or UI-based, try to find a way to validate them using an automated test, such as verifying HTTP responses or elements rendered on the page. If you choose this type of test, make sure that you describe it in as much detail as needed so that anyone who looks at this test knows exactly what needs to be done to implement this automated test.
1. By writing an automated test or by running a previously written test - you write automated tests in Jest and you always try finding a way to test a functionality with an automated test. Even if changes seem visual or UI-based, try to find a way to validate them using an automated test, such as verifying HTTP responses or elements rendered on the page. If you choose this type of test, make sure that you describe it in as much detail as needed so that anyone who looks at this test knows exactly what needs to be done to implement this automated test.
2. By running a command (or multiple commands) - this is good for when an automated test is overkill. For example, if we installed a new package or changed some configuration. Keep in mind that in this case, there shouldn't be any human intervention needed - I will run the commands you give me and show you the CLI output, and from that you should be able to determine if the test passed or failed.
3. By requesting that a human checks if everything works as expected - this is the last option that we want to avoid, but if we can't test the functionality programmatically, we should ask a human to check if it works as expected. For example, if something was visually changed in the UI.
3. By requesting that a human checks if everything works as expected - this is the last option that we want to avoid, but if we can't test the functionality programmatically, we should ask a human to check if it works as expected. For example, if something was visually changed in the UI. If you have any option to test the code change with an automated test or a command, you should always do it. A manual test is the last resort and should be avoided if possible.
Ok, now tell me how we can verify whether this change was successful, and respond only with a keyword for the type of test.

View File

@@ -106,8 +106,7 @@ def create_gpt_chat_completion(messages: List[dict], req_type, min_tokens=MIN_TO
return response
except Exception as e:
print(
'The request to OpenAI API failed. Might be due to GPT being down or the message being too large. It\'s '
'best if you try again.')
'The request to OpenAI API failed. Here is the error message:')
print(e)

View File

@@ -23,6 +23,7 @@ def styled_text(project, question):
user_input = get_user_input_from_hash_id(project, question)
if user_input is not None and project.skip_steps:
# if we do, use it
project.checkpoints['last_user_input'] = user_input
print(colored(f'Restoring user input id {user_input.id}: ', 'yellow'), end='')
print(colored(f'{user_input.user_input}', 'yellow', attrs=['bold']))
return user_input.user_input