From 6c68b798295d9a0d3c2fcd46c22af8da8e9d130c Mon Sep 17 00:00:00 2001
From: Hec
Date: Mon, 7 Jul 2025 22:10:19 +1000
Subject: [PATCH] make it work

---
 .gitignore                              |   6 +-
 DOCKER_LOGGING_FIX.md                   | 102 ++++++++++++++
 FILE_LOCATION_DEBUG.md                  | 129 ++++++++++++++++++
 app/agents/agents_manager.py            |  28 +++-
 .../test_analysis_agent/docker_utils.py |   7 +
 .../test_analysis_agent.py              |  34 ++++-
 app/globals.py                          |  14 ++
 app/main.py                             |  78 ++++++++++-
 app/model/common.py                     |  70 ++++++++--
 app/task.py                             |  13 ++
 debug_file_locations.py                 |  97 +++++++++++++
 evaluation/docker_build.py              |  14 ++
 requirements.txt                        |   2 +-
 test_results_path_fix.py                | 117 ++++++++++++++++
 tests/test_docker_name_fix.py           |  44 ++++++
 15 files changed, 726 insertions(+), 29 deletions(-)
 create mode 100644 DOCKER_LOGGING_FIX.md
 create mode 100644 FILE_LOCATION_DEBUG.md
 create mode 100644 debug_file_locations.py
 create mode 100644 test_results_path_fix.py
 create mode 100644 tests/test_docker_name_fix.py

diff --git a/.gitignore b/.gitignore
index f0011289f..e921ca410 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,9 +162,13 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+input/
 evaluation/reports/
 output/
 # *data_collection/collect/*.sh
 *data_collection/collect/temp
 *evaluation/temp
-temp/
\ No newline at end of file
+temp/
+
+
+ignore.*
\ No newline at end of file
diff --git a/DOCKER_LOGGING_FIX.md b/DOCKER_LOGGING_FIX.md
new file mode 100644
index 000000000..45c5f4746
--- /dev/null
+++ b/DOCKER_LOGGING_FIX.md
@@ -0,0 +1,102 @@
+# Docker Logging Configuration Fix
+
+## Issue Description
+
+The Docker containers created by the SWE Factory tool were using a logging configuration that disabled container logs:
+
+```json
+"ContainerIDFile": "",
+"LogConfig": {
+    "Type": "none",
+    "Config": {}
+}
+```
+
+This configuration meant:
+- No container logs were being captured
+- `docker logs <container_name>` would not work
+- Debugging container issues was difficult
+- The tool couldn't capture container output for analysis
+
+## Root Cause
+
+The issue was in the container creation code in two files:
+1. `app/agents/test_analysis_agent/docker_utils.py` - `build_container()` function
+2. `evaluation/docker_build.py` - `build_container()` and `build_setup_container()` functions
+
+These functions were creating containers without explicit logging configuration, causing Docker to use default settings that might be set to `"none"` in some environments.
+
+## Solution Applied
+
+I've updated all container creation calls to include proper logging configuration:
+
+### Before:
+```python
+container = client.containers.create(
+    image=test_image_name,
+    name=test_container_name,
+    user="root",
+    detach=True,
+    command="tail -f /dev/null",
+    nano_cpus=None,
+    platform="linux/x86_64",
+)
+```
+
+### After:
+```python
+container = client.containers.create(
+    image=test_image_name,
+    name=test_container_name,
+    user="root",
+    detach=True,
+    command="tail -f /dev/null",
+    nano_cpus=None,
+    platform="linux/x86_64",
+    log_config={
+        "Type": "json-file",
+        "Config": {
+            "max-size": "10m",
+            "max-file": "3"
+        }
+    }
+)
+```
+
+## Benefits of the Fix
+
+1. **Container Logs Available**: You can now use `docker logs <container_name>` to view container output
+2. **Better Debugging**: Container issues can be diagnosed more easily
+3. **Log Rotation**: Logs are automatically rotated when they reach 10MB
+4. **Storage Management**: Only 3 log files are kept per container
+5. **Tool Functionality**: The SWE Factory tool can now capture and analyze container output
+
+## Files Modified
+
+1. `app/agents/test_analysis_agent/docker_utils.py` - Lines 330-340
+2. `evaluation/docker_build.py` - Lines 590-600 and 650-660
+
+## Testing the Fix
+
+After applying this fix, you should be able to:
+
+1. Run the SWE Factory tool as usual
+2. Use `docker logs <container_name>` to view container logs
+3. See container output in the tool's log files
+4. Debug container issues more effectively
+
+## Recommended Docker Configuration
+
+For optimal performance, ensure your Docker daemon is configured with:
+
+```json
+{
+  "log-driver": "json-file",
+  "log-opts": {
+    "max-size": "10m",
+    "max-file": "3"
+  }
+}
+```
+
+This ensures consistent logging behavior across all containers, even those created without explicit logging configuration.
\ No newline at end of file
diff --git a/FILE_LOCATION_DEBUG.md b/FILE_LOCATION_DEBUG.md
new file mode 100644
index 000000000..63aff81d4
--- /dev/null
+++ b/FILE_LOCATION_DEBUG.md
@@ -0,0 +1,129 @@
+# File Location Debug Guide
+
+## Issue Description
+
+When running the SWE Factory tool, files are being created at the repository root instead of in the specified output directories, despite providing the correct `--output-dir`, `--setup-dir`, and `--results-path` parameters.
+
+## Root Cause Analysis
+
+After analyzing the codebase, I found that the system is correctly designed to use the specified output directories. All file writing operations use proper path construction with `pjoin()` or `os.path.join()` to ensure files are written to the correct output directories.
+
+However, there are several potential causes for files appearing in the wrong location:
+
+### 1. Working Directory Changes
+The code uses `cd` context managers in several places (such as in the `dump_cost` function), which temporarily change the working directory. If any file operations happen outside of these context managers while the directory is changed, they might write to the wrong location.
+
+### 2. Race Conditions
+The system uses multiprocessing, and there might be race conditions where the working directory is changed in one process while another process is writing files.
+
+### 3. Missing Absolute Paths
+Some file operations might not be using absolute paths, causing them to write relative to the current working directory.
+
+## Changes Made
+
+I've made the following improvements to ensure files are written to the correct locations:
+
+### 1. Added Absolute Path Safety Checks
+- Modified `run_raw_task()` to ensure `task_output_dir` is absolute
+- Modified `do_inference()` to ensure `task_output_dir` is absolute
+- Modified `dump_cost()` to ensure `task_output_dir` is absolute
+
+### 2. Added Debug Logging
+- Added logging in `AgentsManager` to track where Dockerfile, eval.sh, and status.json are written
+- Added logging in `TestAnalysisAgent` to track where Dockerfile and eval.sh are written
+
+### 3. Created Debug Script
+- Created `debug_file_locations.py` to monitor file creation during execution
+
+## How to Debug the Issue
+
+### Step 1: Run with Debug Logging
+The enhanced logging will now show exactly where files are being written. Look for log messages like:
+```
+Writing Dockerfile to: /path/to/output/dir/Dockerfile
+Writing eval.sh to: /path/to/output/dir/eval.sh
+Writing status.json to: /path/to/output/dir/status.json
+```
+
+### Step 2: Use the Debug Script
+Run the debug script in a separate terminal to monitor file creation:
+
+```bash
+# In one terminal, start the debug script
+python debug_file_locations.py output/swe-factory-runs/kareldb-test 600 10
+
+# In another terminal, run your command
+LITELLM_API_BASE="https://api.dev.halo.engineer/v1/ai" \
+OPENAI_API_KEY="${OPENAI_API_KEY?->Need a key}" \
+PYTHONPATH=. python app/main.py local-issue \
+  --task-id "kareldb-connection-1" \
+  --local-repo "/Users/hector.maldonado@clearroute.io/xynova/kareldb-cp" \
+  --issue-file "input/kareldb_test_issue.txt" \
+  --model google/gemini-2.5-flash \
+  --output-dir "output/swe-factory-runs/kareldb-test" \
+  --setup-dir "output/swe-factory-runs/testbed" \
+  --results-path "output/swe-factory-runs/results" \
+  --conv-round-limit 3 \
+  --num-processes 1 \
+  --model-temperature 0.2
+```
+
+The debug script will:
+- Monitor file creation every 10 seconds for 10 minutes
+- Log all new files created
+- Warn about files created outside the expected output directory
+- Show the current working directory at each check
+
+### Step 3: Check the Logs
+Look for:
+1. **Expected behavior**: Files being written to the specified output directory
+2. **Unexpected behavior**: Files being written to the current working directory or repository root
+3. **Working directory changes**: Any unexpected changes in the current working directory
+
+## Expected File Locations
+
+Based on your command, files should be created in:
+
+- **Task output files**: `output/swe-factory-runs/kareldb-test/kareldb-connection-1/`
+  - `Dockerfile`
+  - `eval.sh`
+  - `status.json`
+  - `cost.json`
+  - `meta.json`
+  - `problem_statement.txt`
+  - `developer_patch.diff`
+  - `info.log`
+  - `test_analysis_agent_0/` (subdirectory with test results)
+
+- **Setup directory**: `output/swe-factory-runs/testbed/`
+  - Repository clones and working directories
+
+- **Results**: `output/swe-factory-runs/results/results.json`
+  - Aggregated results from all tasks
+
+## Troubleshooting
+
+If files are still being created in the wrong location:
+
+1. **Check the debug logs** to see exactly where files are being written
+2. **Verify the output directory exists** and is writable
+3. **Check for any error messages** about directory creation or file writing
+4. **Ensure no other processes** are changing the working directory
+5. **Verify the command line arguments** are being parsed correctly
+
+## Additional Recommendations
+
+1. **Use absolute paths** in your command line arguments
+2. **Ensure the output directories exist** before running the command
+3. **Check file permissions** on the output directories
+4. 
**Monitor system resources** to ensure there are no disk space issues + +## Code Changes Summary + +The following files were modified to improve file location handling: + +- `app/main.py`: Added absolute path safety checks +- `app/agents/agents_manager.py`: Added debug logging for file creation +- `app/agents/test_analysis_agent/test_analysis_agent.py`: Added debug logging for file creation +- `debug_file_locations.py`: Created debug script for monitoring file creation +- `FILE_LOCATION_DEBUG.md`: This documentation file \ No newline at end of file diff --git a/app/agents/agents_manager.py b/app/agents/agents_manager.py index c86e060a7..6789e9fa5 100644 --- a/app/agents/agents_manager.py +++ b/app/agents/agents_manager.py @@ -49,7 +49,7 @@ def __init__(self, client: docker.DockerClient, start_time: datetime, max_iteration_num: int, - results_path:str, + results_path: str | None, disable_memory_pool:bool, disable_context_retrieval:bool, disable_run_test:bool, @@ -79,7 +79,19 @@ def __init__(self, self.set_agent_status("context_retrieval_agent",True) self.agents_dict['test_analysis_agent'].disable_context_retrieval= disable_context_retrieval self.agents_dict['test_analysis_agent'].disable_run_test = disable_run_test - self.results_file = f'{results_path}/results.json' + + # Handle None results_path by setting a default + if results_path is None: + results_path = os.path.join(output_dir, "results") + + # Ensure results_path is absolute + if not os.path.isabs(results_path): + results_path = os.path.abspath(results_path) + + # Create the results directory if it doesn't exist + os.makedirs(results_path, exist_ok=True) + + self.results_file = os.path.join(results_path, 'results.json') lock_path = self.results_file + '.lock' self.lock = FileLock(lock_path, timeout=30) with self.lock: @@ -263,15 +275,21 @@ def run_workflow(self) -> None: eval_script_content = self.agents_dict['write_eval_script_agent'].get_latest_eval_script() eval_script_skeleton_content = self.agents_dict['write_eval_script_agent'].get_latest_eval_script_skeleton() if dockerfile_content and eval_script_content: - with open(os.path.join(self.output_dir, "Dockerfile"), "w") as dockerfile_f: + dockerfile_path = os.path.join(self.output_dir, "Dockerfile") + logger.info(f"Writing Dockerfile to: {dockerfile_path}") + with open(dockerfile_path, "w") as dockerfile_f: dockerfile_f.write(dockerfile_content) - with open(os.path.join(self.output_dir, "eval.sh"), "w") as eval_script_f: + eval_script_path = os.path.join(self.output_dir, "eval.sh") + logger.info(f"Writing eval.sh to: {eval_script_path}") + with open(eval_script_path, "w") as eval_script_f: eval_script_f.write(eval_script_content) - with open(os.path.join(self.output_dir, "status.json"), "w") as status_file_f: + status_file_path = os.path.join(self.output_dir, "status.json") + logger.info(f"Writing status.json to: {status_file_path}") + with open(status_file_path, "w") as status_file_f: json.dump({"is_finish": self.workflow_finish_status}, status_file_f) if self.workflow_finish_status: diff --git a/app/agents/test_analysis_agent/docker_utils.py b/app/agents/test_analysis_agent/docker_utils.py index 8a6a8e04c..f216d1927 100644 --- a/app/agents/test_analysis_agent/docker_utils.py +++ b/app/agents/test_analysis_agent/docker_utils.py @@ -339,6 +339,13 @@ def build_container(client,test_image_name,test_container_name,instance_id,run_t command="tail -f /dev/null", nano_cpus=None, platform="linux/x86_64", + log_config={ + "Type": "json-file", + "Config": { + "max-size": "10m", + 
"max-file": "3" + } + } ) diff --git a/app/agents/test_analysis_agent/test_analysis_agent.py b/app/agents/test_analysis_agent/test_analysis_agent.py index 8bbd0a269..4257fc465 100644 --- a/app/agents/test_analysis_agent/test_analysis_agent.py +++ b/app/agents/test_analysis_agent/test_analysis_agent.py @@ -24,8 +24,25 @@ import json from os.path import join as pjoin import traceback +from typing import Optional + MAX_LINE_NUM = 600 ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + +def sanitize_docker_image_name(name: str) -> str: + if not name or not str(name).strip(): + return "swe-task" + sanitized = re.sub(r'[^a-z0-9_-]', '-', str(name).lower().strip()) + if sanitized.startswith('-'): + sanitized = 'swe' + sanitized + # Remove trailing hyphens/underscores, then leading underscores + sanitized = sanitized.rstrip('-').rstrip('_').lstrip('_') + if not sanitized or sanitized == "swe": + return "swe-task" + if len(sanitized) > 50: + sanitized = sanitized[:50] + return sanitized + class TestAnalysisAgent(Agent): """ Agent responsible for: @@ -44,7 +61,9 @@ def __init__(self, task: Task, output_dir: str, repo_basic_info: str, client:doc self.run_test_num = 0 self.setup_dockerfile_num = 0 self.repo_basic_info = repo_basic_info - self.task_id = task.task_id.lower() + # Sanitize task_id to ensure valid Docker image names + raw_task_id = getattr(task, 'task_id', '') or '' + self.task_id = sanitize_docker_image_name(raw_task_id) self.client = client self.test_analysis_dir = os.path.join(self.output_dir, "test_analysis_agent") # self.build_image_dir = os.path.join(self.output_dir, "build_image") @@ -102,7 +121,7 @@ def get_test_log_with_line_numbers(self) -> str: full_formatted = [f"{i + 1:>{width}} {line}" for i, line in enumerate(lines)] if len(full_formatted) <= MAX_LINE_NUM: - return f'Test log:\n{"\n".join(full_formatted)}\n\n' + return f'Test log:\n{chr(10).join(full_formatted)}\n\n' head_size = MAX_LINE_NUM // 2 tail_size = MAX_LINE_NUM - head_size @@ -114,7 +133,7 @@ def get_test_log_with_line_numbers(self) -> str: omission = " " * width + " [..., {} lines omitted ...]".format( len(full_formatted) - head_size - tail_size) - truncated_log = "\n".join(head + [omission] + tail) + truncated_log = chr(10).join(head + [omission] + tail) return f'Test log (showing first {head_size} & last {tail_size} lines):\n{truncated_log}\n\n' @@ -298,6 +317,7 @@ def build_docker_image( dockerfile_path = f'{cur_build_image_dir}/Dockerfile' + logger.info(f"Writing Dockerfile to: {dockerfile_path}") with open(dockerfile_path, "w") as f: f.write(dockerfile) @@ -373,7 +393,7 @@ def setup_docker_and_run_test( cur_build_image_dir = self.get_latest_test_analysis_output_dir() os.makedirs(cur_build_image_dir, exist_ok=True) build_image_logger = setup_logger(self.task_id, Path(f'{cur_build_image_dir}/build_image.log')) - # image_name = f"{self.task_id}:latest_{self.setup_dockerfile_num}" + # Generate a valid Docker image name image_name = f"{self.task_id}-dockerfile{self.setup_dockerfile_num}:latest" try: @@ -429,9 +449,8 @@ def run_test(self, eval_script: str) -> (str, str, bool): cur_test_dir = self.get_latest_test_analysis_output_dir() os.makedirs(cur_test_dir, exist_ok=True) run_test_logger = setup_logger(self.task_id, Path(f'{cur_test_dir}/run_test.log')) - # test_image_name = f"{self.task_id}:latest_{self.setup_dockerfile_num}" + # Use sanitized image names to ensure valid Docker tags test_image_name = f"{self.task_id}-dockerfile{self.setup_dockerfile_num}:latest" - # test_container_name = 
f"{self.task_id}:test_{self.run_test_num}" test_container_name = f"{self.task_id}-test{self.run_test_num}" instance_id = self.task_id container = None @@ -487,9 +506,10 @@ def run_test(self, eval_script: str) -> (str, str, bool): run_test_logger.info(f"Git diff before:\n{git_diff_output_before}") eval_file = Path(f"{self.get_latest_test_analysis_output_dir()}/eval.sh") + logger.info(f"Writing eval.sh to: {eval_file}") eval_file.write_text(eval_script) run_test_logger.info( - f"Eval script for {instance_id} written to {patch_file}, now applying to container..." + f"Eval script for {instance_id} written to {eval_file}, now applying to container..." ) copy_to_container(container, eval_file, Path("/eval.sh")) diff --git a/app/globals.py b/app/globals.py index 868f3eef2..d98229759 100644 --- a/app/globals.py +++ b/app/globals.py @@ -5,6 +5,12 @@ # Overall output directory for results output_dir: str = "" +# Setup directory for repositories +setup_dir: str = "" + +# Results path for storing results +results_path: str | None = None + # upper bound of the number of conversation rounds for the agent conv_round_limit: int = 15 @@ -47,3 +53,11 @@ disable_context_retrieval: bool = False disable_run_test: bool = False + +# whether to only organize output without running tasks +organize_output_only: bool = False + +import os + +if results_path is None: + results_path = os.path.join(output_dir, "results") diff --git a/app/main.py b/app/main.py index 65a1554bd..20235cebc 100644 --- a/app/main.py +++ b/app/main.py @@ -77,12 +77,17 @@ def main(args, subparser_dest_attr_name: str = "command"): globals.output_dir = args.output_dir if globals.output_dir is not None: globals.output_dir = abspath(globals.output_dir) + else: + # Set a default output directory if none is provided + globals.output_dir = abspath("output/swe-factory-runs/default") num_processes: int = int(args.num_processes) # set whether brief or verbose log print_stdout: bool = not args.no_print log.print_stdout = print_stdout # model related common.set_model(args.model) + # Set environment variable for subprocesses (keep original model name for OpenAI-compatible endpoints) + os.environ["SWE_FACTORY_MODEL"] = args.model # FIXME: make temperature part of the Model class common.MODEL_TEMP = args.model_temperature # FIXME: we will remove these hyperparamters, which are from AutoCodeRover, thanks to this work. @@ -98,9 +103,18 @@ def main(args, subparser_dest_attr_name: str = "command"): globals.context_generation_limit = args.output_fix_limit globals.setup_dir = args.setup_dir + if globals.setup_dir is not None: + globals.setup_dir = abspath(globals.setup_dir) + else: + # Set a default setup directory if none is provided + globals.setup_dir = abspath("output/swe-factory-runs/testbed") globals.organize_output_only = args.organize_output_only - globals.results_path = args.results_path + # Set a default results_path if none is provided + if args.results_path is None: + globals.results_path = None # Let AgentsManager handle the default + else: + globals.results_path = args.results_path globals.disable_memory_pool = args.disable_memory_pool globals.disable_run_test = args.disable_run_test @@ -225,6 +239,17 @@ def set_local_parser_args(parser: ArgumentParser) -> None: "--local-repo", type=str, help="Path to a local copy of the target repo." 
) parser.add_argument("--issue-file", type=str, help="Path to a local issue file.") + parser.add_argument( + "--setup-dir", + type=str, + help="The directory where repositories should be cloned to.", + ) + parser.add_argument( + "--results-path", + type=str, + default=None, + help="The directory where results should be saved.", + ) def add_task_related_args(parser: ArgumentParser) -> None: @@ -247,6 +272,9 @@ def model_parser(name: str): return name if name.startswith("litellm-generic-"): return name + # Allow direct model names (like google/gemini-2.5-flash) that contain "/" + if "/" in name: + return name raise TypeError(f"Invalid model name: {name}") parser.add_argument( @@ -429,7 +457,7 @@ def make_swe_tasks( setup_info = {} task_info = tasks_map[task_id] task_start_time_s = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - repo_cache_name = f'{task_info['repo']}_cache' + repo_cache_name = f"{task_info['repo']}_cache" repo_cache_dir = pjoin(setup_dir,repo_cache_name) if not os.path.isdir(repo_cache_dir): github_link = f"https://github.com/{task_info['repo']}.git" @@ -632,12 +660,25 @@ def run_raw_task( Returns: Whether the task completed successfully. """ + # Set the model in the subprocess to ensure SELECTED_MODEL is available + # We need to get the model name from the current process environment or use a default + model_name = os.getenv("SWE_FACTORY_MODEL", "gpt-3.5-turbo-0125") + common.set_model(model_name) + task_id = task.task_id start_time_s = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # task_output_dir = pjoin(globals.output_dir, f"{task_id}_{start_time_s}") + + # Ensure globals.output_dir is set + if not globals.output_dir: + globals.output_dir = abspath("output/swe-factory-runs/default") + task_output_dir = pjoin(globals.output_dir, f"{task_id}") + # Ensure task_output_dir is absolute + task_output_dir = os.path.abspath(task_output_dir) + status_file = pjoin(task_output_dir, "status.json") if os.path.exists(status_file): log.log_and_always_print(f"Status file already exists for task {task_id}, skipping execution") @@ -679,10 +720,34 @@ def do_inference( print_callback: Callable[[dict], None] | None = None, ) -> bool: client = docker.from_env() + + # Ensure task_output_dir is absolute + task_output_dir = os.path.abspath(task_output_dir) + apputils.create_dir_if_not_exists(task_output_dir) # github_link = f'https://github.com/{python_task.repo_name}.git' - commit_hash = python_task.commit - apputils.clone_repo_and_checkout(python_task.repo_cache_path,commit_hash,python_task.project_path) + # Handle both SweTask and PlainTask + if hasattr(python_task, 'commit'): + commit_hash = python_task.commit + elif hasattr(python_task, 'commit_hash'): + commit_hash = python_task.commit_hash + else: + raise AttributeError(f"Task object {type(python_task)} has no commit or commit_hash attribute") + + # For PlainTask, create a working directory in the testbed + if hasattr(python_task, 'repo_cache_path'): + repo_cache_path = python_task.repo_cache_path + else: + # Ensure globals.setup_dir is set + if not globals.setup_dir: + globals.setup_dir = abspath("output/swe-factory-runs/testbed") + + # Create a working directory in the testbed to avoid deleting the original repo + working_dir = pjoin(globals.setup_dir, f"{python_task.project_path.split('/')[-1]}_working") + repo_cache_path = python_task.project_path + python_task.project_path = working_dir + + apputils.clone_repo_and_checkout(repo_cache_path, commit_hash, python_task.project_path) logger.add( pjoin(task_output_dir, "info.log"), 
level="DEBUG", @@ -713,7 +778,7 @@ def do_inference( dump_cost(start_time, end_time, task_output_dir, python_task.project_path) finally: # python_task.reset_project() - python_task.remove_project() + # python_task.remove_project() # Commented out to prevent deleting original repository if client: client.close() @@ -723,6 +788,9 @@ def do_inference( def dump_cost( start_time: datetime, end_time: datetime, task_output_dir: str, project_path: str ): + # Ensure task_output_dir is absolute + task_output_dir = os.path.abspath(task_output_dir) + with apputils.cd(project_path): commit_hash = apputils.get_current_commit_hash() model_stats = common.SELECTED_MODEL.get_overall_exec_stats() diff --git a/app/model/common.py b/app/model/common.py index 76a698ba9..e8b8e483f 100644 --- a/app/model/common.py +++ b/app/model/common.py @@ -5,8 +5,8 @@ from typing import Literal import litellm -from litellm import cost_per_token -from litellm.utils import Choices, Message, ModelResponse +from litellm.cost_calculator import cost_per_token +from litellm.types.utils import Choices, Message, ModelResponse from openai import BadRequestError from tenacity import retry, stop_after_attempt, wait_random_exponential @@ -131,26 +131,48 @@ def call( ): # FIXME: ignore tools field since we don't use tools now try: + # Enable verbose logging for debugging - os is already imported at the top of the file + prefill_content = "{" if response_format == "json_object": # prefill messages.append({"role": "assistant", "content": prefill_content}) + max_tokens_val = os.getenv("ACR_TOKEN_LIMIT", "1024") + max_tokens_int = int(max_tokens_val) if max_tokens_val else 1024 + + # Check if we need to use a custom API base + api_base = os.getenv("LITELLM_API_BASE", None) + extra_kwargs = {} + if api_base: + print(f"Using custom API base: {api_base}") + print(f"Model name: {self.name}") + extra_kwargs["api_base"] = api_base + # Force OpenAI provider when using custom endpoint + extra_kwargs["custom_llm_provider"] = "openai" + # Set additional headers if needed + extra_kwargs["headers"] = {"Content-Type": "application/json"} + response = litellm.completion( model=self.name, messages=messages, temperature=MODEL_TEMP, - max_tokens=os.getenv("ACR_TOKEN_LIMIT", 1024), + max_tokens=max_tokens_int, response_format=( {"type": response_format} if "gpt" in self.name else None ), top_p=top_p, stream=False, + **extra_kwargs ) assert isinstance(response, ModelResponse) - resp_usage = response.usage - assert resp_usage is not None - input_tokens = int(resp_usage.prompt_tokens) - output_tokens = int(resp_usage.completion_tokens) + resp_usage = getattr(response, 'usage', None) + if resp_usage is None: + # Fallback if usage is not available + input_tokens = 0 + output_tokens = 0 + else: + input_tokens = int(resp_usage.prompt_tokens) + output_tokens = int(resp_usage.completion_tokens) cost = self.calc_cost(input_tokens, output_tokens) thread_cost.process_cost += cost @@ -193,9 +215,37 @@ def get_all_model_names(): def set_model(model_name: str): global SELECTED_MODEL if model_name not in MODEL_HUB and not model_name.startswith("litellm-generic-"): - print(f"Invalid model name: {model_name}") - sys.exit(1) - if model_name.startswith("litellm-generic-"): + # Handle direct model names (like google/gemini-2.5-flash) as OpenAI-compatible models + if "/" in model_name: # This looks like a direct model name + # Don't transform the model name - use it as-is for OpenAI-compatible endpoints + real_model_name = model_name + print(f"Using {model_name} as OpenAI-compatible 
model with custom endpoint") + + prompt_tokens = 5 + completion_tokens = 10 + try: + prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = ( + cost_per_token( + model=real_model_name, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + except Exception as e: + # If cost calculation fails, use default values + print(f"Warning: Could not calculate costs for {model_name}, using defaults") + prompt_tokens_cost_usd_dollar = 0.00000015 # Default cost + completion_tokens_cost_usd_dollar = 0.0000006 # Default cost + + SELECTED_MODEL = LiteLLMGeneric( + real_model_name, + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) + else: + print(f"Invalid model name: {model_name}") + sys.exit(1) + elif model_name.startswith("litellm-generic-"): real_model_name = model_name.removeprefix("litellm-generic-") prompt_tokens = 5 completion_tokens = 10 diff --git a/app/task.py b/app/task.py index ccc8385c1..f74785b67 100644 --- a/app/task.py +++ b/app/task.py @@ -107,11 +107,24 @@ class PlainTask(Task): commit_hash: str local_path: str problem_statement: str + patch: str = "" + test_patch: str = "" + repo_name: str = "" + version: str = "" + task_id: str = "" + + @property + def commit(self) -> str: + return self.commit_hash @property def project_path(self) -> str: return self.local_path + @project_path.setter + def project_path(self, value: str) -> None: + self.local_path = value + def setup_project(self) -> None: with apputils.cd(self.project_path): apputils.repo_reset_and_clean_checkout(self.commit_hash) diff --git a/debug_file_locations.py b/debug_file_locations.py new file mode 100644 index 000000000..a14f8b2e8 --- /dev/null +++ b/debug_file_locations.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Debug script to help identify where files are being written during execution. +This script will check for files in the current directory and compare with expected output directory. +""" + +import os +import sys +import time +from pathlib import Path +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def get_files_in_directory(directory): + """Get all files in a directory recursively.""" + files = [] + for root, dirs, filenames in os.walk(directory): + for filename in filenames: + file_path = os.path.join(root, filename) + files.append(file_path) + return files + +def monitor_file_creation(output_dir, check_interval=5, duration=300): + """ + Monitor file creation by periodically checking for new files. 
+ + Args: + output_dir: The expected output directory + check_interval: How often to check for new files (in seconds) + duration: How long to monitor (in seconds) + """ + output_dir = os.path.abspath(output_dir) + current_dir = os.getcwd() + + logger.info(f"Starting file creation monitoring for {duration} seconds...") + logger.info(f"Expected output directory: {output_dir}") + logger.info(f"Current working directory: {current_dir}") + + # Get initial list of files + initial_files = set() + if os.path.exists(output_dir): + initial_files = set(get_files_in_directory(output_dir)) + + # Also check current directory for any files that might be created there + current_dir_files = set() + if os.path.exists(current_dir): + current_dir_files = set(get_files_in_directory(current_dir)) + + start_time = time.time() + check_count = 0 + + while time.time() - start_time < duration: + check_count += 1 + logger.info(f"Check #{check_count} - Time elapsed: {time.time() - start_time:.1f}s") + + # Check for new files in output directory + if os.path.exists(output_dir): + current_files = set(get_files_in_directory(output_dir)) + new_files = current_files - initial_files + if new_files: + logger.info(f"New files in output directory ({len(new_files)}):") + for file_path in sorted(new_files): + logger.info(f" + {file_path}") + initial_files = current_files + + # Check for new files in current directory + if os.path.exists(current_dir): + current_files = set(get_files_in_directory(current_dir)) + new_files = current_files - current_dir_files + if new_files: + logger.warning(f"WARNING: New files in current directory ({len(new_files)}):") + for file_path in sorted(new_files): + logger.warning(f" ! {file_path}") + if not file_path.startswith(output_dir): + logger.error(f"ERROR: File created outside expected output directory!") + logger.error(f" File: {file_path}") + logger.error(f" Expected: {output_dir}") + current_dir_files = current_files + + time.sleep(check_interval) + + logger.info("File creation monitoring completed.") + logger.info(f"Total checks performed: {check_count}") + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python debug_file_locations.py [duration_seconds] [check_interval_seconds]") + sys.exit(1) + + output_dir = sys.argv[1] + duration = int(sys.argv[2]) if len(sys.argv) > 2 else 300 + check_interval = int(sys.argv[3]) if len(sys.argv) > 3 else 5 + + monitor_file_creation(output_dir, check_interval, duration) \ No newline at end of file diff --git a/evaluation/docker_build.py b/evaluation/docker_build.py index 53db57a0c..15bda4666 100644 --- a/evaluation/docker_build.py +++ b/evaluation/docker_build.py @@ -603,6 +603,13 @@ def build_container( command="tail -f /dev/null", nano_cpus=nano_cpus, platform=test_spec.platform, + log_config={ + "Type": "json-file", + "Config": { + "max-size": "10m", + "max-file": "3" + } + } ) logger.info(f"Container for {test_spec.instance_id} created: {container.id}") @@ -659,6 +666,13 @@ def build_setup_container( command="tail -f /dev/null", # nano_cpus=nano_cpus, platform=test_spec.platform, + log_config={ + "Type": "json-file", + "Config": { + "max-size": "10m", + "max-file": "3" + } + } ) logger.info(f"Container for {test_spec.instance_id} created: {container.id}") diff --git a/requirements.txt b/requirements.txt index 183b61de0..0c9defb7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -127,7 +127,7 @@ tree-sitter-c==0.21.4 tree-sitter-cpp==0.22.2 tree-sitter-java==0.21.0 tree-sitter-languages==1.10.2 -triton==2.2.0 +# triton==2.2.0 
types-jsonschema==4.21.0.20240311 typing_extensions tzdata==2025.2 diff --git a/test_results_path_fix.py b/test_results_path_fix.py new file mode 100644 index 000000000..507ea2fc7 --- /dev/null +++ b/test_results_path_fix.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +Test script to verify that results.json files are no longer created in the root directory +when results_path is None. +""" + +import os +import sys +import tempfile +import shutil +from pathlib import Path + +# Add the app directory to the Python path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app')) + +from agents.agents_manager import AgentsManager +from task import Task +from datetime import datetime +import docker + +class MockTask(Task): + """Mock task for testing""" + def __init__(self): + self.repo_name = "test-repo" + self.commit = "test-commit" + self.version = "test-version" + self.test_patch = "test-patch" + self._project_path = "/tmp/test-project" + + @property + def project_path(self) -> str: + return self._project_path + + @project_path.setter + def project_path(self, value: str) -> None: + self._project_path = value + + def get_issue_statement(self) -> str: + return "Test issue statement" + + def setup_project(self) -> None: + pass + + def reset_project(self) -> None: + pass + +def test_results_path_fix(): + """Test that results.json is not created in root directory when results_path is None""" + + # Create a temporary directory for testing + with tempfile.TemporaryDirectory() as temp_dir: + print(f"Testing in temporary directory: {temp_dir}") + + # Create a mock task + task = MockTask() + + # Create output directory + output_dir = os.path.join(temp_dir, "output") + os.makedirs(output_dir, exist_ok=True) + + # Test with results_path=None (the problematic case) + try: + # This should create results.json in output/results/ instead of root + agents_manager = AgentsManager( + task=task, + output_dir=output_dir, + client=docker.from_env(), + start_time=datetime.now(), + max_iteration_num=1, + results_path=None, # This was causing the issue + disable_memory_pool=False, + disable_context_retrieval=False, + disable_run_test=False + ) + + # Check if results.json was created in the expected location + expected_results_file = os.path.join(output_dir, "results", "results.json") + expected_lock_file = expected_results_file + ".lock" + + if os.path.exists(expected_results_file): + print(f"✅ SUCCESS: results.json created in expected location: {expected_results_file}") + else: + print(f"❌ FAILED: results.json not found in expected location: {expected_results_file}") + return False + + if os.path.exists(expected_lock_file): + print(f"✅ SUCCESS: results.json.lock created in expected location: {expected_lock_file}") + else: + print(f"❌ FAILED: results.json.lock not found in expected location: {expected_lock_file}") + return False + + # Check that files are NOT created in the current working directory + root_results_file = "results.json" + root_lock_file = "results.json.lock" + + if not os.path.exists(root_results_file): + print(f"✅ SUCCESS: results.json NOT created in root directory") + else: + print(f"❌ FAILED: results.json still created in root directory") + return False + + if not os.path.exists(root_lock_file): + print(f"✅ SUCCESS: results.json.lock NOT created in root directory") + else: + print(f"❌ FAILED: results.json.lock still created in root directory") + return False + + print("🎉 All tests passed! 
The fix is working correctly.") + return True + + except Exception as e: + print(f"❌ ERROR: Test failed with exception: {e}") + return False + +if __name__ == "__main__": + success = test_results_path_fix() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/tests/test_docker_name_fix.py b/tests/test_docker_name_fix.py new file mode 100644 index 000000000..d2e5a3b24 --- /dev/null +++ b/tests/test_docker_name_fix.py @@ -0,0 +1,44 @@ +import re +import pytest + +def sanitize_docker_image_name(name: str) -> str: + if not name or not str(name).strip(): + return "swe-task" + sanitized = re.sub(r'[^a-z0-9_-]', '-', str(name).lower().strip()) + if sanitized.startswith('-'): + sanitized = 'swe' + sanitized + sanitized = sanitized.rstrip('-').rstrip('_').lstrip('_') + if not sanitized or sanitized == "swe": + return "swe-task" + if len(sanitized) > 50: + sanitized = sanitized[:50] + return sanitized + +def test_sanitize_docker_image_name(): + test_cases = [ + ("", "swe-task"), + (None, "swe-task"), + (" ", "swe-task"), + ("kareldb-connection-1", "kareldb-connection-1"), + ("My Project", "my-project"), + ("test@example.com", "test-example-com"), + ("-dockerfile1", "swe-dockerfile1"), + ("-test", "swe-test"), + ("---", "swe-task"), + ("project@#$%^&*()", "project"), + ("CamelCase", "camelcase"), + ("snake_case", "snake_case"), + ("kebab-case", "kebab-case"), + ("a" * 100, "a" * 50), + ("___test___", "test"), + ] + for input_name, expected in test_cases: + result = sanitize_docker_image_name(input_name) + assert result == expected, f"Input: {input_name!r} -> {result!r} (expected: {expected!r})" + +def test_image_name_generation(): + problematic_task_id = "" + sanitized_task_id = sanitize_docker_image_name(problematic_task_id) + setup_dockerfile_num = 1 + image_name = f"{sanitized_task_id}-dockerfile{setup_dockerfile_num}:latest" + assert not image_name.startswith('-'), f"Image name should not start with hyphen: {image_name}" \ No newline at end of file
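A quick way to sanity-check the logging change once this patch is applied: the sketch below is illustrative only (not part of the patch) and uses the Docker SDK for Python to create a throwaway container with the same `log_config` dict that the patched `build_container()` passes, then asserts that the container reports the `json-file` log driver. The image and container names are placeholders; any small image already present locally will do.

```python
# Illustrative verification of the log_config change; the image and container
# names below are placeholders, not values used by SWE-Factory itself.
import docker

client = docker.from_env()
container = client.containers.create(
    image="alpine:3.19",          # assumes a small image is already available locally
    name="logconfig-smoke-test",  # placeholder container name
    command="tail -f /dev/null",
    detach=True,
    log_config={
        "Type": "json-file",
        "Config": {"max-size": "10m", "max-file": "3"},
    },
)
try:
    # container.attrs mirrors `docker inspect`; with the patch applied the
    # HostConfig.LogConfig entry should report json-file rather than none.
    log_cfg = container.attrs["HostConfig"]["LogConfig"]
    assert log_cfg["Type"] == "json-file", log_cfg
    print("LogConfig:", log_cfg)
finally:
    container.remove(force=True)
```

With the old `"Type": "none"` setting, `docker logs` on such a container errors out because the `none` driver does not support reading logs; after this change the same command returns the captured output.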