From 6c68b798295d9a0d3c2fcd46c22af8da8e9d130c Mon Sep 17 00:00:00 2001
From: Hec
Date: Mon, 7 Jul 2025 22:10:19 +1000
Subject: [PATCH] make it work

---
 .gitignore                              |   6 +-
 DOCKER_LOGGING_FIX.md                   | 102 ++++++++++++++
 FILE_LOCATION_DEBUG.md                  | 129 ++++++++++++++++++
 app/agents/agents_manager.py            |  28 +++-
 .../test_analysis_agent/docker_utils.py |   7 +
 .../test_analysis_agent.py              |  34 ++++-
 app/globals.py                          |  14 ++
 app/main.py                             |  78 ++++++++++-
 app/model/common.py                     |  70 ++++++++--
 app/task.py                             |  13 ++
 debug_file_locations.py                 |  97 +++++++++++++
 evaluation/docker_build.py              |  14 ++
 requirements.txt                        |   2 +-
 test_results_path_fix.py                | 117 ++++++++++++++++
 tests/test_docker_name_fix.py           |  44 ++++++
 15 files changed, 726 insertions(+), 29 deletions(-)
 create mode 100644 DOCKER_LOGGING_FIX.md
 create mode 100644 FILE_LOCATION_DEBUG.md
 create mode 100644 debug_file_locations.py
 create mode 100644 test_results_path_fix.py
 create mode 100644 tests/test_docker_name_fix.py

diff --git a/.gitignore b/.gitignore
index f0011289f..e921ca410 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,9 +162,13 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+input/
 evaluation/reports/
 output/
 # *data_collection/collect/*.sh
 *data_collection/collect/temp
 *evaluation/temp
-temp/
\ No newline at end of file
+temp/
+
+
+ignore.*
\ No newline at end of file
diff --git a/DOCKER_LOGGING_FIX.md b/DOCKER_LOGGING_FIX.md
new file mode 100644
index 000000000..45c5f4746
--- /dev/null
+++ b/DOCKER_LOGGING_FIX.md
@@ -0,0 +1,102 @@
+# Docker Logging Configuration Fix
+
+## Issue Description
+
+The Docker containers created by the SWE Factory tool were using a logging configuration that disabled container logs:
+
+```json
+"ContainerIDFile": "",
+"LogConfig": {
+    "Type": "none",
+    "Config": {}
+}
+```
+
+This configuration meant:
+- No container logs were being captured
+- `docker logs <container_name>` would not work
+- Debugging container issues was difficult
+- The tool couldn't capture container output for analysis
+
+## Root Cause
+
+The issue was in the container creation code in two files:
+1. `app/agents/test_analysis_agent/docker_utils.py` - `build_container()` function
+2. `evaluation/docker_build.py` - `build_container()` and `build_setup_container()` functions
+
+These functions were creating containers without explicit logging configuration, causing Docker to use default settings that might be set to `"none"` in some environments.
+
+## Solution Applied
+
+I've updated all container creation calls to include proper logging configuration:
+
+### Before:
+```python
+container = client.containers.create(
+    image=test_image_name,
+    name=test_container_name,
+    user="root",
+    detach=True,
+    command="tail -f /dev/null",
+    nano_cpus=None,
+    platform="linux/x86_64",
+)
+```
+
+### After:
+```python
+container = client.containers.create(
+    image=test_image_name,
+    name=test_container_name,
+    user="root",
+    detach=True,
+    command="tail -f /dev/null",
+    nano_cpus=None,
+    platform="linux/x86_64",
+    log_config={
+        "Type": "json-file",
+        "Config": {
+            "max-size": "10m",
+            "max-file": "3"
+        }
+    }
+)
+```
+
+## Benefits of the Fix
+
+1. **Container Logs Available**: You can now use `docker logs <container_name>` to view container output
+2. **Better Debugging**: Container issues can be diagnosed more easily
+3. **Log Rotation**: Logs are automatically rotated when they reach 10MB
+4. **Storage Management**: Only 3 log files are kept per container
+5. **Tool Functionality**: The SWE Factory tool can now capture and analyze container output
+
+## Files Modified
+
+1. `app/agents/test_analysis_agent/docker_utils.py` - Lines 330-340
+2. `evaluation/docker_build.py` - Lines 590-600 and 650-660
+
+## Testing the Fix
+
+After applying this fix, you should be able to:
+
+1. Run the SWE Factory tool as usual
+2. Use `docker logs <container_name>` to view container logs
+3. See container output in the tool's log files
+4. Debug container issues more effectively
+
+## Recommended Docker Configuration
+
+For optimal performance, ensure your Docker daemon is configured with:
+
+```json
+{
+  "log-driver": "json-file",
+  "log-opts": {
+    "max-size": "10m",
+    "max-file": "3"
+  }
+}
+```
+
+This ensures consistent logging behavior across all containers, even those created without explicit logging configuration.
\ No newline at end of file
diff --git a/FILE_LOCATION_DEBUG.md b/FILE_LOCATION_DEBUG.md
new file mode 100644
index 000000000..63aff81d4
--- /dev/null
+++ b/FILE_LOCATION_DEBUG.md
@@ -0,0 +1,129 @@
+# File Location Debug Guide
+
+## Issue Description
+
+When running the SWE Factory tool, files are being created at the repository root instead of in the specified output directories, despite providing the correct `--output-dir`, `--setup-dir`, and `--results-path` parameters.
+
+## Root Cause Analysis
+
+After analyzing the codebase, I found that the system is correctly designed to use the specified output directories. All file writing operations use proper path construction with `pjoin()` or `os.path.join()` to ensure files are written to the correct output directories.
+
+However, there are several potential causes for files appearing in the wrong location:
+
+### 1. Working Directory Changes
+The code uses `cd` context managers in several places (such as in the `dump_cost` function), which temporarily change the working directory. If any file operations happen outside of these context managers while the directory is changed, they might write to the wrong location.
+
+### 2. Race Conditions
+The system uses multiprocessing, and there might be race conditions where the working directory is changed in one process while another process is writing files.
+
+### 3. Missing Absolute Paths
+Some file operations might not be using absolute paths, causing them to write relative to the current working directory.
+
+## Changes Made
+
+I've made the following improvements to ensure files are written to the correct locations:
+
+### 1. Added Absolute Path Safety Checks
+- Modified `run_raw_task()` to ensure `task_output_dir` is absolute
+- Modified `do_inference()` to ensure `task_output_dir` is absolute
+- Modified `dump_cost()` to ensure `task_output_dir` is absolute
+
+### 2. Added Debug Logging
+- Added logging in `AgentsManager` to track where Dockerfile, eval.sh, and status.json are written
+- Added logging in `TestAnalysisAgent` to track where Dockerfile and eval.sh are written
+
+### 3. Created Debug Script
+- Created `debug_file_locations.py` to monitor file creation during execution
+
+## How to Debug the Issue
+
+### Step 1: Run with Debug Logging
+The enhanced logging will now show exactly where files are being written. Look for log messages like:
+```
+Writing Dockerfile to: /path/to/output/dir/Dockerfile
+Writing eval.sh to: /path/to/output/dir/eval.sh
+Writing status.json to: /path/to/output/dir/status.json
+```
+
+### Step 2: Use the Debug Script
+Run the debug script in a separate terminal to monitor file creation:
+
+```bash
+# In one terminal, start the debug script
+python debug_file_locations.py output/swe-factory-runs/kareldb-test 600 10
+
+# In another terminal, run your command
+LITELLM_API_BASE="https://api.dev.halo.engineer/v1/ai" \
+OPENAI_API_KEY="${OPENAI_API_KEY?->Need a key}" \
+PYTHONPATH=. python app/main.py local-issue \
+  --task-id "kareldb-connection-1" \
+  --local-repo "/Users/hector.maldonado@clearroute.io/xynova/kareldb-cp" \
+  --issue-file "input/kareldb_test_issue.txt" \
+  --model google/gemini-2.5-flash \
+  --output-dir "output/swe-factory-runs/kareldb-test" \
+  --setup-dir "output/swe-factory-runs/testbed" \
+  --results-path "output/swe-factory-runs/results" \
+  --conv-round-limit 3 \
+  --num-processes 1 \
+  --model-temperature 0.2
+```
+
+The debug script will:
+- Monitor file creation every 10 seconds for 10 minutes
+- Log all new files created
+- Warn about files created outside the expected output directory
+- Show the current working directory at each check
+
+### Step 3: Check the Logs
+Look for:
+1. **Expected behavior**: Files being written to the specified output directory
+2. **Unexpected behavior**: Files being written to the current working directory or repository root
+3. **Working directory changes**: Any unexpected changes in the current working directory
+
+## Expected File Locations
+
+Based on your command, files should be created in:
+
+- **Task output files**: `output/swe-factory-runs/kareldb-test/kareldb-connection-1/`
+  - `Dockerfile`
+  - `eval.sh`
+  - `status.json`
+  - `cost.json`
+  - `meta.json`
+  - `problem_statement.txt`
+  - `developer_patch.diff`
+  - `info.log`
+  - `test_analysis_agent_0/` (subdirectory with test results)
+
+- **Setup directory**: `output/swe-factory-runs/testbed/`
+  - Repository clones and working directories
+
+- **Results**: `output/swe-factory-runs/results/results.json`
+  - Aggregated results from all tasks
+
+## Troubleshooting
+
+If files are still being created in the wrong location:
+
+1. **Check the debug logs** to see exactly where files are being written
+2. **Verify the output directory exists** and is writable
+3. **Check for any error messages** about directory creation or file writing
+4. **Ensure no other processes** are changing the working directory
+5. **Verify the command line arguments** are being parsed correctly
+
+## Additional Recommendations
+
+1. **Use absolute paths** in your command line arguments
+2. **Ensure the output directories exist** before running the command
+3. **Check file permissions** on the output directories
+4. 
**Monitor system resources** to ensure there are no disk space issues + +## Code Changes Summary + +The following files were modified to improve file location handling: + +- `app/main.py`: Added absolute path safety checks +- `app/agents/agents_manager.py`: Added debug logging for file creation +- `app/agents/test_analysis_agent/test_analysis_agent.py`: Added debug logging for file creation +- `debug_file_locations.py`: Created debug script for monitoring file creation +- `FILE_LOCATION_DEBUG.md`: This documentation file \ No newline at end of file diff --git a/app/agents/agents_manager.py b/app/agents/agents_manager.py index c86e060a7..6789e9fa5 100644 --- a/app/agents/agents_manager.py +++ b/app/agents/agents_manager.py @@ -49,7 +49,7 @@ def __init__(self, client: docker.DockerClient, start_time: datetime, max_iteration_num: int, - results_path:str, + results_path: str | None, disable_memory_pool:bool, disable_context_retrieval:bool, disable_run_test:bool, @@ -79,7 +79,19 @@ def __init__(self, self.set_agent_status("context_retrieval_agent",True) self.agents_dict['test_analysis_agent'].disable_context_retrieval= disable_context_retrieval self.agents_dict['test_analysis_agent'].disable_run_test = disable_run_test - self.results_file = f'{results_path}/results.json' + + # Handle None results_path by setting a default + if results_path is None: + results_path = os.path.join(output_dir, "results") + + # Ensure results_path is absolute + if not os.path.isabs(results_path): + results_path = os.path.abspath(results_path) + + # Create the results directory if it doesn't exist + os.makedirs(results_path, exist_ok=True) + + self.results_file = os.path.join(results_path, 'results.json') lock_path = self.results_file + '.lock' self.lock = FileLock(lock_path, timeout=30) with self.lock: @@ -263,15 +275,21 @@ def run_workflow(self) -> None: eval_script_content = self.agents_dict['write_eval_script_agent'].get_latest_eval_script() eval_script_skeleton_content = self.agents_dict['write_eval_script_agent'].get_latest_eval_script_skeleton() if dockerfile_content and eval_script_content: - with open(os.path.join(self.output_dir, "Dockerfile"), "w") as dockerfile_f: + dockerfile_path = os.path.join(self.output_dir, "Dockerfile") + logger.info(f"Writing Dockerfile to: {dockerfile_path}") + with open(dockerfile_path, "w") as dockerfile_f: dockerfile_f.write(dockerfile_content) - with open(os.path.join(self.output_dir, "eval.sh"), "w") as eval_script_f: + eval_script_path = os.path.join(self.output_dir, "eval.sh") + logger.info(f"Writing eval.sh to: {eval_script_path}") + with open(eval_script_path, "w") as eval_script_f: eval_script_f.write(eval_script_content) - with open(os.path.join(self.output_dir, "status.json"), "w") as status_file_f: + status_file_path = os.path.join(self.output_dir, "status.json") + logger.info(f"Writing status.json to: {status_file_path}") + with open(status_file_path, "w") as status_file_f: json.dump({"is_finish": self.workflow_finish_status}, status_file_f) if self.workflow_finish_status: diff --git a/app/agents/test_analysis_agent/docker_utils.py b/app/agents/test_analysis_agent/docker_utils.py index 8a6a8e04c..f216d1927 100644 --- a/app/agents/test_analysis_agent/docker_utils.py +++ b/app/agents/test_analysis_agent/docker_utils.py @@ -339,6 +339,13 @@ def build_container(client,test_image_name,test_container_name,instance_id,run_t command="tail -f /dev/null", nano_cpus=None, platform="linux/x86_64", + log_config={ + "Type": "json-file", + "Config": { + "max-size": "10m", + 
"max-file": "3" + } + } ) diff --git a/app/agents/test_analysis_agent/test_analysis_agent.py b/app/agents/test_analysis_agent/test_analysis_agent.py index 8bbd0a269..4257fc465 100644 --- a/app/agents/test_analysis_agent/test_analysis_agent.py +++ b/app/agents/test_analysis_agent/test_analysis_agent.py @@ -24,8 +24,25 @@ import json from os.path import join as pjoin import traceback +from typing import Optional + MAX_LINE_NUM = 600 ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + +def sanitize_docker_image_name(name: str) -> str: + if not name or not str(name).strip(): + return "swe-task" + sanitized = re.sub(r'[^a-z0-9_-]', '-', str(name).lower().strip()) + if sanitized.startswith('-'): + sanitized = 'swe' + sanitized + # Remove trailing hyphens/underscores, then leading underscores + sanitized = sanitized.rstrip('-').rstrip('_').lstrip('_') + if not sanitized or sanitized == "swe": + return "swe-task" + if len(sanitized) > 50: + sanitized = sanitized[:50] + return sanitized + class TestAnalysisAgent(Agent): """ Agent responsible for: @@ -44,7 +61,9 @@ def __init__(self, task: Task, output_dir: str, repo_basic_info: str, client:doc self.run_test_num = 0 self.setup_dockerfile_num = 0 self.repo_basic_info = repo_basic_info - self.task_id = task.task_id.lower() + # Sanitize task_id to ensure valid Docker image names + raw_task_id = getattr(task, 'task_id', '') or '' + self.task_id = sanitize_docker_image_name(raw_task_id) self.client = client self.test_analysis_dir = os.path.join(self.output_dir, "test_analysis_agent") # self.build_image_dir = os.path.join(self.output_dir, "build_image") @@ -102,7 +121,7 @@ def get_test_log_with_line_numbers(self) -> str: full_formatted = [f"{i + 1:>{width}} {line}" for i, line in enumerate(lines)] if len(full_formatted) <= MAX_LINE_NUM: - return f'Test log:\n{"\n".join(full_formatted)}\n\n' + return f'Test log:\n{chr(10).join(full_formatted)}\n\n' head_size = MAX_LINE_NUM // 2 tail_size = MAX_LINE_NUM - head_size @@ -114,7 +133,7 @@ def get_test_log_with_line_numbers(self) -> str: omission = " " * width + " [..., {} lines omitted ...]".format( len(full_formatted) - head_size - tail_size) - truncated_log = "\n".join(head + [omission] + tail) + truncated_log = chr(10).join(head + [omission] + tail) return f'Test log (showing first {head_size} & last {tail_size} lines):\n{truncated_log}\n\n' @@ -298,6 +317,7 @@ def build_docker_image( dockerfile_path = f'{cur_build_image_dir}/Dockerfile' + logger.info(f"Writing Dockerfile to: {dockerfile_path}") with open(dockerfile_path, "w") as f: f.write(dockerfile) @@ -373,7 +393,7 @@ def setup_docker_and_run_test( cur_build_image_dir = self.get_latest_test_analysis_output_dir() os.makedirs(cur_build_image_dir, exist_ok=True) build_image_logger = setup_logger(self.task_id, Path(f'{cur_build_image_dir}/build_image.log')) - # image_name = f"{self.task_id}:latest_{self.setup_dockerfile_num}" + # Generate a valid Docker image name image_name = f"{self.task_id}-dockerfile{self.setup_dockerfile_num}:latest" try: @@ -429,9 +449,8 @@ def run_test(self, eval_script: str) -> (str, str, bool): cur_test_dir = self.get_latest_test_analysis_output_dir() os.makedirs(cur_test_dir, exist_ok=True) run_test_logger = setup_logger(self.task_id, Path(f'{cur_test_dir}/run_test.log')) - # test_image_name = f"{self.task_id}:latest_{self.setup_dockerfile_num}" + # Use sanitized image names to ensure valid Docker tags test_image_name = f"{self.task_id}-dockerfile{self.setup_dockerfile_num}:latest" - # test_container_name = 
f"{self.task_id}:test_{self.run_test_num}" test_container_name = f"{self.task_id}-test{self.run_test_num}" instance_id = self.task_id container = None @@ -487,9 +506,10 @@ def run_test(self, eval_script: str) -> (str, str, bool): run_test_logger.info(f"Git diff before:\n{git_diff_output_before}") eval_file = Path(f"{self.get_latest_test_analysis_output_dir()}/eval.sh") + logger.info(f"Writing eval.sh to: {eval_file}") eval_file.write_text(eval_script) run_test_logger.info( - f"Eval script for {instance_id} written to {patch_file}, now applying to container..." + f"Eval script for {instance_id} written to {eval_file}, now applying to container..." ) copy_to_container(container, eval_file, Path("/eval.sh")) diff --git a/app/globals.py b/app/globals.py index 868f3eef2..d98229759 100644 --- a/app/globals.py +++ b/app/globals.py @@ -5,6 +5,12 @@ # Overall output directory for results output_dir: str = "" +# Setup directory for repositories +setup_dir: str = "" + +# Results path for storing results +results_path: str | None = None + # upper bound of the number of conversation rounds for the agent conv_round_limit: int = 15 @@ -47,3 +53,11 @@ disable_context_retrieval: bool = False disable_run_test: bool = False + +# whether to only organize output without running tasks +organize_output_only: bool = False + +import os + +if results_path is None: + results_path = os.path.join(output_dir, "results") diff --git a/app/main.py b/app/main.py index 65a1554bd..20235cebc 100644 --- a/app/main.py +++ b/app/main.py @@ -77,12 +77,17 @@ def main(args, subparser_dest_attr_name: str = "command"): globals.output_dir = args.output_dir if globals.output_dir is not None: globals.output_dir = abspath(globals.output_dir) + else: + # Set a default output directory if none is provided + globals.output_dir = abspath("output/swe-factory-runs/default") num_processes: int = int(args.num_processes) # set whether brief or verbose log print_stdout: bool = not args.no_print log.print_stdout = print_stdout # model related common.set_model(args.model) + # Set environment variable for subprocesses (keep original model name for OpenAI-compatible endpoints) + os.environ["SWE_FACTORY_MODEL"] = args.model # FIXME: make temperature part of the Model class common.MODEL_TEMP = args.model_temperature # FIXME: we will remove these hyperparamters, which are from AutoCodeRover, thanks to this work. @@ -98,9 +103,18 @@ def main(args, subparser_dest_attr_name: str = "command"): globals.context_generation_limit = args.output_fix_limit globals.setup_dir = args.setup_dir + if globals.setup_dir is not None: + globals.setup_dir = abspath(globals.setup_dir) + else: + # Set a default setup directory if none is provided + globals.setup_dir = abspath("output/swe-factory-runs/testbed") globals.organize_output_only = args.organize_output_only - globals.results_path = args.results_path + # Set a default results_path if none is provided + if args.results_path is None: + globals.results_path = None # Let AgentsManager handle the default + else: + globals.results_path = args.results_path globals.disable_memory_pool = args.disable_memory_pool globals.disable_run_test = args.disable_run_test @@ -225,6 +239,17 @@ def set_local_parser_args(parser: ArgumentParser) -> None: "--local-repo", type=str, help="Path to a local copy of the target repo." 
) parser.add_argument("--issue-file", type=str, help="Path to a local issue file.") + parser.add_argument( + "--setup-dir", + type=str, + help="The directory where repositories should be cloned to.", + ) + parser.add_argument( + "--results-path", + type=str, + default=None, + help="The directory where results should be saved.", + ) def add_task_related_args(parser: ArgumentParser) -> None: @@ -247,6 +272,9 @@ def model_parser(name: str): return name if name.startswith("litellm-generic-"): return name + # Allow direct model names (like google/gemini-2.5-flash) that contain "/" + if "/" in name: + return name raise TypeError(f"Invalid model name: {name}") parser.add_argument( @@ -429,7 +457,7 @@ def make_swe_tasks( setup_info = {} task_info = tasks_map[task_id] task_start_time_s = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - repo_cache_name = f'{task_info['repo']}_cache' + repo_cache_name = f"{task_info['repo']}_cache" repo_cache_dir = pjoin(setup_dir,repo_cache_name) if not os.path.isdir(repo_cache_dir): github_link = f"https://github.com/{task_info['repo']}.git" @@ -632,12 +660,25 @@ def run_raw_task( Returns: Whether the task completed successfully. """ + # Set the model in the subprocess to ensure SELECTED_MODEL is available + # We need to get the model name from the current process environment or use a default + model_name = os.getenv("SWE_FACTORY_MODEL", "gpt-3.5-turbo-0125") + common.set_model(model_name) + task_id = task.task_id start_time_s = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # task_output_dir = pjoin(globals.output_dir, f"{task_id}_{start_time_s}") + + # Ensure globals.output_dir is set + if not globals.output_dir: + globals.output_dir = abspath("output/swe-factory-runs/default") + task_output_dir = pjoin(globals.output_dir, f"{task_id}") + # Ensure task_output_dir is absolute + task_output_dir = os.path.abspath(task_output_dir) + status_file = pjoin(task_output_dir, "status.json") if os.path.exists(status_file): log.log_and_always_print(f"Status file already exists for task {task_id}, skipping execution") @@ -679,10 +720,34 @@ def do_inference( print_callback: Callable[[dict], None] | None = None, ) -> bool: client = docker.from_env() + + # Ensure task_output_dir is absolute + task_output_dir = os.path.abspath(task_output_dir) + apputils.create_dir_if_not_exists(task_output_dir) # github_link = f'https://github.com/{python_task.repo_name}.git' - commit_hash = python_task.commit - apputils.clone_repo_and_checkout(python_task.repo_cache_path,commit_hash,python_task.project_path) + # Handle both SweTask and PlainTask + if hasattr(python_task, 'commit'): + commit_hash = python_task.commit + elif hasattr(python_task, 'commit_hash'): + commit_hash = python_task.commit_hash + else: + raise AttributeError(f"Task object {type(python_task)} has no commit or commit_hash attribute") + + # For PlainTask, create a working directory in the testbed + if hasattr(python_task, 'repo_cache_path'): + repo_cache_path = python_task.repo_cache_path + else: + # Ensure globals.setup_dir is set + if not globals.setup_dir: + globals.setup_dir = abspath("output/swe-factory-runs/testbed") + + # Create a working directory in the testbed to avoid deleting the original repo + working_dir = pjoin(globals.setup_dir, f"{python_task.project_path.split('/')[-1]}_working") + repo_cache_path = python_task.project_path + python_task.project_path = working_dir + + apputils.clone_repo_and_checkout(repo_cache_path, commit_hash, python_task.project_path) logger.add( pjoin(task_output_dir, "info.log"), 
level="DEBUG", @@ -713,7 +778,7 @@ def do_inference( dump_cost(start_time, end_time, task_output_dir, python_task.project_path) finally: # python_task.reset_project() - python_task.remove_project() + # python_task.remove_project() # Commented out to prevent deleting original repository if client: client.close() @@ -723,6 +788,9 @@ def do_inference( def dump_cost( start_time: datetime, end_time: datetime, task_output_dir: str, project_path: str ): + # Ensure task_output_dir is absolute + task_output_dir = os.path.abspath(task_output_dir) + with apputils.cd(project_path): commit_hash = apputils.get_current_commit_hash() model_stats = common.SELECTED_MODEL.get_overall_exec_stats() diff --git a/app/model/common.py b/app/model/common.py index 76a698ba9..e8b8e483f 100644 --- a/app/model/common.py +++ b/app/model/common.py @@ -5,8 +5,8 @@ from typing import Literal import litellm -from litellm import cost_per_token -from litellm.utils import Choices, Message, ModelResponse +from litellm.cost_calculator import cost_per_token +from litellm.types.utils import Choices, Message, ModelResponse from openai import BadRequestError from tenacity import retry, stop_after_attempt, wait_random_exponential @@ -131,26 +131,48 @@ def call( ): # FIXME: ignore tools field since we don't use tools now try: + # Enable verbose logging for debugging - os is already imported at the top of the file + prefill_content = "{" if response_format == "json_object": # prefill messages.append({"role": "assistant", "content": prefill_content}) + max_tokens_val = os.getenv("ACR_TOKEN_LIMIT", "1024") + max_tokens_int = int(max_tokens_val) if max_tokens_val else 1024 + + # Check if we need to use a custom API base + api_base = os.getenv("LITELLM_API_BASE", None) + extra_kwargs = {} + if api_base: + print(f"Using custom API base: {api_base}") + print(f"Model name: {self.name}") + extra_kwargs["api_base"] = api_base + # Force OpenAI provider when using custom endpoint + extra_kwargs["custom_llm_provider"] = "openai" + # Set additional headers if needed + extra_kwargs["headers"] = {"Content-Type": "application/json"} + response = litellm.completion( model=self.name, messages=messages, temperature=MODEL_TEMP, - max_tokens=os.getenv("ACR_TOKEN_LIMIT", 1024), + max_tokens=max_tokens_int, response_format=( {"type": response_format} if "gpt" in self.name else None ), top_p=top_p, stream=False, + **extra_kwargs ) assert isinstance(response, ModelResponse) - resp_usage = response.usage - assert resp_usage is not None - input_tokens = int(resp_usage.prompt_tokens) - output_tokens = int(resp_usage.completion_tokens) + resp_usage = getattr(response, 'usage', None) + if resp_usage is None: + # Fallback if usage is not available + input_tokens = 0 + output_tokens = 0 + else: + input_tokens = int(resp_usage.prompt_tokens) + output_tokens = int(resp_usage.completion_tokens) cost = self.calc_cost(input_tokens, output_tokens) thread_cost.process_cost += cost @@ -193,9 +215,37 @@ def get_all_model_names(): def set_model(model_name: str): global SELECTED_MODEL if model_name not in MODEL_HUB and not model_name.startswith("litellm-generic-"): - print(f"Invalid model name: {model_name}") - sys.exit(1) - if model_name.startswith("litellm-generic-"): + # Handle direct model names (like google/gemini-2.5-flash) as OpenAI-compatible models + if "/" in model_name: # This looks like a direct model name + # Don't transform the model name - use it as-is for OpenAI-compatible endpoints + real_model_name = model_name + print(f"Using {model_name} as OpenAI-compatible 
model with custom endpoint") + + prompt_tokens = 5 + completion_tokens = 10 + try: + prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = ( + cost_per_token( + model=real_model_name, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + except Exception as e: + # If cost calculation fails, use default values + print(f"Warning: Could not calculate costs for {model_name}, using defaults") + prompt_tokens_cost_usd_dollar = 0.00000015 # Default cost + completion_tokens_cost_usd_dollar = 0.0000006 # Default cost + + SELECTED_MODEL = LiteLLMGeneric( + real_model_name, + prompt_tokens_cost_usd_dollar, + completion_tokens_cost_usd_dollar, + ) + else: + print(f"Invalid model name: {model_name}") + sys.exit(1) + elif model_name.startswith("litellm-generic-"): real_model_name = model_name.removeprefix("litellm-generic-") prompt_tokens = 5 completion_tokens = 10 diff --git a/app/task.py b/app/task.py index ccc8385c1..f74785b67 100644 --- a/app/task.py +++ b/app/task.py @@ -107,11 +107,24 @@ class PlainTask(Task): commit_hash: str local_path: str problem_statement: str + patch: str = "" + test_patch: str = "" + repo_name: str = "" + version: str = "" + task_id: str = "" + + @property + def commit(self) -> str: + return self.commit_hash @property def project_path(self) -> str: return self.local_path + @project_path.setter + def project_path(self, value: str) -> None: + self.local_path = value + def setup_project(self) -> None: with apputils.cd(self.project_path): apputils.repo_reset_and_clean_checkout(self.commit_hash) diff --git a/debug_file_locations.py b/debug_file_locations.py new file mode 100644 index 000000000..a14f8b2e8 --- /dev/null +++ b/debug_file_locations.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Debug script to help identify where files are being written during execution. +This script will check for files in the current directory and compare with expected output directory. +""" + +import os +import sys +import time +from pathlib import Path +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def get_files_in_directory(directory): + """Get all files in a directory recursively.""" + files = [] + for root, dirs, filenames in os.walk(directory): + for filename in filenames: + file_path = os.path.join(root, filename) + files.append(file_path) + return files + +def monitor_file_creation(output_dir, check_interval=5, duration=300): + """ + Monitor file creation by periodically checking for new files. 
+ + Args: + output_dir: The expected output directory + check_interval: How often to check for new files (in seconds) + duration: How long to monitor (in seconds) + """ + output_dir = os.path.abspath(output_dir) + current_dir = os.getcwd() + + logger.info(f"Starting file creation monitoring for {duration} seconds...") + logger.info(f"Expected output directory: {output_dir}") + logger.info(f"Current working directory: {current_dir}") + + # Get initial list of files + initial_files = set() + if os.path.exists(output_dir): + initial_files = set(get_files_in_directory(output_dir)) + + # Also check current directory for any files that might be created there + current_dir_files = set() + if os.path.exists(current_dir): + current_dir_files = set(get_files_in_directory(current_dir)) + + start_time = time.time() + check_count = 0 + + while time.time() - start_time < duration: + check_count += 1 + logger.info(f"Check #{check_count} - Time elapsed: {time.time() - start_time:.1f}s") + + # Check for new files in output directory + if os.path.exists(output_dir): + current_files = set(get_files_in_directory(output_dir)) + new_files = current_files - initial_files + if new_files: + logger.info(f"New files in output directory ({len(new_files)}):") + for file_path in sorted(new_files): + logger.info(f" + {file_path}") + initial_files = current_files + + # Check for new files in current directory + if os.path.exists(current_dir): + current_files = set(get_files_in_directory(current_dir)) + new_files = current_files - current_dir_files + if new_files: + logger.warning(f"WARNING: New files in current directory ({len(new_files)}):") + for file_path in sorted(new_files): + logger.warning(f" ! {file_path}") + if not file_path.startswith(output_dir): + logger.error(f"ERROR: File created outside expected output directory!") + logger.error(f" File: {file_path}") + logger.error(f" Expected: {output_dir}") + current_dir_files = current_files + + time.sleep(check_interval) + + logger.info("File creation monitoring completed.") + logger.info(f"Total checks performed: {check_count}") + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python debug_file_locations.py [duration_seconds] [check_interval_seconds]") + sys.exit(1) + + output_dir = sys.argv[1] + duration = int(sys.argv[2]) if len(sys.argv) > 2 else 300 + check_interval = int(sys.argv[3]) if len(sys.argv) > 3 else 5 + + monitor_file_creation(output_dir, check_interval, duration) \ No newline at end of file diff --git a/evaluation/docker_build.py b/evaluation/docker_build.py index 53db57a0c..15bda4666 100644 --- a/evaluation/docker_build.py +++ b/evaluation/docker_build.py @@ -603,6 +603,13 @@ def build_container( command="tail -f /dev/null", nano_cpus=nano_cpus, platform=test_spec.platform, + log_config={ + "Type": "json-file", + "Config": { + "max-size": "10m", + "max-file": "3" + } + } ) logger.info(f"Container for {test_spec.instance_id} created: {container.id}") @@ -659,6 +666,13 @@ def build_setup_container( command="tail -f /dev/null", # nano_cpus=nano_cpus, platform=test_spec.platform, + log_config={ + "Type": "json-file", + "Config": { + "max-size": "10m", + "max-file": "3" + } + } ) logger.info(f"Container for {test_spec.instance_id} created: {container.id}") diff --git a/requirements.txt b/requirements.txt index 183b61de0..0c9defb7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -127,7 +127,7 @@ tree-sitter-c==0.21.4 tree-sitter-cpp==0.22.2 tree-sitter-java==0.21.0 tree-sitter-languages==1.10.2 -triton==2.2.0 +# triton==2.2.0 
types-jsonschema==4.21.0.20240311 typing_extensions tzdata==2025.2 diff --git a/test_results_path_fix.py b/test_results_path_fix.py new file mode 100644 index 000000000..507ea2fc7 --- /dev/null +++ b/test_results_path_fix.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +Test script to verify that results.json files are no longer created in the root directory +when results_path is None. +""" + +import os +import sys +import tempfile +import shutil +from pathlib import Path + +# Add the app directory to the Python path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'app')) + +from agents.agents_manager import AgentsManager +from task import Task +from datetime import datetime +import docker + +class MockTask(Task): + """Mock task for testing""" + def __init__(self): + self.repo_name = "test-repo" + self.commit = "test-commit" + self.version = "test-version" + self.test_patch = "test-patch" + self._project_path = "/tmp/test-project" + + @property + def project_path(self) -> str: + return self._project_path + + @project_path.setter + def project_path(self, value: str) -> None: + self._project_path = value + + def get_issue_statement(self) -> str: + return "Test issue statement" + + def setup_project(self) -> None: + pass + + def reset_project(self) -> None: + pass + +def test_results_path_fix(): + """Test that results.json is not created in root directory when results_path is None""" + + # Create a temporary directory for testing + with tempfile.TemporaryDirectory() as temp_dir: + print(f"Testing in temporary directory: {temp_dir}") + + # Create a mock task + task = MockTask() + + # Create output directory + output_dir = os.path.join(temp_dir, "output") + os.makedirs(output_dir, exist_ok=True) + + # Test with results_path=None (the problematic case) + try: + # This should create results.json in output/results/ instead of root + agents_manager = AgentsManager( + task=task, + output_dir=output_dir, + client=docker.from_env(), + start_time=datetime.now(), + max_iteration_num=1, + results_path=None, # This was causing the issue + disable_memory_pool=False, + disable_context_retrieval=False, + disable_run_test=False + ) + + # Check if results.json was created in the expected location + expected_results_file = os.path.join(output_dir, "results", "results.json") + expected_lock_file = expected_results_file + ".lock" + + if os.path.exists(expected_results_file): + print(f"✅ SUCCESS: results.json created in expected location: {expected_results_file}") + else: + print(f"❌ FAILED: results.json not found in expected location: {expected_results_file}") + return False + + if os.path.exists(expected_lock_file): + print(f"✅ SUCCESS: results.json.lock created in expected location: {expected_lock_file}") + else: + print(f"❌ FAILED: results.json.lock not found in expected location: {expected_lock_file}") + return False + + # Check that files are NOT created in the current working directory + root_results_file = "results.json" + root_lock_file = "results.json.lock" + + if not os.path.exists(root_results_file): + print(f"✅ SUCCESS: results.json NOT created in root directory") + else: + print(f"❌ FAILED: results.json still created in root directory") + return False + + if not os.path.exists(root_lock_file): + print(f"✅ SUCCESS: results.json.lock NOT created in root directory") + else: + print(f"❌ FAILED: results.json.lock still created in root directory") + return False + + print("🎉 All tests passed! 
The fix is working correctly.") + return True + + except Exception as e: + print(f"❌ ERROR: Test failed with exception: {e}") + return False + +if __name__ == "__main__": + success = test_results_path_fix() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/tests/test_docker_name_fix.py b/tests/test_docker_name_fix.py new file mode 100644 index 000000000..d2e5a3b24 --- /dev/null +++ b/tests/test_docker_name_fix.py @@ -0,0 +1,44 @@ +import re +import pytest + +def sanitize_docker_image_name(name: str) -> str: + if not name or not str(name).strip(): + return "swe-task" + sanitized = re.sub(r'[^a-z0-9_-]', '-', str(name).lower().strip()) + if sanitized.startswith('-'): + sanitized = 'swe' + sanitized + sanitized = sanitized.rstrip('-').rstrip('_').lstrip('_') + if not sanitized or sanitized == "swe": + return "swe-task" + if len(sanitized) > 50: + sanitized = sanitized[:50] + return sanitized + +def test_sanitize_docker_image_name(): + test_cases = [ + ("", "swe-task"), + (None, "swe-task"), + (" ", "swe-task"), + ("kareldb-connection-1", "kareldb-connection-1"), + ("My Project", "my-project"), + ("test@example.com", "test-example-com"), + ("-dockerfile1", "swe-dockerfile1"), + ("-test", "swe-test"), + ("---", "swe-task"), + ("project@#$%^&*()", "project"), + ("CamelCase", "camelcase"), + ("snake_case", "snake_case"), + ("kebab-case", "kebab-case"), + ("a" * 100, "a" * 50), + ("___test___", "test"), + ] + for input_name, expected in test_cases: + result = sanitize_docker_image_name(input_name) + assert result == expected, f"Input: {input_name!r} -> {result!r} (expected: {expected!r})" + +def test_image_name_generation(): + problematic_task_id = "" + sanitized_task_id = sanitize_docker_image_name(problematic_task_id) + setup_dockerfile_num = 1 + image_name = f"{sanitized_task_id}-dockerfile{setup_dockerfile_num}:latest" + assert not image_name.startswith('-'), f"Image name should not start with hyphen: {image_name}" \ No newline at end of file
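A quick way to sanity-check the logging change once this patch is applied: the sketch below is illustrative only (not part of the patch) and uses the Docker SDK for Python to create a throwaway container with the same `log_config` dict that the patched `build_container()` passes, then asserts that the container reports the `json-file` log driver. The image and container names are placeholders; any small image already present locally will do.

```python
# Illustrative verification of the log_config change; the image and container
# names below are placeholders, not values used by SWE-Factory itself.
import docker

client = docker.from_env()
container = client.containers.create(
    image="alpine:3.19",          # assumes a small image is already available locally
    name="logconfig-smoke-test",  # placeholder container name
    command="tail -f /dev/null",
    detach=True,
    log_config={
        "Type": "json-file",
        "Config": {"max-size": "10m", "max-file": "3"},
    },
)
try:
    # container.attrs mirrors `docker inspect`; with the patch applied the
    # HostConfig.LogConfig entry should report json-file rather than none.
    log_cfg = container.attrs["HostConfig"]["LogConfig"]
    assert log_cfg["Type"] == "json-file", log_cfg
    print("LogConfig:", log_cfg)
finally:
    container.remove(force=True)
```

With the old `"Type": "none"` setting, `docker logs` on such a container errors out because the `none` driver does not support reading logs; after this change the same command returns the captured output.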