feat: swe bench harness (#590)
# Motivation
Adds a SWE Bench Harness to the codegen agent.

# Content
- Loads the SWE Bench dataset
- For each entry in the dataset, a Modal instance is created where an agent can run
- The output of each agent is stored and evaluated on Modal using `swebench`
- Documentation added to the README

Contributions from:
- @victorxheng : #521

# Please check the following before marking your PR as ready for review
- [x] I have updated the documentation or added new documentation as
needed

---------

Co-authored-by: jemeza-codegen <[email protected]>
jayhack and jemeza-codegen authored Feb 21, 2025
1 parent f4a8327 commit 410ee85
Showing 15 changed files with 942 additions and 13 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -65,3 +65,9 @@ graph-sitter-types/typings/**
coverage.json
tests/integration/verified_codemods/codemod_data/repo_commits.json
.benchmarks/*

# SWE Bench results
results.*.json
codegen-examples/examples/swebench_agent_run/results/*
codegen-examples/examples/swebench_agent_run/predictions/*
codegen-examples/examples/swebench_agent_run/logs/*
4 changes: 4 additions & 0 deletions codegen-examples/examples/swebench_agent_run/.env.template
@@ -0,0 +1,4 @@
OPENAI_API_KEY= # Your OpenAI API key
ANTHROPIC_API_KEY= # Your Anthropic API key
LANGSMITH_API_KEY= # Your Langsmith API key
LANGCHAIN_TRACING_V2= # `true` for tracing, `false` for no tracing
33 changes: 33 additions & 0 deletions codegen-examples/examples/swebench_agent_run/README.md
@@ -0,0 +1,33 @@
# INSTRUCTIONS

1. Create a `.env` file in the root directory and add your API keys.

1. `cd` into the `codegen-examples/examples/swebench_agent_run` directory

1. Create a `.venv` with `uv venv` and activate it with `source .venv/bin/activate`

1. Install the codegen dependencies with `uv add codegen`

- Note: If you'd like to install the dependencies in the global environment, you can use `uv pip install -e ../../../`. This will allow you to test modifications to the codegen codebase. You will need to run `uv pip install -e ../../../` each time you make changes to the codebase.

1. Ensure that you have a Modal account and profile set up. If you don't have one, you can create one at https://modal.com/

1. Activate the appropriate Modal profile with `uv run modal profile activate <profile_name>`

1. Launch the Modal app with `uv run modal deploy --env=<env_name> entry_point.py`

1. Run the evaluation with `python run_eval.py` and the desired options (see the sample invocation after this list):

- ```bash
$ python run_eval.py --help
Usage: run_eval.py [OPTIONS]

Options:
--use-existing-preds Use existing predictions instead of
generating new ones.
--dataset [princeton-nlp/SWE-bench_Lite|princeton-nlp/SWE-bench|princeton-nlp/SWE-bench-verified]
The dataset to use.
--length INTEGER The number of examples to process.
--instance-id TEXT The instance ID of the example to process.
--help Show this message and exit.
```
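
For example, an illustrative invocation over a handful of SWE-bench Lite examples:

```bash
# Generate predictions for 5 SWE-bench Lite examples, then evaluate and report
python run_eval.py --dataset princeton-nlp/SWE-bench_Lite --length 5
```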
19 changes: 19 additions & 0 deletions codegen-examples/examples/swebench_agent_run/entry_point.py
@@ -0,0 +1,19 @@
from codegen.extensions.swebench.utils import SweBenchExample
from codegen.extensions.swebench.harness import run_agent_on_entry
import modal

# Build the Modal image: Debian slim with git, FastAPI, and an editable install of the local codegen repo
image = (
modal.Image.debian_slim(python_version="3.13")
.apt_install("git")
.pip_install("fastapi[standard]")
.copy_local_dir("../../../", "/root/codegen", ignore=[".venv", "**/.venv", "tests", "**/tests"])
.run_commands("pip install -e /root/codegen")
)

app = modal.App(name="swebench-agent-run", image=image, secrets=[modal.Secret.from_dotenv()])


@app.function(timeout=5 * 60)
async def run_agent_modal(entry: SweBenchExample):
"""Modal function to process a single example from the SWE-bench dataset."""
return run_agent_on_entry(entry)
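
Once the app is deployed, this function can also be invoked directly outside the batch runner — a minimal sketch, assuming `modal deploy entry_point.py` has been run and the helpers from this commit are installed locally:

```python
# Sketch: look up the deployed Modal function and run it on a single example.
import modal

from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

run_agent_modal = modal.Function.lookup("swebench-agent-run", "run_agent_modal")
example = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, length=1)[0]
print(run_agent_modal.remote(example))  # blocks until the agent finishes on this example
```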
7 changes: 7 additions & 0 deletions codegen-examples/examples/swebench_agent_run/pyproject.toml
@@ -0,0 +1,7 @@
[project]
name = "swebench-agent-run"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12, <3.14"
dependencies = []
168 changes: 168 additions & 0 deletions codegen-examples/examples/swebench_agent_run/run_eval.py
@@ -0,0 +1,168 @@
import asyncio
import json
import traceback
from pathlib import Path
import modal
import click
from datetime import datetime
from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_example, get_swe_bench_examples
from codegen.extensions.swebench.report import generate_report

PREDS_DNAME = Path(__file__).parent / "predictions"
LOG_DIR = Path(__file__).parent / "logs"

# Look up the Modal function deployed via `modal deploy entry_point.py`
run_agent_modal = modal.Function.lookup("swebench-agent-run", "run_agent_modal")


async def process_batch(examples, batch_size=10):
"""Process a batch of examples concurrently.
Args:
examples: List of SweBenchExample objects to process
batch_size: Number of examples to process concurrently.
Default is 10, which provides good parallelization
while staying well within Modal's limits.
"""
results = []

# Process examples in batches
for i in range(0, len(examples), batch_size):
batch = examples[i : i + batch_size]

# Create tasks for this batch
batch_tasks = [run_agent_modal.remote.aio(example) for example in batch]

# Wait for all tasks in this batch to complete
print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})")

try:
batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)

# Store results
for example, result in zip(batch, batch_results):
error_info = None

if isinstance(result, Exception):
error_type = type(result).__name__
error_info = {
"error_type": error_type,
"error_message": str(result),
"traceback": traceback.format_exception(type(result), result, result.__traceback__),
}

if isinstance(result, modal.exception.Error):
error_info["modal_error_code"] = getattr(result, "code", None)
error_info["modal_error_details"] = getattr(result, "details", None)

print(f"Error processing {example.instance_id}:")
print(f"Type: {error_type}")
print(f"Message: {str(result)}")
print("Traceback:")
print("".join(error_info["traceback"]))

results.append({"instance_id": example.instance_id, "status": "error", "error_info": error_info})
else:
if result is None:
print(f"Warning: Null result for {example.instance_id}")
results.append({"instance_id": example.instance_id, "status": "error", "error_info": {"error_type": "NullResult", "error_message": "Process returned None"}})
else:
results.append(result)

except Exception as e:
print("Batch processing error:")
print(f"Type: {type(e).__name__}")
print(f"Message: {str(e)}")
traceback.print_exc()

# Mark all examples in the batch as failed
for example in batch:
results.append(
{
"instance_id": example.instance_id,
"status": "error",
"error_info": {"error_type": type(e).__name__, "error_message": str(e), "traceback": traceback.format_exc(), "batch_failure": True},
}
)

return results


async def run_eval(use_existing_preds, dataset, length, instance_id=None):
dataset = SWEBenchDataset(dataset)
if instance_id:
examples = [get_swe_bench_example(instance_id, dataset=dataset)]
else:
examples = get_swe_bench_examples(dataset=dataset, length=length)

try:
if not use_existing_preds:
print(f"Processing {len(examples)} examples...")

# Create output directory if it doesn't exist
PREDS_DNAME.mkdir(exist_ok=True)
results_dir = PREDS_DNAME / "results"
results_dir.mkdir(exist_ok=True)

# Create a timestamp for this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Process all examples in parallel batches
results = await process_batch(examples)

# Save individual results
for result in results:
if result and "instance_id" in result:
instance_id = result["instance_id"]
output_file = results_dir / f"{instance_id}.json"
with open(output_file, "w") as f:
json.dump(result, f, indent=4)

# Save summary file
summary_file = results_dir / f"summary_{timestamp}.json"
summary = {
"timestamp": timestamp,
"total_examples": len(examples),
"successful": len([r for r in results if r and "status" not in r]),
"failed": len([r for r in results if r and "status" in r and r["status"] == "error"]),
"error_types": {},
"results": results,
}

# Collect error statistics
for result in results:
if result and "status" in result and result["status"] == "error":
error_type = result.get("error_info", {}).get("error_type", "Unknown")
summary["error_types"][error_type] = summary["error_types"].get(error_type, 0) + 1

with open(summary_file, "w") as f:
json.dump(summary, f, indent=4)

print("\nProcessing complete!")
print(f"Results saved to: {results_dir}")
print(f"Summary saved to: {summary_file}")
print(f"Successful: {summary['successful']}/{summary['total_examples']}")
print(f"Failed: {summary['failed']}/{summary['total_examples']}")
if summary["error_types"]:
print("\nError type distribution:")
for error_type, count in summary["error_types"].items():
print(f" {error_type}: {count}")

# Generate Report on Modal
generate_report(PREDS_DNAME, LOG_DIR, dataset)
except Exception:
print("Fatal error in run_eval:")
traceback.print_exc()
raise


@click.command()
@click.option("--use-existing-preds", is_flag=True, help="Use existing predictions instead of generating new ones.")
@click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value)
@click.option("--length", help="The number of examples to process.", type=int, default=10)
@click.option("--instance-id", help="The instance ID of the example to process.")
def run_eval_command(use_existing_preds, dataset, length, instance_id):
asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))


if __name__ == "__main__":
run_eval_command()
3 changes: 3 additions & 0 deletions codegen-examples/pyproject.toml
@@ -31,6 +31,9 @@ dev-dependencies = [
"deptry>=0.22.0",
]

[tool.uv.workspace]
members = ["examples/swebench_agent_run"]

[tool.pre-commit-uv]
requirements = ["strict-requirements"]

12 changes: 11 additions & 1 deletion codegen-examples/uv.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -69,6 +69,7 @@ dependencies = [
"modal>=0.73.45",
"slack-sdk",
"langchain-anthropic>=0.3.7",
"lox>=0.12.0",
]

license = { text = "Apache-2.0" }
29 changes: 29 additions & 0 deletions src/codegen/extensions/swebench/README.md
@@ -0,0 +1,29 @@
## Codegen Harness and Evaluator for the SWE Bench Development Tool

This folder contains a harness and evaluator for SWE Bench. It enables developers to test and evaluate their Codegen agents against the SWE Bench leaderboard.

It integrates directly into the Codegen agentic framework and can be built upon.

### Setup

Remember to install all the dependencies for the environment.

### Usage

#### Edit agent.py, your codegen agent

This file contains the main logic for the agent.

The agent taps into tree-sitter through Codegen. You can modify it by adding additional tools, extending its capabilities, adjusting its prompts, and more.

It is invoked in the harness script.

#### Run harness.py to run the agent

This script will gather the correct dataset, run the agent, and save the results.
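
As a rough sketch of what a single run looks like (using the helpers imported in this commit's `entry_point.py` and `run_eval.py`; the instance ID is only an illustrative example):

```python
# Sketch: run the agent on one SWE-bench Lite example locally (no Modal).
from codegen.extensions.swebench.harness import run_agent_on_entry
from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_example

# Illustrative instance ID; any ID present in the chosen dataset works.
example = get_swe_bench_example("django__django-11099", dataset=SWEBenchDataset.LITE)
prediction = run_agent_on_entry(example)
print(prediction)
```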

#### Run report.py to generate a report

This script generates a report from the results. It loops through all the results and evaluates each one. Note: there is currently an error in the Docker image.
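
A sketch of the equivalent programmatic call, with the signature taken from `run_eval.py` (the paths here are assumptions matching that script's layout):

```python
# Sketch: generate a report over saved predictions and logs.
from pathlib import Path

from codegen.extensions.swebench.report import generate_report
from codegen.extensions.swebench.utils import SWEBenchDataset

# Assumed layout: predictions/ and logs/ directories next to the script.
generate_report(Path("predictions"), Path("logs"), SWEBenchDataset.LITE)
```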

There are currently example predictions in the `predictions/results` folder.