diff --git a/evaluation/aider_bench/README.md b/evaluation/aider_bench/README.md
index d3d93d29d28a..b3d80ddf6af5 100644
--- a/evaluation/aider_bench/README.md
+++ b/evaluation/aider_bench/README.md
@@ -59,25 +59,43 @@ You can update the arguments in the script
 ## Summarize Results
 
 ```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
 # with optional SKIP_NUM
-poetry run python SKIP_NUM=12 ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+poetry run python SKIP_NUM=12 ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
 ```
 
 Full example:
 
 ```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620
 ```
 
 This will list the instances that passed and the instances that failed. For
 each instance, the corresponding set of test cases (which can vary for each
 instance) are run on the file edited by the agent. We consider an instance to
 be passed only if ALL test cases are passed. Sometimes even a single failed test case will
-cause the entire instance to be marked as filed.
+cause the entire instance to be marked as failed.
 
-You can inspect the test_results field in the output json file to know the exact
+You can inspect the `test_results` field in the `output.jsonl` file to find the exact
 outcome of the tests. If there are no syntax or indentation errors, you can
-expect to see something like "..F...EF..", where "." means the test case
-passed, "E" means there was an error while executing the test case and "F"
-means some assertion failed and returned output was not as expected.
+expect to see something like "`..F...EF..`", where "`.`" means the test case
+passed, "`E`" means there was an error while executing the test case and "`F`"
+means some assertion failed and some returned output was not as expected.
+
+## Visualization
+
+If the required Python libraries are installed (`matplotlib` and `seaborn`),
+the `summarize_results.py` script will also save two histograms to
+the output folder.
+
+### Cost Histogram
+
+The cost histogram shows the number of successful and failed instances in each cost bin.
+
+![Cost Histogram](./examples/cost_histogram.png)
+
+### Actions Histogram
+
+The actions histogram shows the number of successful and failed instances by number of actions taken.
+
+![Actions Histogram](./examples/actions_histogram.png)
diff --git a/evaluation/aider_bench/examples/actions_histogram.png b/evaluation/aider_bench/examples/actions_histogram.png
new file mode 100644
index 000000000000..894c60b83381
Binary files /dev/null and b/evaluation/aider_bench/examples/actions_histogram.png differ
diff --git a/evaluation/aider_bench/examples/cost_histogram.png b/evaluation/aider_bench/examples/cost_histogram.png
new file mode 100644
index 000000000000..da6251da3497
Binary files /dev/null and b/evaluation/aider_bench/examples/cost_histogram.png differ
diff --git a/evaluation/aider_bench/scripts/summarize_results.py b/evaluation/aider_bench/scripts/summarize_results.py
index 0c4ea5bd5214..89872bee7372 100644
--- a/evaluation/aider_bench/scripts/summarize_results.py
+++ b/evaluation/aider_bench/scripts/summarize_results.py
@@ -1,6 +1,36 @@
 import json
+import os
 import sys
 
+import numpy as np
+import pandas as pd
+
+# Try to import visualization libraries
+visualization_available = False
+try:
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    visualization_available = True
+except ImportError:
+    print(
+        '\n*** WARNING: libraries matplotlib and/or seaborn are not installed.\n*** Visualization will not be available!\n'
+    )
+
+
+def show_usage():
+    print(
+        'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file> <model_name>'
+    )
+    print(
+        'Example:\npoetry run python summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620\n'
+    )
+
+
+def print_error(message: str):
+    print(f'\n***\n*** ERROR: {message}\n***\n')
+    show_usage()
+
 
 def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
     passed = []
@@ -19,19 +49,87 @@ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
     return passed, failed
 
 
+def visualize_results(json_file_path: str, model: str, output_dir: str):
+    # based on a Colab notebook by RajMaheshwari
+    with open(json_file_path, 'r') as f:
+        data = [json.loads(line) for line in f]
+
+    df = pd.DataFrame.from_records(data)
+
+    df1 = pd.DataFrame()
+    df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
+    df1['result'] = (
+        df['test_result'].apply(pd.Series)['exit_code'].map({0: 'Pass', 1: 'Fail'})
+    )
+    df1['actions'] = pd.Series([len(a) - 1 for a in df['history']])
+
+    passed = np.sum(df1['result'] == 'Pass')
+    total = df.shape[0]
+    resolve_rate = round((passed / total) * 100, 2)
+
+    print('Number of passed tests:', f'{passed}/{total}')
+
+    if not visualization_available:
+        return resolve_rate
+
+    # Cost histogram
+    plt.figure(figsize=(10, 6))
+    bins = 10
+    mx = pd.Series.max(df1['cost'])
+    g = sns.histplot(df1, x='cost', bins=bins, hue='result', multiple='stack')
+    x_ticks = np.around(np.linspace(0, mx, bins + 1), 3)
+    g.set_xticks(x_ticks)
+    g.set_xlabel('Cost in $')
+    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, 'cost_histogram.png'))
+    plt.close()
+
+    # Actions histogram
+    plt.figure(figsize=(10, 6))
+    bins = np.arange(0, 31, 2)
+    g = sns.histplot(df1, x='actions', bins=bins, hue='result', multiple='stack')
+    g.set_xticks(bins)
+    g.set_xlabel('# of actions')
+    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
+    plt.tight_layout()
+    plt.savefig(os.path.join(output_dir, 'actions_histogram.png'))
+    plt.close()
+
+    return resolve_rate
+
+
 if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print(
-            'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file>'
-        )
+    if len(sys.argv) != 3:
+        print_error('Argument(s) missing!')
         sys.exit(1)
+
     json_file_path = sys.argv[1]
+    model_name = sys.argv[2]
+
+    if not os.path.exists(json_file_path):
+        print_error('Output file does not exist!')
+        sys.exit(1)
+    if not os.path.isfile(json_file_path):
+        print_error('Path-to-output-file is not a file!')
+        sys.exit(1)
+
+    output_dir = os.path.dirname(json_file_path)
+    if not os.access(output_dir, os.W_OK):
+        print_error('Output folder is not writable!')
+        sys.exit(1)
+
     passed_tests, failed_tests = extract_test_results(json_file_path)
-    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+    resolve_rate = visualize_results(json_file_path, model_name, output_dir)
+
     print(
-        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
     )
     print('PASSED TESTS:')
     print(passed_tests)
     print('FAILED TESTS:')
     print(failed_tests)
+    print(
+        '\nVisualization results were saved as cost_histogram.png and actions_histogram.png'
+    )
+    print('in folder: ', output_dir)
diff --git a/poetry.lock b/poetry.lock
index 380b19175148..28992b1f4d06 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -9477,4 +9477,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "29b18fdfb3e2254ca0ef596583ac8befd0d37d76e2aaba279d73e51761972f68"
+content-hash = "b7a2c28cf99b0e85de3148ab3edbeaf1e721ad8430f8c57cb0cc7f6ccafc5666"
diff --git a/pyproject.toml b/pyproject.toml
index 3ad9de127218..7fe5df88900a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,7 +84,6 @@ reportlab = "*"
 
 [tool.coverage.run]
 concurrency = ["gevent"]
-
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -115,7 +114,6 @@ ignore = ["D1"]
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
-
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
@@ -125,3 +123,5 @@ swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" }
 func_timeout = "*"
 sympy = "*"
 gdown = "*"
+matplotlib = "*"
+seaborn = "*"
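
A note for reviewers: the `.apply(pd.Series)` chaining in the new `visualize_results` function can be hard to read in diff form. The sketch below shows, on two hypothetical `output.jsonl`-style records (real entries carry many more fields, and the values here are made up), how the per-instance `cost`, `result`, and `actions` columns are derived.

```python
import pandas as pd

# Two hypothetical records with only the fields visualize_results() touches.
records = [
    {'metrics': {'accumulated_cost': 0.12}, 'test_result': {'exit_code': 0}, 'history': [{}, {}, {}]},
    {'metrics': {'accumulated_cost': 0.40}, 'test_result': {'exit_code': 1}, 'history': [{}, {}, {}, {}]},
]
df = pd.DataFrame.from_records(records)

df1 = pd.DataFrame()
# .apply(pd.Series) expands each nested dict into its own columns,
# so the inner fields can be selected by name.
df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
df1['result'] = df['test_result'].apply(pd.Series)['exit_code'].map({0: 'Pass', 1: 'Fail'})
# Same `len(...) - 1` count per instance as in the patch.
df1['actions'] = pd.Series([len(h) - 1 for h in df['history']])

print(df1)  # columns: cost, result ('Pass'/'Fail'), actions
```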
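Similarly, here is a stripped-down sketch of the stacked-histogram call the script uses for both plots, with made-up data and without the model/resolve-rate title, in case anyone wants to try the plot style in isolation (requires `matplotlib` and `seaborn`):

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Made-up per-instance costs and outcomes standing in for df1 in the script.
df1 = pd.DataFrame(
    {
        'cost': [0.05, 0.09, 0.12, 0.21, 0.30, 0.44],
        'result': ['Pass', 'Pass', 'Pass', 'Fail', 'Pass', 'Fail'],
    }
)

bins = 10
plt.figure(figsize=(10, 6))
# multiple='stack' stacks the Pass/Fail counts within each cost bin.
g = sns.histplot(df1, x='cost', bins=bins, hue='result', multiple='stack')
g.set_xticks(np.around(np.linspace(0, df1['cost'].max(), bins + 1), 3))
g.set_xlabel('Cost in $')
plt.tight_layout()
plt.savefig('cost_histogram.png')
plt.close()
```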