(feat) Add Aider bench output visualizer (All-Hands-AI#3643)
* aider-bench: add visualization to summarize script and readme

* added example cost and actions histogram images for readme

* moved dependencies to evaluation section
tobitege committed Aug 29, 2024
1 parent 717929b commit c875a5f
Showing 6 changed files with 133 additions and 17 deletions.
34 changes: 26 additions & 8 deletions evaluation/aider_bench/README.md
@@ -59,25 +59,43 @@ You can update the arguments in the script
## Summarize Results

```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
# with optional SKIP_NUM
-SKIP_NUM=12 poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+SKIP_NUM=12 poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file] [model_name]
```

Full example:

```bash
-poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620
```

This will list the instances that passed and the instances that failed. For each
instance, the corresponding set of test cases (which can vary for each instance)
is run on the file edited by the agent. We consider an instance to have passed
only if ALL test cases pass; even a single failed test case will
-cause the entire instance to be marked as filed.
+cause the entire instance to be marked as failed.
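
As the script below shows, this pass/fail decision is recorded as an `exit_code` under `test_result` in each `output.jsonl` record (0 when all test cases passed, 1 otherwise). Here is a minimal standalone sketch of the same check, with a hypothetical helper name:

```python
import json

# Sketch: count passed instances straight from output.jsonl, using the
# exit_code convention from summarize_results.py (0 = all test cases passed).
def count_passed_instances(jsonl_path: str) -> tuple[int, int]:
    passed = total = 0
    with open(jsonl_path) as f:
        for line in f:
            record = json.loads(line)
            total += 1
            if record['test_result']['exit_code'] == 0:
                passed += 1
    return passed, total
```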

-You can inspect the test_results field in the output json file to know the exact
+You can inspect the `test_results` field in the `output.jsonl` file to find the exact
outcome of the tests. If there are no syntax or indentation errors, you can
-expect to see something like "..F...EF..", where "." means the test case
-passed, "E" means there was an error while executing the test case and "F"
-means some assertion failed and returned output was not as expected.
+expect to see something like "`..F...EF..`", where "`.`" means the test case
+passed, "`E`" means there was an error while executing the test case and "`F`"
+means some assertion failed and some returned output was not as expected.
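
As a quick illustration, here is a minimal, hypothetical helper (not part of this commit) that tallies such a result string using the legend above:

```python
# Hypothetical helper: tally a result string such as "..F...EF.."
# into pass/error/fail counts.
def tally_test_results(results: str) -> dict[str, int]:
    legend = {'.': 'passed', 'E': 'error', 'F': 'failed'}
    counts = {'passed': 0, 'error': 0, 'failed': 0}
    for ch in results:
        if ch in legend:
            counts[legend[ch]] += 1
    return counts


print(tally_test_results('..F...EF..'))  # {'passed': 7, 'error': 1, 'failed': 2}
```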

## Visualization

If the required Python libraries (`matplotlib` and `seaborn`) are installed,
the `summarize_results.py` script will also write two histogram images to the
output folder, i.e. the folder containing the `output.jsonl` file.
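
This commit adds both libraries to the `evaluation` dependency group in
`pyproject.toml` (see the diff below), so one way to pull them in, assuming
Poetry 1.2 or newer, is:

```bash
poetry install --with evaluation
```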

### Cost Histogram

The cost histogram shows how many instances passed and how many failed in each cost bucket (ten equal-width bins).

![Cost Histogram](./examples/cost_histogram.png)

### Actions Histogram

The actions histogram shows how many instances passed and how many failed, bucketed by the number of actions the agent took (bins of two actions, from 0 to 30).

![Actions Histogram](./examples/actions_histogram.png)
evaluation/aider_bench/examples/actions_histogram.png (new example image; binary file, not rendered)
evaluation/aider_bench/examples/cost_histogram.png (new example image; binary file, not rendered)
110 changes: 104 additions & 6 deletions evaluation/aider_bench/scripts/summarize_results.py
@@ -1,6 +1,36 @@
import json
import os
import sys

import numpy as np
import pandas as pd

# Try to import visualization libraries
visualization_available = False
try:
    import matplotlib.pyplot as plt
    import seaborn as sns

    visualization_available = True
except ImportError:
    print(
        '\n*** WARNING: libraries matplotlib and/or seaborn are not installed.\n*** Visualization will not be available!\n'
    )


def show_usage():
    print(
        'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file> <model_name>'
    )
    print(
        'Example:\npoetry run python summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620\n'
    )


def print_error(message: str):
    print(f'\n***\n*** ERROR: {message}\n***\n')
    show_usage()


def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    passed = []
@@ -19,19 +49,87 @@ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    return passed, failed


def visualize_results(json_file_path: str, model: str, output_dir: str):
    # based on a Colab notebook by RajMaheshwari
    with open(json_file_path, 'r') as f:
        data = [json.loads(line) for line in f]

    df = pd.DataFrame.from_records(data)

    # Flatten the nested fields we need: accumulated cost, pass/fail
    # (from the test run's exit code) and the number of agent actions.
    df1 = pd.DataFrame()
    df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
    df1['result'] = (
        df['test_result'].apply(pd.Series)['exit_code'].map({0: 'Pass', 1: 'Fail'})
    )
    df1['actions'] = pd.Series([len(a) - 1 for a in df['history']])

    passed = np.sum(df1['result'] == 'Pass')
    total = df.shape[0]
    resolve_rate = round((passed / total) * 100, 2)

    print('Number of passed tests:', f'{passed}/{total}')

    if not visualization_available:
        return resolve_rate

    # Cost histogram: passed/failed instance counts stacked per cost bucket
    plt.figure(figsize=(10, 6))
    bins = 10
    mx = pd.Series.max(df1['cost'])
    g = sns.histplot(df1, x='cost', bins=bins, hue='result', multiple='stack')
    x_ticks = np.around(np.linspace(0, mx, bins + 1), 3)
    g.set_xticks(x_ticks)
    g.set_xlabel('Cost in $')
    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'cost_histogram.png'))
    plt.close()

    # Actions histogram: same stacking, bucketed by number of actions
    plt.figure(figsize=(10, 6))
    bins = np.arange(0, 31, 2)
    g = sns.histplot(df1, x='actions', bins=bins, hue='result', multiple='stack')
    g.set_xticks(bins)
    g.set_xlabel('# of actions')
    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'actions_histogram.png'))
    plt.close()

    return resolve_rate


if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print(
-            'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file>'
-        )
+    if len(sys.argv) != 3:
+        print_error('Argument(s) missing!')
        sys.exit(1)

    json_file_path = sys.argv[1]
+    model_name = sys.argv[2]
+
+    if not os.path.exists(json_file_path):
+        print_error('Output file does not exist!')
+        sys.exit(1)
+    if not os.path.isfile(json_file_path):
+        print_error('Path-to-output-file is not a file!')
+        sys.exit(1)
+
+    output_dir = os.path.dirname(json_file_path)
+    if not os.access(output_dir, os.W_OK):
+        print_error('Output folder is not writable!')
+        sys.exit(1)
+
    passed_tests, failed_tests = extract_test_results(json_file_path)
-    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+    resolve_rate = visualize_results(json_file_path, model_name, output_dir)

    print(
-        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
    )
    print('PASSED TESTS:')
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
+    print(
+        '\nVisualization results were saved as cost_histogram.png and actions_histogram.png'
+    )
+    print('in folder: ', output_dir)
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -84,7 +84,6 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]


[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
@@ -115,7 +114,6 @@ ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"


[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
@@ -125,3 +123,5 @@ swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" }
func_timeout = "*"
sympy = "*"
gdown = "*"
+matplotlib = "*"
+seaborn = "*"
