
Commit

Merge 6d0fa80 into 298df24
esantorella committed Jun 23, 2023
2 parents 298df24 + 6d0fa80 commit ca51f0f
Showing 3 changed files with 36 additions and 13 deletions.
12 changes: 9 additions & 3 deletions ax/benchmark/benchmark.py
@@ -127,7 +127,7 @@ def benchmark_replication(
     )


-def benchmark_test(
+def benchmark_one_method_problem(
     problem: BenchmarkProblem,
     method: BenchmarkMethod,
     seeds: Iterable[int],
@@ -140,12 +140,18 @@ def benchmark_test(
     )


-def benchmark_full_run(
+def benchmark_multiple_problems_methods(
     problems: Iterable[BenchmarkProblem],
     methods: Iterable[BenchmarkMethod],
     seeds: Iterable[int],
 ) -> List[AggregatedBenchmarkResult]:
+    """
+    For each `problem` and `method` in the Cartesian product of `problems` and
+    `methods`, run the replication on each seed in `seeds` and get the results
+    as an `AggregatedBenchmarkResult`, then return a list of each
+    `AggregatedBenchmarkResult`.
+    """
     return [
-        benchmark_test(problem=p, method=m, seeds=seeds)
+        benchmark_one_method_problem(problem=p, method=m, seeds=seeds)
         for p, m in product(problems, methods)
     ]
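
As the new docstring notes, the renamed benchmark_multiple_problems_methods simply calls benchmark_one_method_problem once per (problem, method) pair. Below is a minimal standalone sketch of that Cartesian-product loop, using placeholder strings rather than real BenchmarkProblem / BenchmarkMethod objects; the names and values are illustrative only.

from itertools import product

# Placeholders; in Ax these would be BenchmarkProblem / BenchmarkMethod instances.
problems = ["branin", "hartmann6"]
methods = ["sobol", "sobol+gpei"]
seeds = (0, 1)

# benchmark_multiple_problems_methods returns one AggregatedBenchmarkResult per pair,
# so two problems x two methods would yield four aggregated results.
for problem, method in product(problems, methods):
    print(f"would run benchmark_one_method_problem({problem!r}, {method!r}, seeds={seeds})")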
14 changes: 11 additions & 3 deletions ax/benchmark/benchmark_result.py
@@ -9,7 +9,7 @@
 import numpy as np
 from ax.core.experiment import Experiment
 from ax.utils.common.base import Base
-from numpy import nanmean, ndarray
+from numpy import nanmean, nanquantile, ndarray
 from pandas import DataFrame
 from scipy.stats import sem

@@ -18,6 +18,8 @@
 # `BenchmarkResult` as return type annotation, used for serialization and rendering
 # in the UI.

+PERCENTILES = [0.25, 0.5, 0.75]
+

 @dataclass(frozen=True, eq=False)
 class BenchmarkResult(Base):
@@ -78,7 +80,7 @@ def from_benchmark_results(
         trace_stats = {}
         for name in ("optimization_trace", "score_trace"):
             step_data = zip(*(getattr(res, name) for res in results))
-            stats = _get_stats(step_data=step_data)
+            stats = _get_stats(step_data=step_data, percentiles=PERCENTILES)
             trace_stats[name] = stats

         # Return aggregated results
@@ -91,9 +93,15 @@
         )


-def _get_stats(step_data: Iterable[np.ndarray]) -> Dict[str, List[float]]:
+def _get_stats(
+    step_data: Iterable[np.ndarray],
+    percentiles: List[float],
+) -> Dict[str, List[float]]:
+    quantiles = []
     stats = {"mean": [], "sem": []}
     for step_vals in step_data:
         stats["mean"].append(nanmean(step_vals))
         stats["sem"].append(sem(step_vals, ddof=1, nan_policy="propagate"))
+        quantiles.append(nanquantile(step_vals, q=percentiles))
+    stats.update({f"P{100 * p:.0f}": q for p, q in zip(percentiles, zip(*quantiles))})
     return stats
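
For reference, the percentile aggregation added to _get_stats can be exercised on its own: for each optimization step it records the mean, SEM, and the 25th/50th/75th percentiles across replications, then pivots the per-step quantiles into "P25"/"P50"/"P75" entries. A self-contained sketch with made-up data (two steps, three replications each):

import numpy as np
from numpy import nanmean, nanquantile
from scipy.stats import sem

percentiles = [0.25, 0.5, 0.75]
# One array per optimization step; each entry is one replication's value at that step.
step_data = [np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0])]

quantiles = []
stats = {"mean": [], "sem": []}
for step_vals in step_data:
    stats["mean"].append(nanmean(step_vals))
    stats["sem"].append(sem(step_vals, ddof=1, nan_policy="propagate"))
    quantiles.append(nanquantile(step_vals, q=percentiles))
# zip(*quantiles) regroups the per-step triples by percentile, one sequence per key.
stats.update({f"P{100 * p:.0f}": list(q) for p, q in zip(percentiles, zip(*quantiles))})
print(stats)  # P25 per step: 1.5, 3.0; P50: 2.0, 4.0; P75: 2.5, 5.0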
23 changes: 16 additions & 7 deletions ax/benchmark/tests/test_benchmark.py
@@ -5,9 +5,9 @@

 import numpy as np
 from ax.benchmark.benchmark import (
-    benchmark_full_run,
+    benchmark_multiple_problems_methods,
+    benchmark_one_method_problem,
     benchmark_replication,
-    benchmark_test,
 )
 from ax.benchmark.benchmark_method import BenchmarkMethod
 from ax.benchmark.benchmark_problem import SingleObjectiveBenchmarkProblem
@@ -58,9 +58,9 @@ def test_replication_moo(self) -> None:

         self.assertTrue(np.all(res.score_trace <= 100))

-    def test_test(self) -> None:
+    def test_benchmark_one_method_problem(self) -> None:
         problem = get_single_objective_benchmark_problem()
-        agg = benchmark_test(
+        agg = benchmark_one_method_problem(
             problem=problem,
             method=get_sobol_benchmark_method(),
             seeds=(0, 1),
@@ -75,16 +75,23 @@ def test_test(self) -> None:
             "All experiments must have 4 trials",
         )

+        for col in ["mean", "P25", "P50", "P75"]:
+            self.assertTrue((agg.score_trace[col] <= 100).all())
+
     @fast_botorch_optimize
-    def test_full_run(self) -> None:
-        aggs = benchmark_full_run(
+    def test_benchmark_multiple_problems_methods(self) -> None:
+        aggs = benchmark_multiple_problems_methods(
             problems=[get_single_objective_benchmark_problem()],
             methods=[get_sobol_benchmark_method(), get_sobol_gpei_benchmark_method()],
             seeds=(0, 1),
         )

         self.assertEqual(len(aggs), 2)

+        for agg in aggs:
+            for col in ["mean", "P25", "P50", "P75"]:
+                self.assertTrue((agg.score_trace[col] <= 100).all())
+
     def test_timeout(self) -> None:
         problem = SingleObjectiveBenchmarkProblem.from_botorch_synthetic(
             test_problem_class=Branin,
@@ -116,7 +123,9 @@ def test_timeout(self) -> None:
         )

         # Each replication will have a different number of trials
-        result = benchmark_test(problem=problem, method=method, seeds=(0, 1, 2, 3))
+        result = benchmark_one_method_problem(
+            problem=problem, method=method, seeds=(0, 1, 2, 3)
+        )

         # Test the traces get composited correctly. The AggregatedResult's traces
         # should be the length of the shortest trace in the BenchmarkResults
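
With score_trace now carrying "P25"/"P50"/"P75" columns alongside "mean" and "sem", downstream code could, for example, plot the median score trace with an interquartile band. A hypothetical snippet, using a hand-built DataFrame in place of a real AggregatedBenchmarkResult.score_trace:

import matplotlib.pyplot as plt
import pandas as pd

# Hand-built stand-in for AggregatedBenchmarkResult.score_trace; values are invented.
score_trace = pd.DataFrame(
    {
        "mean": [20.0, 55.0, 80.0],
        "P25": [10.0, 45.0, 70.0],
        "P50": [18.0, 56.0, 82.0],
        "P75": [30.0, 65.0, 90.0],
    }
)
trials = range(len(score_trace))
plt.plot(trials, score_trace["P50"], label="median score")
plt.fill_between(trials, score_trace["P25"], score_trace["P75"], alpha=0.3, label="interquartile range")
plt.xlabel("trial")
plt.ylabel("score")
plt.legend()
plt.show()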
