# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# pyre-strict
# NOTE: Do not add `from __future__ import annotations` to this file. Adding
# `annotations` postpones evaluation of types and will break FBLearner's usage of
# `BenchmarkResult` as return type annotation, used for serialization and rendering
# in the UI.
from collections.abc import Iterable
from dataclasses import dataclass
import numpy.typing as npt
from ax.core.experiment import Experiment
from ax.utils.common.base import Base
from numpy import nanmean, nanquantile
from pandas import DataFrame
from scipy.stats import sem
# Quartiles reported as the "P25"/"P50"/"P75" columns of aggregated traces.
PERCENTILES = [0.25, 0.5, 0.75]
@dataclass(eq=False)
class BenchmarkResult(Base):
"""The result of a single optimization loop from one
(BenchmarkProblem, BenchmarkMethod) pair.
Args:
name: Name of the benchmark. Should make it possible to determine the
problem and the method.
seed: Seed used for determinism.
oracle_trace: For single-objective problems, element i of the
optimization trace is the best oracle value of the arms evaluated
after the first i trials. For multi-objective problems, element i
of the optimization trace is the hypervolume of the oracle values of
the arms in the first i trials (which may be ``BatchTrial``s).
Oracle values are typically ground-truth (rather than noisy) and
evaluated at the target task and fidelity.
        inference_trace: The inference trace comes from choosing a "best" point
            based only on data that would be observable in realistic settings
            and then evaluating the oracle value of that point. For
            multi-objective problems, we find a Pareto set and evaluate its
            hypervolume.
            There are several ways of specifying the "best" point: one could
            pick the point with the best observed value or the point with the
            best model prediction, and the candidates considered could be the
            whole search space, the set of trials completed so far, etc. How
            the inference trace is computed is specified by a best-point
            selector, which is an attribute of the ``BenchmarkMethod``.
Note: This is not "inference regret", which is a lower-is-better value
that is relative to the best possible value. The inference value
trace is higher-is-better if the problem is a maximization problem
or if the problem is multi-objective (in which case hypervolume is
used). Hence, it is signed the same as ``oracle_trace`` and
``optimization_trace``. ``score_trace`` is higher-is-better and
relative to the optimum.
optimization_trace: Either the ``oracle_trace`` or the
``inference_trace``, depending on whether the ``BenchmarkProblem``
specifies ``report_inference_value``. Having ``optimization_trace``
specified separately is useful when we need just one value to
evaluate how well the benchmark went.
        score_trace: The scores associated with the problem, typically either
            the ``optimization_trace`` or the ``inference_trace`` normalized
            to a 0-100 scale for comparability between problems.
fit_time: Total time spent fitting models.
gen_time: Total time spent generating candidates.
experiment: If not ``None``, the Ax experiment associated with the
optimization that generated this data. Either ``experiment`` or
``experiment_storage_id`` must be provided.
experiment_storage_id: Pointer to location where experiment data can be read.
"""
name: str
seed: int
oracle_trace: npt.NDArray
inference_trace: npt.NDArray
optimization_trace: npt.NDArray
score_trace: npt.NDArray
fit_time: float
gen_time: float
experiment: Experiment | None = None
experiment_storage_id: str | None = None
def __post_init__(self) -> None:
if self.experiment is not None and self.experiment_storage_id is not None:
raise ValueError(
"Cannot specify both an `experiment` and an "
"`experiment_storage_id` for the experiment."
)
if self.experiment is None and self.experiment_storage_id is None:
raise ValueError(
"Must provide an `experiment` or `experiment_storage_id` "
"to construct a BenchmarkResult."
)
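
# Illustrative sketch only (not part of the Ax API): constructing a
# ``BenchmarkResult`` from synthetic traces. All values below are made up,
# and the name/storage-id conventions are hypothetical. Note that exactly one
# of `experiment` / `experiment_storage_id` may be supplied; `__post_init__`
# raises a ValueError otherwise.
def _example_benchmark_result() -> BenchmarkResult:
    import numpy as np

    # Hypothetical best-so-far oracle values for a four-trial minimization run.
    trace = np.array([1.0, 0.7, 0.5, 0.5])
    return BenchmarkResult(
        name="example_problem|example_method",  # hypothetical naming scheme
        seed=0,
        oracle_trace=trace,
        inference_trace=trace,
        optimization_trace=trace,  # here, the problem reports the oracle trace
        score_trace=np.array([0.0, 40.0, 80.0, 80.0]),  # 0-100 scale
        fit_time=1.2,
        gen_time=0.3,
        experiment_storage_id="example-storage-id",  # instead of `experiment`
    )
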
@dataclass(frozen=True, eq=False)
class AggregatedBenchmarkResult(Base):
"""The result of a benchmark test, or series of replications. Scalar data present
in the BenchmarkResult is here represented as (mean, sem) pairs.
"""
name: str
results: list[BenchmarkResult]
# mean, sem, and quartile columns
optimization_trace: DataFrame
score_trace: DataFrame
# (mean, sem) pairs
fit_time: list[float]
gen_time: list[float]
@classmethod
def from_benchmark_results(
cls,
results: list[BenchmarkResult],
) -> "AggregatedBenchmarkResult":
"""Aggregrates a list of BenchmarkResults. For various reasons (timeout, errors,
etc.) each BenchmarkResult may have a different number of trials; aggregated
traces and statistics are computed with and truncated to the minimum trial count
to ensure each replication is included.
"""
# Extract average wall times and standard errors thereof
fit_time, gen_time = (
[nanmean(Ts), float(sem(Ts, ddof=1, nan_policy="propagate"))]
for Ts in zip(*((res.fit_time, res.gen_time) for res in results))
)
# Compute some statistics for each trace
trace_stats = {}
for name in ("optimization_trace", "score_trace"):
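            # ``zip`` stops at the shortest trace, so longer replications are
            # truncated to the minimum trial count, as noted in the docstring.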
step_data = zip(*(getattr(res, name) for res in results))
stats = _get_stats(step_data=step_data, percentiles=PERCENTILES)
trace_stats[name] = stats
# Return aggregated results
return cls(
name=results[0].name,
results=results,
fit_time=fit_time,
gen_time=gen_time,
**{name: DataFrame(stats) for name, stats in trace_stats.items()},
)
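
# Illustrative sketch only: aggregating several replications, reusing the
# hypothetical `_example_benchmark_result` helper above. Identical
# replications are used for brevity; a real benchmark would vary the seed.
def _example_aggregation() -> AggregatedBenchmarkResult:
    results = [_example_benchmark_result() for _ in range(3)]
    agg = AggregatedBenchmarkResult.from_benchmark_results(results=results)
    # `agg.optimization_trace` and `agg.score_trace` are DataFrames with
    # columns ["mean", "sem", "P25", "P50", "P75"] and one row per trial.
    return agg
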
def _get_stats(
step_data: Iterable[npt.NDArray],
percentiles: list[float],
) -> dict[str, list[float]]:
quantiles = []
stats = {"mean": [], "sem": []}
for step_vals in step_data:
stats["mean"].append(nanmean(step_vals))
stats["sem"].append(sem(step_vals, ddof=1, nan_policy="propagate"))
quantiles.append(nanquantile(step_vals, q=percentiles))
stats.update({f"P{100 * p:.0f}": q for p, q in zip(percentiles, zip(*quantiles))})
return stats
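
# Illustrative sketch only: what `_get_stats` computes on toy data. Two
# replications with three steps each yield per-step means, sems, and the
# P25/P50/P75 quantile columns.
def _example_get_stats() -> dict[str, list[float]]:
    import numpy as np

    trace_a = np.array([3.0, 2.0, 1.0])
    trace_b = np.array([4.0, 2.0, 2.0])
    # `zip` pairs up the two traces step by step, mirroring the call in
    # `AggregatedBenchmarkResult.from_benchmark_results`.
    return _get_stats(step_data=zip(trace_a, trace_b), percentiles=PERCENTILES)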