Source code for ax.utils.stats.statstools

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from __future__ import annotations

from logging import Logger
from typing import Union

import numpy as np
import numpy.typing as npt
import pandas as pd
from ax.core.data import Data
from ax.utils.common.logger import get_logger

# Module-level logger obtained from Ax's logging helper for this module's name.
logger: Logger = get_logger(__name__)
# Type alias for numeric vector inputs that may be given either as a NumPy
# array or as a plain list of floats.
# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters.
num_mixed = Union[np.ndarray, list[float]]


def inverse_variance_weight(
    means: npt.NDArray,
    variances: npt.NDArray,
    conflicting_noiseless: str = "warn",
) -> tuple[float, float]:
    """Perform inverse variance weighting.

    The pooled mean is ``sum_i(mean_i / var_i) / sum_i(1 / var_i)`` and the
    pooled variance is ``1 / sum_i(1 / var_i)``. If any observation has zero
    variance, the plain average of the zero-variance means is returned
    instead, together with a variance of zero.

    Args:
        means: The means of the observations. Any one-dimensional array-like
            (NumPy array, list, pandas Series) is accepted.
        variances: The variances of the observations, same length as `means`.
        conflicting_noiseless: How to handle the case of multiple observations
            with zero variance but different means. Options are "warn"
            (default), "ignore" or "raise".

    Returns:
        A tuple of the pooled mean and the pooled variance.

    Raises:
        ValueError: If `conflicting_noiseless` is not a supported option, if
            `means` and `variances` differ in length, or if noiseless
            observations conflict and `conflicting_noiseless` is "raise".
    """
    if conflicting_noiseless not in {"warn", "ignore", "raise"}:
        raise ValueError(
            f"Unsupported option `{conflicting_noiseless}` for conflicting_noiseless."
        )
    if len(means) != len(variances):
        raise ValueError("Means and variances must be of the same length.")
    # Coerce to arrays so that plain lists work too; comparing a list to 0
    # yields a single bool, which has no `.any()`.
    means = np.asarray(means, dtype=float)
    variances = np.asarray(variances, dtype=float)
    # new_mean = \sum_i 1/var_i mean_i / \sum_i (1/var_i), unless any var = 0,
    # in which case we report the mean of all values with var = 0.
    idx_zero = variances == 0
    if idx_zero.any():
        means_z = means[idx_zero]
        if np.var(means_z) > 0:
            message = "Multiple observations zero variance but different means."
            if conflicting_noiseless == "warn":
                logger.warning(message)
            elif conflicting_noiseless == "raise":
                raise ValueError(message)
        return np.mean(means_z), 0.0
    inv_vars = np.divide(1.0, variances)
    sum_inv_vars = inv_vars.sum()
    new_mean = np.inner(inv_vars, means) / sum_inv_vars
    new_var = np.divide(1.0, sum_inv_vars)
    return new_mean, new_var
def total_variance(
    means: npt.NDArray,
    variances: npt.NDArray,
    sample_sizes: npt.NDArray,
) -> float:
    """Compute total variance.

    The result combines two sample-size-weighted averages and divides their
    sum by the total sample size:

    * the squared deviations of `means` from their (unweighted) average, and
    * `variances` after each entry has been multiplied by its sample size.

    Args:
        means: Per-group means.
        variances: Per-group variances; each is multiplied by the matching
            entry of `sample_sizes` before averaging.
        sample_sizes: Per-group sample sizes, used as averaging weights.

    Returns:
        The combined variance.
    """
    total_n = sample_sizes.sum()
    scaled_variances = variances * sample_sizes
    squared_deviations = (means - means.mean()) ** 2
    between_component = np.average(squared_deviations, weights=sample_sizes)
    within_component = np.average(scaled_variances, weights=sample_sizes)
    return (between_component + within_component) / total_n
def positive_part_james_stein(
    means: num_mixed,
    sems: num_mixed,
) -> tuple[npt.NDArray, npt.NDArray]:
    """Estimation method for Positive-part James-Stein estimator.

    This method takes a vector of K means (`y_i`) and standard errors
    (`sigma_i`) and calculates the positive-part James Stein estimator.

    Resulting estimates are the shrunk means and standard errors. The positive
    part James-Stein estimator shrinks each constituent average to the grand
    average:

        y_i - phi_i * y_i + phi_i * ybar

    The variable phi_i determines the amount of shrinkage. For `phi_i = 1`,
    `mu_hat` is equal to `ybar` (the mean of all `y_i`), while for `phi_i = 0`,
    `mu_hat` is equal to `y_i`. It can be shown that restricting `phi_i <= 1`
    dominates the unrestricted estimator, so this method restricts `phi_i` in
    this manner. The amount of shrinkage, `phi_i`, is determined by:

        (K - 3) * sigma2_i / sum_j (y_j - ybar)^2

    which the code computes as `sigma2_i / s2`, where `s2` is the variance of
    the `y_i` normalized by `K - 3`. That is, less shrinkage is applied when
    individual means are estimated with greater precision, and more shrinkage
    is applied when individual means are very tightly clustered together. We
    also restrict `phi_i` to never be larger than 1; when all means are
    identical (`s2 == 0`), `phi_i` is set to 1.

    The variance of the mean estimator is:

        (1 - phi_i) * sigma2_i
        + phi_i * sigma2_i / K
        + 2 * phi_i ** 2 * (y_i - ybar)^2 / (K - 3)

    The first term is the variance component from `y_i`, the second term is the
    contribution from the mean of all `y_i`, and the third term is the
    contribution from the uncertainty in the sum of squared deviations of `y_i`
    from the mean of all `y_i`.

    For more information, see
    https://ax.dev/docs/models.html#empirical-bayes-and-thompson-sampling.

    Args:
        means: Means of each arm
        sems: Standard errors of each arm

    Returns:
        mu_hat_i: Empirical Bayes estimate of each arm's mean
        sem_i: Empirical Bayes estimate of each arm's sem

    Raises:
        ValueError: If fewer than 4 means are passed, or if any entry of
            `sems` is negative.
    """
    y_i = np.array(means)
    K = y_i.shape[0]
    # Check the sample count first so that empty input gets this clear error
    # rather than NumPy's zero-size reduction error from `np.min` below.
    if K < 4:
        raise ValueError(
            "Less than 4 measurements passed to positive_part_james_stein. "
            + "At least 4 are required to compute the estimator."
        )
    if np.min(sems) < 0:
        raise ValueError("sems cannot be negative.")
    sigma2_i = np.power(sems, 2)
    ybar = np.mean(y_i)
    s2 = np.var(y_i - ybar, ddof=3)  # sample variance normalized by K-3
    if s2 == 0:
        # All means coincide: shrink fully to the grand mean.
        phi_i = 1
    else:
        phi_i = np.minimum(1, sigma2_i / s2)
    # pyre-fixme[6]: For 1st argument expected `int` but got `floating[typing.Any]`.
    # pyre-fixme[6]: For 1st argument expected `bool` but got `ndarray[typing.Any,
    #  dtype[typing.Any]]`.
    mu_hat_i = y_i + phi_i * (ybar - y_i)
    sigma_hat_i = np.sqrt(
        # pyre-fixme[58]: `-` is not supported for operand types `int` and
        #  `Union[np.ndarray[typing.Any, np.dtype[typing.Any]], int]`.
        (1 - phi_i) * sigma2_i
        + phi_i * sigma2_i / K
        # pyre-fixme[58]: `*` is not supported for operand types `int` and
        #  `Union[np.ndarray[typing.Any, np.dtype[typing.Any]], int]`.
        + 2 * phi_i**2 * (y_i - ybar) ** 2 / (K - 3)
    )
    return mu_hat_i, sigma_hat_i
def relativize(
    means_t: npt.NDArray | list[float] | float,
    sems_t: npt.NDArray | list[float] | float,
    mean_c: float,
    sem_c: float,
    bias_correction: bool = True,
    cov_means: npt.NDArray | list[float] | float = 0.0,
    as_percent: bool = False,
    control_as_constant: bool = False,
) -> tuple[npt.NDArray, npt.NDArray]:
    """Ratio estimator based on the delta method.

    Estimates the mean and standard error of the relative change of test
    versus control, `m_t / m_c - 1`, via a Taylor series approximation (the
    delta method), assuming the test and control means have a known
    covariance.

    Under a second-order Taylor expansion the relative change in empirical
    means is approximately normal with mean

        [(mu_t - mu_c) / mu_c] - [(sigma_c)^2 * mu_t] / (mu_c)^3

    and variance

        (sigma_t / mu_c)^2
        - 2 * mu_t * sigma_tc / mu_c^3
        + [(sigma_c * mu_t)^2 / (mu_c)^4]

    with higher-order terms assumed to be close to zero. Plugging in the
    empirical means and standard errors gives the estimators

        [(m_t - m_c) / m_c] - [(s_c)^2 * m_t] / (m_c)^3

    and

        (s_t / m_c)^2 - 2 * m_t * s_tc / m_c^3 + [(s_c * m_t)^2 / (m_c)^4]

    Note that the delta method does NOT take the empirical standard deviation
    of a metric as input, but the standard error of the mean of that metric,
    i.e. the standard deviation divided by the square root of the number of
    observations.

    Args:
        means_t: Sample means (test)
        sems_t: Sample standard errors of the means (test)
        mean_c: Sample mean (control)
        sem_c: Sample standard error of the mean (control)
        bias_correction: Whether to apply bias correction when computing
            relativized metric values. Uses a second-order Taylor expansion
            for approximating the means and standard errors of the ratios.
        cov_means: Sample covariance between test and control
        as_percent: If true, return results in percent (* 100)
        control_as_constant: If true, control is treated as a constant.
            bias_correction, sem_c, and cov_means are ignored when this is
            true.

    Returns:
        rel_hat: Inferred means of the sampling distribution of
            the relative change `(mean_t - mean_c) / abs(mean_c)`
        sem_hat: Inferred standard deviation of the sampling
            distribution of rel_hat -- i.e. the standard error.

    Raises:
        ValueError: If the absolute control mean is below 1e-10.
    """
    # A control mean this close to zero makes the ratio meaningless; bail.
    epsilon = 1e-10
    if np.any(np.abs(mean_c) < epsilon):
        raise ValueError(
            "mean_control ({} +/- {}) is smaller than 1 in 10 billion, "
            "which is too small to reliably analyze ratios using the delta "
            "method. This usually occurs because winsorization has truncated "
            "all values down to zero. Try using a delta type that applies "
            "no winsorization.".format(mean_c, sem_c)
        )

    test_means = np.array(means_t)
    test_sems = np.array(sems_t)
    covariances = np.array(cov_means)
    control_magnitude = np.abs(mean_c)

    rel_change = (test_means - mean_c) / control_magnitude

    if control_as_constant:
        # Control carries no uncertainty: only the test SEM is rescaled.
        rel_var = (test_sems / control_magnitude) ** 2
    else:
        ratio = test_means / mean_c
        if bias_correction:
            rel_change = rel_change - test_means * sem_c**2 / control_magnitude**3
        # Arms whose mean and SEM both equal the control's get exactly zero
        # relative change.
        identical_to_control = (test_means == mean_c) & (test_sems == sem_c)
        rel_change = ~identical_to_control * rel_change
        rel_var = (
            (test_sems**2) - 2 * ratio * covariances + (ratio**2) * (sem_c**2)
        ) / (mean_c**2)

    rel_sem = np.sqrt(rel_var)
    if as_percent:
        return (rel_change * 100, rel_sem * 100)
    return (rel_change, rel_sem)
def unrelativize(
    means_t: npt.NDArray | list[float] | float,
    sems_t: npt.NDArray | list[float] | float,
    mean_c: float,
    sem_c: float,
    bias_correction: bool = True,
    cov_means: npt.NDArray | list[float] | float = 0.0,
    as_percent: bool = False,
    control_as_constant: bool = False,
) -> tuple[npt.NDArray, npt.NDArray]:
    """
    Reverse operation of ax.utils.stats.statstools.relativize.

    Args:
        means_t: Relativized sample means (test) to be unrelativized
        sems_t: Relativized sample SEM of the means (test) to be unrelativized
        mean_c: Unrelativized control mean
        sem_c: Unrelativized control SEM of the mean
        bias_correction: if `means_t` and `sems_t` are obtained with
            `bias_correction=True` in ax.utils.stats.statstools.relativize
        cov_means: Sample covariance between the **unrelativized** test and
            control
        as_percent: If true, assuming `means_t` and `sems_t` are percentages
            (i.e., 1 means 1%).
        control_as_constant: If true, control is treated as a constant.
            bias_correction, sem_c, and cov_means are ignored when this is
            true.

    Returns:
        m_t: Inferred sample (test) means in the unrelativized scale
        s_t: Inferred SEM of sample (test) means in the unrelativized scale

        Both are always NumPy arrays (zero-dimensional for scalar input),
        regardless of `as_percent`.
    """
    means_t = np.array(means_t, dtype=float)
    sems_t = np.array(sems_t, dtype=float)
    cov_t = np.asarray(cov_means, dtype=float)

    if as_percent:
        means_t = means_t / 100
        sems_t = sems_t / 100

    abs_mean_c = np.abs(mean_c)
    m_t = means_t * abs_mean_c + mean_c

    if control_as_constant:
        s_t = sems_t * abs_mean_c
    else:
        if bias_correction:
            # Undo the second-order bias term subtracted by `relativize`.
            m_t = m_t / (1 - (sem_c / abs_mean_c) ** 2)

        var = sems_t**2
        c = m_t / mean_c
        s_t2 = var * (mean_c**2) + 2 * c * cov_t - (c**2) * (sem_c**2)
        # This is only positive when sems_t > sem_c * mean_c * (means_t + 1)
        # If above condition cannot be guaranteed, use control_as_constant = True
        s_t = np.sqrt(np.clip(s_t2, 0.0, None))

    # If a relativized mean is exactly 0.0, return the control mean and sem
    # directly. `np.where` handles scalar and array input uniformly and always
    # yields ndarrays; the previous scalar special case returned plain floats
    # only when `as_percent` was set.
    is_zero = means_t == 0.0
    m_t = np.where(is_zero, mean_c, m_t)
    s_t = np.where(is_zero, sem_c, s_t)
    return m_t, s_t
def agresti_coull_sem(
    n_numer: pd.Series | npt.NDArray | int,
    n_denom: pd.Series | npt.NDArray | int,
    prior_successes: int = 2,
    prior_failures: int = 2,
) -> npt.NDArray | float:
    """Compute the Agresti-Coull style standard error for a binomial proportion.

    The proportion is estimated after adding `prior_successes` to the
    numerator and `prior_successes + prior_failures` to the denominator; the
    standard error is then `sqrt(p * (1 - p) / n_denom)` using the original,
    unadjusted denominator.

    Reference:
    *Agresti, Alan, and Brent A. Coull. "Approximate Is Better than 'Exact' for
    Interval Estimation of Binomial Proportions." The American Statistician,
    vol. 52, no. 2, 1998, pp. 119-126. JSTOR, www.jstor.org/stable/2685469.*

    Args:
        n_numer: Observed success counts.
        n_denom: Observed trial counts.
        prior_successes: Pseudo-successes added before estimating the
            proportion.
        prior_failures: Pseudo-failures added before estimating the
            proportion.

    Returns:
        The standard error(s) of the adjusted proportion.
    """
    successes = np.array(n_numer)
    trials = np.array(n_denom)
    adjusted_successes = successes + prior_successes
    adjusted_trials = trials + prior_successes + prior_failures
    adjusted_p = adjusted_successes / adjusted_trials
    return np.sqrt(adjusted_p * (1 - adjusted_p) / trials)
def marginal_effects(df: pd.DataFrame) -> pd.DataFrame:
    """
    This method calculates the relative (in %) change in the outcome achieved
    by using any individual factor level versus randomizing across all factor
    levels. It does this by estimating a baseline under the experiment by
    marginalizing over all factors/levels. For each factor level, then,
    it conditions on that level for the individual factor and then
    marginalizes over all levels for all other factors.

    Factors with at most one distinct level are skipped, since there is
    nothing to compare them against.

    Args:
        df: Dataframe containing columns named mean and sem. All other columns
            are assumed to be factors for which to calculate marginal effects.

    Returns:
        A dataframe containing columns "Name", "Level", "Beta" and "SE"
            corresponding to the factor, level, effect and standard error.
            Results are relativized as percentage changes. The frame is empty
            (but has these columns) when no factor has more than one level.
    """
    output_columns = ["Name", "Level", "Beta", "SE"]
    covariates = [col for col in df.columns if col not in ["mean", "sem"]]
    formatted_vals = []
    # `inverse_variance_weight` returns a (mean, variance) pair, so the
    # baseline variance must be square-rooted before it is used as an SEM.
    overall_mean, overall_var = inverse_variance_weight(
        df["mean"].to_numpy(),
        np.power(df["sem"].to_numpy(), 2),
    )
    overall_sem = np.sqrt(overall_var)
    for cov in covariates:
        if len(df[cov].unique()) <= 1:
            # The original code used the bare builtin name `next` here, which
            # is a no-op; `continue` is what actually skips the factor.
            continue
        for name, group_df in df.groupby(cov):
            group_mean, group_var = inverse_variance_weight(
                group_df["mean"].to_numpy(),
                np.power(group_df["sem"].to_numpy(), 2),
            )
            effect, effect_sem = relativize(
                group_mean,
                np.sqrt(group_var),
                overall_mean,
                overall_sem,
                cov_means=0.0,
                as_percent=True,
            )
            formatted_vals.append(
                {"Name": cov, "Level": name, "Beta": effect, "SE": effect_sem}
            )
    # Passing `columns` keeps the expected schema even when no rows were
    # produced, where selecting the columns afterwards would raise KeyError.
    return pd.DataFrame(formatted_vals, columns=output_columns)
def relativize_data(
    data: Data,
    status_quo_name: str = "status_quo",
    as_percent: bool = False,
    include_sq: bool = False,
    bias_correction: bool = True,
    control_as_constant: bool = False,
) -> Data:
    """Relativize a data object w.r.t. a status_quo arm.

    Rows are grouped by whichever of the columns "trial_index", "metric_name"
    and "random_split" are present, and each group is relativized against the
    status quo row(s) found within that group.

    Args:
        data: The data object to be relativized.
        status_quo_name: The name of the status_quo arm.
        as_percent: If True, return results as percentage change.
        include_sq: Include status quo in final df.
        bias_correction: Whether to apply bias correction when computing
            relativized metric values. Uses a second-order Taylor expansion
            for approximating the means and standard errors or the ratios, see
            ax.utils.stats.statstools.relativize for more details.
        control_as_constant: If true, control is treated as a constant.
            bias_correction is ignored when this is true.

    Returns:
        The new data object with the relativized metrics (excluding the
            status_quo arm unless `include_sq` is set, in which case its sem
            is reported as 0.0)

    Raises:
        ValueError: If a group does not contain exactly one distinct
            (mean, sem) pair for the status quo arm.
    """
    df = data.df.copy()
    # A fixed candidate order keeps the grouping, and therefore the order of
    # the output rows, deterministic; iterating a set of strings is not.
    grp_cols = [
        col
        for col in ("trial_index", "metric_name", "random_split")
        if col in df.columns
    ]

    dfs = []
    for grp, subgroup_df in df.groupby(grp_cols):
        is_sq = subgroup_df["arm_name"] == status_quo_name

        sq_stats = subgroup_df.loc[is_sq, ["mean", "sem"]].drop_duplicates()
        if len(sq_stats) != 1:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(
                "Expected exactly one distinct (mean, sem) pair for status quo "
                f"arm `{status_quo_name}` in group {grp}, found {len(sq_stats)}."
            )
        sq_mean, sq_sem = sq_stats.to_numpy().flatten()

        # rm status quo from final df to relativize
        if not include_sq:
            subgroup_df = subgroup_df[~is_sq]
        means_rel, sems_rel = relativize(
            means_t=subgroup_df["mean"].values,
            sems_t=subgroup_df["sem"].values,
            mean_c=sq_mean,
            sem_c=sq_sem,
            as_percent=as_percent,
            bias_correction=bias_correction,
            control_as_constant=control_as_constant,
        )
        dfs.append(
            pd.concat(
                [
                    subgroup_df.drop(["mean", "sem"], axis=1),
                    pd.DataFrame(
                        np.array([means_rel, sems_rel]).T,
                        columns=["mean", "sem"],
                        index=subgroup_df.index,
                    ),
                ],
                axis=1,
            )
        )
    df_rel = pd.concat(dfs, axis=0)
    if include_sq:
        # The status quo relative to itself has no uncertainty.
        df_rel.loc[df_rel["arm_name"] == status_quo_name, "sem"] = 0.0
    return Data(df_rel)