Skip to content

Commit

Permalink
sample size calculation (#40)
Browse files Browse the repository at this point in the history
* sample size calculation

* refactor

* multiple test correction
  • Loading branch information
jancervenka committed Sep 19, 2022
1 parent dd45bc7 commit 5cf6139
Show file tree
Hide file tree
Showing 10 changed files with 435 additions and 4 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = ep-stats
version = 1.3.2
version = 1.4.0
description = Statistical package to evaluate ab tests in experimentation platform.
long_description = file: README.md
long_description_content_type = text/markdown
Expand Down
2 changes: 2 additions & 0 deletions src/epstats/server/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from starlette.exceptions import HTTPException as StarletteHTTPException

from .api_evaluate import get_evaluate_router
from .api_sample_size_calculation import get_sample_size_calculation_router
from .json_response import DataScienceJsonResponse
from .api_settings import ApiSettings

Expand Down Expand Up @@ -47,6 +48,7 @@ async def readiness_liveness_probe(statsd: StatsClient = Depends(get_statsd)):
return {"message": "ep-stats-api is ready"}

api.include_router(get_evaluate_router(get_dao, get_executor_pool, get_statsd))
api.include_router(get_sample_size_calculation_router(get_executor_pool, get_statsd))

return api

Expand Down
54 changes: 54 additions & 0 deletions src/epstats/server/api_sample_size_calculation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import logging
from fastapi import APIRouter, Depends, HTTPException
from statsd import StatsClient
import asyncio
from concurrent.futures import ThreadPoolExecutor

from ..toolkit.statistics import Statistics

from .req import SampleSizeCalculationData
from .res import SampleSizeCalculationResult


_logger = logging.getLogger("epstats")


def get_sample_size_calculation_router(get_executor_pool, get_statsd) -> APIRouter:
def _sample_size_calculation(data: SampleSizeCalculationData, statsd: StatsClient):
try:

if data.std is None:
f = Statistics.required_sample_size_per_variant_bernoulli
else:
f = Statistics.required_sample_size_per_variant

sample_size_per_variant = f(**data.dict())

_logger.info((f"Calculation finished, sample_size_per_variant = {sample_size_per_variant}."))
return SampleSizeCalculationResult(sample_size_per_variant=sample_size_per_variant)
except Exception as e:
_logger.error(f"Cannot calculate the sample size because of: '{e}'")
_logger.exception(e)
statsd.incr("errors.sample_size_calculation")
raise HTTPException(
status_code=500,
detail=f"Cannot calculate the sample size because of: '{e}'",
)

router = APIRouter()

@router.post("/sample-size-calculation", response_model=SampleSizeCalculationResult)
async def sample_size_calculation(
data: SampleSizeCalculationData,
evaluation_pool: ThreadPoolExecutor = Depends(get_executor_pool),
statsd: StatsClient = Depends(get_statsd),
):
"""
Calculates sample size based on `data`.
"""
_logger.info(f"Calling the sample size calculation with {data.json()}")
statsd.incr("requests.sample_size_calculation")
loop = asyncio.get_event_loop()
return await loop.run_in_executor(evaluation_pool, _sample_size_calculation, data, statsd)

return router
47 changes: 47 additions & 0 deletions src/epstats/server/req.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ..toolkit import SrmCheck as EvSrmCheck
from ..toolkit import SumRatioCheck as EvSumRatioCheck
from ..toolkit import Parser
from ..toolkit import DEFAULT_CONFIDENCE_LEVEL, DEFAULT_POWER


class Metric(BaseModel):
Expand Down Expand Up @@ -370,3 +371,49 @@ class Config:
],
}
}


class SampleSizeCalculationData(BaseModel):
"""
Data needed for the sample size calculation.
"""

n_variants: int = Field(title="Number of variants", description="Number of variants in the experiment.")

minimum_effect: float = Field(
title="Minimum effect of interest", description="Relative effect, must be greater than zero."
)

mean: float = Field(
title="Current mean",
description="""Estimate of the current population mean. If `std` is empty,
it is assumed that the data comes from Bernoulli distribution. In such case,
`mean` must be between zero and one.""",
)

std: Optional[float] = Field(
title="Current standard deviation",
description="""Estimate of the current population standard deviation. If empty,
it is assumed that the data comes from Bernoulli distribution. In such case,
`mean` must be between zero and one.""",
)

confidence_level: float = Field(
title="Confidence level",
default=DEFAULT_CONFIDENCE_LEVEL,
)

power: float = Field(
title="Power",
default=DEFAULT_POWER,
)

class Config:
schema_extra = {
"example": {
"minimum_effect": 0.1,
"mean": 0.2,
"std": 1.2,
"n_variants": 2,
}
}
17 changes: 17 additions & 0 deletions src/epstats/server/res.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,3 +263,20 @@ class Config:
},
}
}


class SampleSizeCalculationResult(BaseModel):
"""
Result of the sample size calculation.
"""

sample_size_per_variant: int = Field(
title="Sample size per variant",
)

class Config:
schema_extra = {
"example": {
"sample_size_per_variant": 100_000,
}
}
2 changes: 1 addition & 1 deletion src/epstats/toolkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .check import Check, SrmCheck, SimpleSrmCheck, SumRatioCheck
from .parser import Parser, EpGoal
from .dao import Dao, DaoFactory
from .statistics import Statistics
from .statistics import Statistics, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_POWER
4 changes: 2 additions & 2 deletions src/epstats/toolkit/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .utils import get_utc_timestamp, goals_wide_to_long
from .parser import EpGoal, UnitType, AggType, Goal

from .statistics import Statistics
from .statistics import Statistics, DEFAULT_CONFIDENCE_LEVEL


class Evaluation:
Expand Down Expand Up @@ -131,7 +131,7 @@ def __init__(
date_from: str = None,
date_to: str = None,
date_for: str = None,
confidence_level: float = 0.95,
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
variants: List[str] = None,
statsd: StatsClient = StatsClient(),
filters: List[Filter] = None,
Expand Down
114 changes: 114 additions & 0 deletions src/epstats/toolkit/statistics.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import pandas as pd
import numpy as np
import scipy.stats as st
from typing import Optional
from statsmodels.stats.multitest import multipletests
import warnings

DEFAULT_CONFIDENCE_LEVEL = 0.95
DEFAULT_POWER = 0.8


class Statistics:
"""
Expand Down Expand Up @@ -233,3 +237,113 @@ def obf_alpha_spending_function(cls, confidence_level: int, total_length: int, a
q = st.norm.ppf(1 - alpha / 2) # quantile of normal distribution
alpha_adj = 2 - 2 * st.norm.cdf(q / np.sqrt(t))
return np.round(1 - alpha_adj, decimals=4)

@staticmethod
def required_sample_size_per_variant(
n_variants: int,
minimum_effect: float,
mean: float,
std: float,
std_2: Optional[float] = None,
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
power: float = DEFAULT_POWER,
) -> int:
"""
Computes the sample size required to reach the defined `confidence_level` and `power`.
Uses the following formula:
N = (Z_{1-alpha/2} + Z_{1-beta})^2 * (s_1^2 + s_2^2) / delta^2
When `std_2` is unknown, we assume equal variance:
N = (Z_{1-alpha/2} + Z_{1-beta})^2 * (2 * s^2) / delta^2
For `confidence_level = 0.95` and `power = 0.8`:
N = 7.84 * 2 * s^2 / delta^2
N = 15.7 * s^2 / delta^2
Arguments:
n_variants: number of variants in the experiment
minimum_effect: minimum (relative) effect that we find meaningful to detect
mean: estimate of the current population mean,
also known as rate in case of Bernoulli distribution
std: estimate of the current population standard deviation
std_2: estimate of the treatment population standard deviation
confidence_level: confidence level of the test
power: power of the test
Returns:
required sample size
"""

if minimum_effect < 0:
raise ValueError("minimum_effect must be greater than zero.")

if n_variants < 2:
raise ValueError("There must be at least two variants.")

two_vars = 2 * (std ** 2) if std_2 is None else (std ** 2 + std_2 ** 2)
delta = mean * minimum_effect

alpha = 1 - confidence_level
m = n_variants - 1
alpha = alpha / m # Bonferroni correction
# 7.84 for 80% power and 95% confidence, alpha / 2 for two-sided hypothesis
confidence_and_power = (st.norm.ppf(1 - alpha / 2) + st.norm.ppf(power)) ** 2
samples_size_per_variant = confidence_and_power * (two_vars / delta ** 2)
return round(samples_size_per_variant)

@classmethod
def required_sample_size_per_variant_bernoulli(
cls,
n_variants: int,
minimum_effect: float,
mean: float,
confidence_level: float = DEFAULT_CONFIDENCE_LEVEL,
power: float = DEFAULT_POWER,
**unused_kwargs,
) -> int:
"""
Computes the sample size required to reach the defined `confidence_level`
and `power` when the data follow Bernoulli distribution
Uses the following formula:
N = (Z_{1-alpha/2} + Z_{1-beta})^2 * (s_1^2 + s_2^2) / delta^2
s_1^2 = p_1 * (1 - p_1)
p_2 = p_1 * (1 + mei)
s_2^2 = (p_2 * (1 - p_2)
Arguments:
n_variants: number of variants in the experiment
minimum_effect: minimum (relative) effect that we find meaningful to detect
mean: estimate of the current population mean,
also known as rate in case of Bernoulli distribution
confidence_level: confidence level of the test
power: power of the test
Returns:
required sample size
"""

if mean > 1 or mean < 0:
raise ValueError(f"mean must be between zero and one, received {mean}.")

# if we know minimum effect, we know treatment mean and treatment variance
# see https://github.com/bookingcom/powercalculator/blob/master/src/js/math.js#L113

def get_std(mean):
return np.sqrt(mean * (1 - mean))

mean_2 = mean * (1 + minimum_effect)

return cls.required_sample_size_per_variant(
n_variants=n_variants,
minimum_effect=minimum_effect,
mean=mean,
std=get_std(mean),
std_2=get_std(mean_2),
confidence_level=confidence_level,
power=power,
)
49 changes: 49 additions & 0 deletions tests/epstats/server/test_api_sample_size_calculation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest
from fastapi.testclient import TestClient

from src.epstats.main import api
from src.epstats.main import get_statsd, get_executor_pool

from .depend import get_test_executor_pool, get_test_statsd


client = TestClient(api)
api.dependency_overrides[get_statsd] = get_test_statsd
api.dependency_overrides[get_executor_pool] = get_test_executor_pool


@pytest.mark.parametrize(
"n_variants, minimum_effect, mean, std, expected",
[(2, 0.10, 0.2, 1.2, 56512), (2, 0.05, 0.4, None, 9489), (3, 0.05, 0.4, None, 11492)],
)
def test_sample_size_calculation(n_variants, minimum_effect, mean, std, expected):
json_blob = {
"minimum_effect": minimum_effect,
"mean": mean,
"std": std,
"n_variants": n_variants,
}

resp = client.post("/sample-size-calculation", json=json_blob)
assert resp.status_code == 200
assert resp.json()["sample_size_per_variant"] == expected


@pytest.mark.parametrize(
"n_variants, minimum_effect, mean, expected_message",
[
(2, -0.4, 0.2, "minimum_effect must be greater than zero"),
(2, 0.05, 1.4, "mean must be between zero and one"),
(1, 0.05, 0.2, "must be at least two variants"),
],
)
def test_sample_size_calculation_error(n_variants, minimum_effect, mean, expected_message):
json_blob = {
"minimum_effect": minimum_effect,
"mean": mean,
"n_variants": n_variants,
}

resp = client.post("/sample-size-calculation", json=json_blob)
assert resp.status_code == 500
assert expected_message in resp.content.decode()
Loading

0 comments on commit 5cf6139

Please sign in to comment.