Skip to content

Commit

Permalink
feat: initial version of backtest tool (#282)
Browse files Browse the repository at this point in the history
Backtesting tool currently with the capability of:
- Running univariate analysis on Prometheus data
- Train models
- Generate scores
- Save plots
- CLI tool
- Prometheus data client now uses Threadpool for a 60% speedup on 8 days
of data

---------

Signed-off-by: Avik Basu <[email protected]>
  • Loading branch information
ab93 committed Sep 14, 2023
1 parent 0cdc257 commit a364721
Show file tree
Hide file tree
Showing 34 changed files with 2,071 additions and 688 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ repos:
- id: check-ast
- id: check-case-conflict
- id: check-docstring-first
- repo: https://github.com/python-poetry/poetry
rev: "1.6"
hooks:
- id: poetry-check
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ WORKDIR $PYSETUP_PATH
COPY ./pyproject.toml ./poetry.lock ./

# TODO install cpu/gpu based on args/arch
RUN poetry install --without dev --no-cache --no-root -E numaflow --extras "${INSTALL_EXTRAS}" && \
RUN poetry install --without dev --no-cache --no-root --extras "${INSTALL_EXTRAS}" && \
poetry run pip install --no-cache "torch>=2.0,<3.0" --index-url https://download.pytorch.org/whl/cpu && \
poetry run pip install --no-cache "pytorch-lightning>=2.0<3.0" && \
rm -rf ~/.cache/pypoetry/
Expand All @@ -51,4 +51,4 @@ WORKDIR /app

ENTRYPOINT ["/usr/bin/dumb-init", "--"]

EXPOSE 5000
EXPOSE 5000
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ lint: format

# install all dependencies
setup:
poetry install --with dev,torch --all-extras --no-root
poetry install --with dev,torch --all-extras

# test your application (tests in the tests/ directory)
test:
Expand Down
11 changes: 11 additions & 0 deletions numalogic/backtest/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from importlib.util import find_spec


def _validate_req_pkgs():
if (not find_spec("torch")) or (not find_spec("pytorch_lightning")):
raise ModuleNotFoundError(
"Pytorch and/or Pytorch lightning is not installed. Please install them first."
)


# Run the check at import time so that importing the package fails immediately
# with a clear message when torch or pytorch_lightning is missing.
_validate_req_pkgs()
102 changes: 102 additions & 0 deletions numalogic/backtest/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright 2022 The Numaproj Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import shutil
from pathlib import Path
from typing import Annotated
from typing import Optional

import typer

import numalogic.backtest._bt as bt
from numalogic.backtest._clifunc import clear_outputs, train_models, generate_scores

# Set the root logger to INFO so informational messages are emitted.
logging.basicConfig(level=logging.INFO)


# Top-level CLI application; the commands declared on ``bt.app`` are mounted
# under the "backtest" sub-command group.
app = typer.Typer()
app.add_typer(bt.app, name="backtest")


@app.command()
def clear(
    appname: Optional[str] = None,
    metric: Optional[str] = None,
    output_dir: Annotated[Optional[Path], typer.Option()] = None,
    all_: Annotated[bool, typer.Option("--all")] = False,
):
    """CLI entrypoint for clearing the backtest output files."""
    # NOTE: the docstring above doubles as the command's --help text.
    # Default to a ".btoutput" directory under the current working directory.
    target_dir = output_dir or os.path.join(os.getcwd(), ".btoutput")

    if not all_:
        # Selective clearing needs both identifiers to locate the outputs.
        if appname and metric:
            clear_outputs(appname=appname, metric=metric, output_dir=target_dir)
        else:
            print("Both appname and metric needs to be provided!")
        return

    print(f"Clearing all the backtest output files in {target_dir}")
    try:
        shutil.rmtree(target_dir)
    except FileNotFoundError:
        # Nothing to remove: a missing output directory is not an error here.
        pass


@app.command()
def train(
    data_file: Annotated[Optional[Path], typer.Option()] = None,
    col_name: Annotated[Optional[str], typer.Option()] = None,
    ts_col_name: Annotated[str, typer.Option()] = "timestamp",
    train_ratio: Annotated[float, typer.Option()] = 0.9,
    output_dir: Annotated[Optional[Path], typer.Option()] = None,
):
    """CLI entrypoint for training models for the given data."""
    # NOTE: the docstring above doubles as the command's --help text.
    # The data file and the column name are declared as optional CLI options,
    # but training cannot proceed without both of them.
    if any(arg is None for arg in (data_file, col_name)):
        print("No data file or column name provided!")
        raise typer.Abort()

    # Default to a ".btoutput" directory under the current working directory.
    destination = output_dir or os.path.join(os.getcwd(), ".btoutput")

    train_models(
        data_file=data_file,
        col_name=col_name,
        ts_col_name=ts_col_name,
        train_ratio=train_ratio,
        output_dir=destination,
    )


@app.command()
def score(
    data_file: Annotated[Optional[Path], typer.Option()] = None,
    col_name: Annotated[Optional[str], typer.Option()] = None,
    ts_col_name: Annotated[str, typer.Option()] = "timestamp",
    model_path: Annotated[Optional[Path], typer.Option()] = None,
    test_ratio: Annotated[float, typer.Option()] = 1.0,
):
    """CLI entrypoint for generating scores for the given data."""
    # NOTE: the docstring above doubles as the command's --help text.
    # Scoring needs both the input file and the column to score; abort the
    # command when either one was not supplied.
    if any(arg is None for arg in (data_file, col_name)):
        print("No data file or column name provided!")
        raise typer.Abort()

    # model_path is forwarded unchanged and may still be None at this point.
    generate_scores(
        data_file=data_file,
        col_name=col_name,
        ts_col_name=ts_col_name,
        model_path=model_path,
        test_ratio=test_ratio,
    )
50 changes: 50 additions & 0 deletions numalogic/backtest/_bt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2022 The Numaproj Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Annotated, Optional

import typer

from numalogic.backtest._clifunc import univar_backtest, multivar_backtest
from numalogic.backtest._constants import DEFAULT_PROM_LOCALHOST

app = typer.Typer()


@app.command()
def univariate(
    namespace: Annotated[str, typer.Argument(help="Namespace name")],
    appname: Annotated[str, typer.Argument(help="Application name")],
    metric: Annotated[str, typer.Argument(help="The timeseries metric to analyze")],
    url: Annotated[
        str, typer.Option(envvar="PROM_URL", help="Endpoint URL for datafetching")
    ] = DEFAULT_PROM_LOCALHOST,
    lookback_days: Annotated[int, typer.Option(help="Number of days of data to fetch")] = 8,
    output_dir: Annotated[Optional[str], typer.Option(help="Output directory")] = None,
):
    """CLI entry point for backtest run for a single metric."""
    # NOTE: the docstring above doubles as the command's --help text.
    # Default to a ".btoutput" directory under the current working directory.
    destination = output_dir or os.path.join(os.getcwd(), ".btoutput")
    univar_backtest(
        namespace=namespace,
        appname=appname,
        metric=metric,
        url=url,
        lookback_days=lookback_days,
        output_dir=destination,
    )


@app.command()
def multivariate():
    """CLI entry point for backtest run for multiple metrics in a multivariate fashion."""
    # Takes no CLI arguments or options and simply delegates to
    # multivar_backtest, which currently raises NotImplementedError.
    multivar_backtest()
90 changes: 90 additions & 0 deletions numalogic/backtest/_clifunc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Copyright 2022 The Numaproj Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import shutil
from pathlib import Path
from typing import Union

import pandas as pd

from numalogic.backtest._prom import PromUnivarBacktester
from numalogic.tools.exceptions import DataFormatError


logger = logging.getLogger(__name__)


def univar_backtest(
    namespace: str, appname: str, metric: str, url: str, lookback_days: int, output_dir: str
):
    """Run the complete backtest pipeline for one metric.

    Builds a ``PromUnivarBacktester`` and then, in order, reads the data,
    trains the models, generates scores on the same dataframe and saves
    the plots for the scored output.

    Args:
        namespace: namespace name passed to the backtester
        appname: application name passed to the backtester
        metric: name of the metric to backtest
        url: endpoint URL passed to the backtester
        lookback_days: number of days of data to use
        output_dir: directory handed to the backtester for its outputs
    """
    runner = PromUnivarBacktester(
        url,
        namespace,
        appname,
        metric,
        lookback_days=lookback_days,
        output_dir=output_dir,
    )
    data = runner.read_data()
    runner.train_models(data)
    scored = runner.generate_scores(data)
    runner.save_plots(scored)


def multivar_backtest(*_, **__):
    """Run backtest for multiple metrics in a multivariate fashion.

    This is a placeholder: every positional and keyword argument is accepted
    and ignored.

    Raises:
        NotImplementedError: always, since multivariate backtesting is not
            implemented yet.
    """
    # Carry a message so that CLI users see why the command failed.
    raise NotImplementedError("Multivariate backtesting is not implemented yet.")


def clear_outputs(appname: str, metric: str, output_dir: str) -> None:
    """Clear the backtest output files of a single app/metric pair.

    The directory to delete is resolved through
    ``PromUnivarBacktester.get_outdir``. A directory that does not exist is
    treated as already cleared, mirroring how ``clear --all`` tolerates a
    missing output directory.

    Args:
        appname: application name whose outputs should be removed
        metric: metric name whose outputs should be removed
        output_dir: base output directory of the backtest runs
    """
    _dir = PromUnivarBacktester.get_outdir(appname, metric, outdir=output_dir)
    logger.info("Clearing backtest output files in %s", _dir)
    try:
        shutil.rmtree(_dir)
    except FileNotFoundError:
        # Nothing to delete; report it instead of crashing with a traceback.
        logger.warning("No backtest output files found in %s", _dir)


def _read_indexed_csv(data_file: Union[Path, str], ts_col_name: str) -> pd.DataFrame:
    """Load a CSV file into a dataframe indexed by its parsed timestamp column.

    Args:
        data_file: path of the CSV file to read
        ts_col_name: name of the column holding the timestamps

    Returns:
        Dataframe whose index is the ``ts_col_name`` column converted with
        ``pd.to_datetime``.

    Raises:
        DataFormatError: if ``ts_col_name`` is not a column of the file
    """
    df = pd.read_csv(data_file)
    try:
        df = df.set_index([ts_col_name])
    except KeyError:
        # Hide the pandas KeyError; the domain error already names the column.
        raise DataFormatError(f"Timestamp column {ts_col_name} not found in the data!") from None

    df.index = pd.to_datetime(df.index)
    return df


def train_models(
    data_file: Union[Path, str],
    col_name: str,
    ts_col_name: str,
    train_ratio: float,
    output_dir: Union[Path, str],
):
    """Train models for the given data.

    Args:
        data_file: path of the CSV file holding the training data
        col_name: name of the column (metric) to train on
        ts_col_name: name of the timestamp column used as the index
        train_ratio: fraction of the data used for training; the backtester
            receives the complement ``1 - train_ratio`` as its test ratio
        output_dir: directory handed to the backtester for its outputs

    Raises:
        DataFormatError: if ``ts_col_name`` is not a column of the file
    """
    # The URL, namespace and app name are left empty because the data comes
    # from a local file; the backtester is built before the file is read.
    backtester = PromUnivarBacktester(
        "", "", "", col_name, test_ratio=(1 - train_ratio), output_dir=output_dir
    )
    df = _read_indexed_csv(data_file, ts_col_name)
    backtester.train_models(df)


def generate_scores(
    data_file: Union[Path, str],
    col_name: str,
    ts_col_name: str,
    model_path: Union[Path, str, None],
    test_ratio: float,
):
    """Generate scores for the given data.

    Args:
        data_file: path of the CSV file holding the data to score
        col_name: name of the column (metric) to score
        ts_col_name: name of the timestamp column used as the index
        model_path: location of the trained model, forwarded unchanged to the
            backtester; the ``score`` CLI command passes None when the option
            is omitted
        test_ratio: test ratio passed to the backtester

    Raises:
        DataFormatError: if ``ts_col_name`` is not a column of the file
    """
    # The URL, namespace and app name are left empty because the data comes
    # from a local file; the backtester is built before the file is read.
    backtester = PromUnivarBacktester("", "", "", col_name, test_ratio=test_ratio)
    df = _read_indexed_csv(data_file, ts_col_name)
    backtester.generate_scores(df, model_path=model_path)
8 changes: 8 additions & 0 deletions numalogic/backtest/_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import os
from typing import Final

from numalogic._constants import BASE_DIR

# Default backtest output location: a ".btoutput" directory under BASE_DIR.
DEFAULT_OUTPUT_DIR: Final[str] = os.path.join(BASE_DIR, ".btoutput")
# NOTE(review): presumably the default sequence/window length used by the
# backtest models - confirm against the code that consumes it.
DEFAULT_SEQUENCE_LEN: Final[int] = 12
# Endpoint used by the backtest CLI when neither --url nor PROM_URL is given.
DEFAULT_PROM_LOCALHOST: Final[str] = "http://localhost:9090/"
Loading

0 comments on commit a364721

Please sign in to comment.