feat: initial version of backtest tool (#282)

Initial version of the backtesting tool, which currently supports:
- Running univariate analysis on Prometheus data
- Training models
- Generating scores
- Saving plots
- A CLI tool

In addition, the Prometheus data client now uses a thread pool, for a 60% speedup on 8 days of data.

---------

Signed-off-by: Avik Basu <[email protected]>
numaproj · Sep 14, 2023 · a364721 · a364721
1 parent 0cdc257
commit a364721
Show file tree

Hide file tree

Showing 34 changed files with 2,071 additions and 688 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,3 +27,7 @@ repos:
  - id: check-ast
  - id: check-case-conflict
  - id: check-docstring-first
+- repo: https://github.com/python-poetry/poetry
+ rev: "1.6"
+ hooks:
+ - id: poetry-check
diff --git a/Dockerfile b/Dockerfile
@@ -41,7 +41,7 @@ WORKDIR $PYSETUP_PATH
 COPY ./pyproject.toml ./poetry.lock ./
 
 # TODO install cpu/gpu based on args/arch
-RUN poetry install --without dev --no-cache --no-root -E numaflow --extras "${INSTALL_EXTRAS}" && \
+RUN poetry install --without dev --no-cache --no-root --extras "${INSTALL_EXTRAS}" && \
  poetry run pip install --no-cache "torch>=2.0,<3.0" --index-url https://download.pytorch.org/whl/cpu && \
  poetry run pip install --no-cache "pytorch-lightning>=2.0<3.0" && \
  rm -rf ~/.cache/pypoetry/
@@ -51,4 +51,4 @@ WORKDIR /app
 
 ENTRYPOINT ["/usr/bin/dumb-init", "--"]
 
-EXPOSE 5000
+EXPOSE 5000
diff --git a/Makefile b/Makefile
@@ -23,7 +23,7 @@ lint: format
 
 # install all dependencies
 setup:
- poetry install --with dev,torch --all-extras --no-root
+ poetry install --with dev,torch --all-extras
 
 # test your application (tests in the tests/ directory)
 test:

diff --git a/numalogic/backtest/__init__.py b/numalogic/backtest/__init__.py
@@ -0,0 +1,11 @@
+from importlib.util import find_spec
+
+
+def _validate_req_pkgs():
+    """Check that the ``torch`` and ``pytorch_lightning`` packages can be located.
+
+    Raises:
+        ModuleNotFoundError: if either of the two packages cannot be found
+    """
+    required = ("torch", "pytorch_lightning")
+    # any() stops at the first missing package, like the original `or` chain.
+    if any(find_spec(pkg) is None for pkg in required):
+        _msg = "Pytorch and/or Pytorch lightning is not installed. Please install them first."
+        raise ModuleNotFoundError(_msg)
+
+
+_validate_req_pkgs()
diff --git a/numalogic/backtest/__main__.py b/numalogic/backtest/__main__.py
@@ -0,0 +1,102 @@
+# Copyright 2022 The Numaproj Authors.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+from pathlib import Path
+from typing import Annotated
+from typing import Optional
+
+import typer
+
+import numalogic.backtest._bt as bt
+from numalogic.backtest._clifunc import clear_outputs, train_models, generate_scores
+
+logging.basicConfig(level=logging.INFO)
+
+
+app = typer.Typer()
+app.add_typer(bt.app, name="backtest")
+
+
+@app.command()
+def clear(
+    appname: Optional[str] = None,
+    metric: Optional[str] = None,
+    output_dir: Annotated[Optional[Path], typer.Option()] = None,
+    all_: Annotated[bool, typer.Option("--all")] = False,
+):
+    """CLI entrypoint for clearing the backtest output files.
+
+    Args:
+        appname: application name whose outputs should be cleared
+        metric: metric name whose outputs should be cleared
+        output_dir: root output directory; defaults to ``.btoutput`` under
+            the current working directory
+        all_: (``--all``) remove the whole output directory instead of the
+            directory resolved for a single appname/metric pair
+
+    Raises:
+        typer.Abort: if ``--all`` is not set and appname or metric is missing
+    """
+    if not output_dir:
+        output_dir = os.path.join(os.getcwd(), ".btoutput")
+
+    if all_:
+        print(f"Clearing all the backtest output files in {output_dir}")
+        # A missing directory simply means there is nothing to clear.
+        try:
+            shutil.rmtree(output_dir)
+        except FileNotFoundError:
+            pass
+        return
+
+    if not (appname and metric):
+        print("Both appname and metric needs to be provided!")
+        # Exit non-zero on a usage error, as the train/score commands do.
+        raise typer.Abort()
+
+    clear_outputs(appname=appname, metric=metric, output_dir=output_dir)
+
+
+@app.command()
+def train(
+    data_file: Annotated[Optional[Path], typer.Option()] = None,
+    col_name: Annotated[Optional[str], typer.Option()] = None,
+    ts_col_name: Annotated[str, typer.Option()] = "timestamp",
+    train_ratio: Annotated[float, typer.Option()] = 0.9,
+    output_dir: Annotated[Optional[Path], typer.Option()] = None,
+):
+    """Train models on one column of a data file (CLI command).
+
+    Args:
+        data_file: path of the data file to train on
+        col_name: name of the data column to train on
+        ts_col_name: name of the timestamp column
+        train_ratio: training ratio, forwarded unchanged to ``train_models``
+        output_dir: output directory; defaults to ``.btoutput`` under the
+            current working directory
+
+    Raises:
+        typer.Abort: if ``data_file`` or ``col_name`` is not provided
+    """
+    if None in (data_file, col_name):
+        print("No data file or column name provided!")
+        raise typer.Abort()
+
+    train_models(
+        data_file=data_file,
+        col_name=col_name,
+        ts_col_name=ts_col_name,
+        train_ratio=train_ratio,
+        output_dir=output_dir or os.path.join(os.getcwd(), ".btoutput"),
+    )
+
+
+@app.command()
+def score(
+    data_file: Annotated[Optional[Path], typer.Option()] = None,
+    col_name: Annotated[Optional[str], typer.Option()] = None,
+    ts_col_name: Annotated[str, typer.Option()] = "timestamp",
+    model_path: Annotated[Optional[Path], typer.Option()] = None,
+    test_ratio: Annotated[float, typer.Option()] = 1.0,
+):
+    """Generate scores for one column of a data file (CLI command).
+
+    Args:
+        data_file: path of the data file to score
+        col_name: name of the data column to score
+        ts_col_name: name of the timestamp column
+        model_path: model path, forwarded unchanged to ``generate_scores``
+        test_ratio: test ratio, forwarded unchanged to ``generate_scores``
+
+    Raises:
+        typer.Abort: if ``data_file`` or ``col_name`` is not provided
+    """
+    if None in (data_file, col_name):
+        print("No data file or column name provided!")
+        raise typer.Abort()
+
+    score_kwargs = {
+        "data_file": data_file,
+        "col_name": col_name,
+        "ts_col_name": ts_col_name,
+        "model_path": model_path,
+        "test_ratio": test_ratio,
+    }
+    generate_scores(**score_kwargs)
diff --git a/numalogic/backtest/_bt.py b/numalogic/backtest/_bt.py
@@ -0,0 +1,50 @@
+# Copyright 2022 The Numaproj Authors.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Annotated, Optional
+
+import typer
+
+from numalogic.backtest._clifunc import univar_backtest, multivar_backtest
+from numalogic.backtest._constants import DEFAULT_PROM_LOCALHOST
+
+app = typer.Typer()
+
+
+@app.command()
+def univariate(
+    namespace: Annotated[str, typer.Argument(help="Namespace name")],
+    appname: Annotated[str, typer.Argument(help="Application name")],
+    metric: Annotated[str, typer.Argument(help="The timeseries metric to analyze")],
+    url: Annotated[
+        str, typer.Option(envvar="PROM_URL", help="Endpoint URL for datafetching")
+    ] = DEFAULT_PROM_LOCALHOST,
+    lookback_days: Annotated[int, typer.Option(help="Number of days of data to fetch")] = 8,
+    output_dir: Annotated[Optional[str], typer.Option(help="Output directory")] = None,
+):
+    """Run a backtest for a single metric (CLI command).
+
+    Args:
+        namespace: namespace name
+        appname: application name
+        metric: the timeseries metric to analyze
+        url: endpoint URL for data fetching; can also be set through the
+            ``PROM_URL`` environment variable
+        lookback_days: number of days of data to fetch
+        output_dir: output directory; defaults to ``.btoutput`` under the
+            current working directory
+    """
+    univar_backtest(
+        namespace=namespace,
+        appname=appname,
+        metric=metric,
+        url=url,
+        lookback_days=lookback_days,
+        output_dir=output_dir or os.path.join(os.getcwd(), ".btoutput"),
+    )
+
+
+@app.command()
+def multivariate():
+    """Run a multivariate backtest over multiple metrics (CLI command).
+
+    Takes no options and delegates directly to ``multivar_backtest``.
+    """
+    multivar_backtest()
diff --git a/numalogic/backtest/_clifunc.py b/numalogic/backtest/_clifunc.py
@@ -0,0 +1,90 @@
+# Copyright 2022 The Numaproj Authors.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import shutil
+from pathlib import Path
+from typing import Union
+
+import pandas as pd
+
+from numalogic.backtest._prom import PromUnivarBacktester
+from numalogic.tools.exceptions import DataFormatError
+
+
+logger = logging.getLogger(__name__)
+
+
+def univar_backtest(
+    namespace: str, appname: str, metric: str, url: str, lookback_days: int, output_dir: str
+):
+    """Read data, train models, generate scores and save plots for one metric.
+
+    Args:
+        namespace: namespace name
+        appname: application name
+        metric: metric name
+        url: endpoint URL passed to the backtester
+        lookback_days: lookback period in days passed to the backtester
+        output_dir: output directory passed to the backtester
+    """
+    bt_kwargs = {"lookback_days": lookback_days, "output_dir": output_dir}
+    tester = PromUnivarBacktester(url, namespace, appname, metric, **bt_kwargs)
+
+    raw_df = tester.read_data()
+    tester.train_models(raw_df)
+    # Scores are computed on the same dataframe the models were trained on.
+    tester.save_plots(tester.generate_scores(raw_df))
+
+
+def multivar_backtest(*_, **__):
+    """Run backtest for multiple metrics in a multivariate fashion.
+
+    Placeholder: all positional and keyword arguments are accepted and ignored.
+
+    Raises:
+        NotImplementedError: always, since multivariate backtesting is not
+            implemented
+    """
+    raise NotImplementedError("Multivariate backtesting is not implemented yet.")
+
+
+def clear_outputs(appname: str, metric: str, output_dir: Union[Path, str]) -> None:
+    """Clear the backtest output files for one appname/metric pair.
+
+    Args:
+        appname: application name
+        metric: metric name
+        output_dir: root output directory, passed as ``outdir`` to
+            ``PromUnivarBacktester.get_outdir``
+    """
+    _dir = PromUnivarBacktester.get_outdir(appname, metric, outdir=output_dir)
+    logger.info("Clearing backtest output files in %s", _dir)
+    # A missing directory means there is nothing to clear; the CLI's
+    # `clear --all` path already tolerates this, so do the same here.
+    try:
+        shutil.rmtree(_dir)
+    except FileNotFoundError:
+        logger.warning("No backtest output files found in %s", _dir)
+
+
+def _read_timeseries_csv(data_file: Union[Path, str], ts_col_name: str) -> pd.DataFrame:
+    """Load a CSV file into a dataframe indexed by its parsed timestamp column.
+
+    Raises:
+        DataFormatError: if ``ts_col_name`` is not a column of the file
+    """
+    df = pd.read_csv(data_file)
+    try:
+        df = df.set_index([ts_col_name])
+    except KeyError:
+        raise DataFormatError(f"Timestamp column {ts_col_name} not found in the data!") from None
+
+    df.index = pd.to_datetime(df.index)
+    return df
+
+
+def train_models(
+    data_file: Union[Path, str],
+    col_name: str,
+    ts_col_name: str,
+    train_ratio: float,
+    output_dir: Union[Path, str],
+):
+    """Train models for the given data.
+
+    Args:
+        data_file: path of the CSV file to read
+        col_name: data column name, passed to the backtester as the metric
+        ts_col_name: name of the timestamp column, used as the index
+        train_ratio: training ratio; the backtester receives
+            ``test_ratio = 1 - train_ratio``
+        output_dir: output directory passed to the backtester
+
+    Raises:
+        DataFormatError: if ``ts_col_name`` is not a column of the file
+    """
+    # The backtester is built before the file is read, as in the original flow.
+    backtester = PromUnivarBacktester(
+        "", "", "", col_name, test_ratio=(1 - train_ratio), output_dir=output_dir
+    )
+    df = _read_timeseries_csv(data_file, ts_col_name)
+    backtester.train_models(df)
+
+
+def generate_scores(
+    data_file: Union[Path, str],
+    col_name: str,
+    ts_col_name: str,
+    model_path: Union[Path, str],
+    test_ratio: float,
+):
+    """Generate scores for the given data.
+
+    Args:
+        data_file: path of the CSV file to read
+        col_name: data column name, passed to the backtester as the metric
+        ts_col_name: name of the timestamp column, used as the index
+        model_path: model path passed to the backtester's ``generate_scores``
+        test_ratio: test ratio passed to the backtester
+
+    Raises:
+        DataFormatError: if ``ts_col_name`` is not a column of the file
+    """
+    backtester = PromUnivarBacktester("", "", "", col_name, test_ratio=test_ratio)
+    df = _read_timeseries_csv(data_file, ts_col_name)
+    # NOTE(review): the value returned by backtester.generate_scores is discarded
+    # here; confirm the backtester persists the scores itself.
+    backtester.generate_scores(df, model_path=model_path)
diff --git a/numalogic/backtest/_constants.py b/numalogic/backtest/_constants.py
@@ -0,0 +1,8 @@
+import os
+from typing import Final
+
+from numalogic._constants import BASE_DIR
+
+# Default backtest output directory, located under numalogic's BASE_DIR.
+DEFAULT_OUTPUT_DIR: Final[str] = os.path.join(BASE_DIR, ".btoutput")
+# Default sequence length (presumably the model input window size; confirm against usage).
+DEFAULT_SEQUENCE_LEN: Final[int] = 12
+# Default endpoint URL of a locally running Prometheus server.
+DEFAULT_PROM_LOCALHOST: Final[str] = "http://localhost:9090/"