chore!: refactor preproc and postproc into transforms module (#201)

- Move preprocess and postprocess files & modules -> transforms - Avoid setting up handler in loggers - Add base classes for extensibility --------- Signed-off-by: Avik Basu <[email protected]>
numaproj · Jun 1, 2023 · 2298be4 · 2298be4
1 parent 7b3531a
commit 2298be4
Show file tree

Hide file tree

Showing 29 changed files with 502 additions and 394 deletions.
diff --git a/Makefile b/Makefile
@@ -27,7 +27,7 @@ setup:
 
 # test your application (tests in the tests/ directory)
 test:
- poetry run pytest tests/
+ poetry run pytest -v tests/
 
 publish:
  @rm -rf dist

diff --git a/docs/post-processing.md b/docs/post-processing.md
@@ -7,16 +7,17 @@ Tanh normalization step is an optional step, where we normalize the anomalies be
 
 ```python
 import numpy as np
-from numalogic.postprocess import tanh_norm
+from numalogic.transforms import tanh_norm
 
 raw_anomaly_score = np.random.randn(10, 2)
 test_anomaly_score_norm = tanh_norm(raw_anomaly_score)
 ```
 
 A scikit-learn compatible API is also available.
+
 ```python
 import numpy as np
-from numalogic.postprocess import TanhNorm
+from numalogic.transforms import TanhNorm
 
 raw_score = np.random.randn(10, 2)
 
@@ -35,7 +36,7 @@ facilitating timely detection and response to potential anomalies.
 
 ```python
 import numpy as np
-from numalogic.postprocess import ExpMovingAverage
+from numalogic.transforms import ExpMovingAverage
 
 raw_score = np.array([1.0, 1.5, 1.2, 3.5, 2.7, 5.6, 7.1, 6.9, 4.2, 1.1]).reshape(-1, 1)
 

diff --git a/docs/pre-processing.md b/docs/pre-processing.md
@@ -20,7 +20,7 @@ Log transformation reduces the variance in some distributions, especially with l
 import numpy as np
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import MinMaxScaler
-from numalogic.preprocess.transformer import LogTransformer
+from numalogic.transforms import LogTransformer
 
 # Generate some random train and test data
 x_train = np.random.randn(100, 3)
@@ -43,7 +43,7 @@ When `add_factor` is provided, each data point x is converted to (x + add_factor
 
 ```python
 import numpy as np
-from numalogic.preprocess.transformer import StaticPowerTransformer
+from numalogic.transforms import StaticPowerTransformer
 
 # Generate some random train and test data
 x_train = np.random.randn(100, 3)
@@ -65,7 +65,7 @@ in the data.
 
 ```python
 import numpy as np
-from numalogic.preprocess import TanhScaler
+from numalogic.transforms import TanhScaler
 
 # Generate some random train and test data
 x_train = np.random.randn(100, 3)

diff --git a/docs/quick-start.md b/docs/quick-start.md
@@ -31,7 +31,7 @@ from torch.utils.data import DataLoader
 from numalogic.models.autoencoder import AutoencoderTrainer
 from numalogic.models.autoencoder.variants import VanillaAE
 from numalogic.models.threshold import StdDevThreshold
-from numalogic.postprocess import TanhNorm
+from numalogic.transforms import TanhNorm
 from numalogic.tools.data import StreamingDataset
 
 # Create some synthetic data

diff --git a/examples/numalogic-simple-pipeline/src/udf/postprocess.py b/examples/numalogic-simple-pipeline/src/udf/postprocess.py
@@ -1,7 +1,7 @@
 import logging
 
 import numpy as np
-from numalogic.postprocess import TanhNorm
+from numalogic.transforms import TanhNorm
 from pynumaflow.function import Messages, Message, Datum
 
 from src.utils import Payload

diff --git a/examples/numalogic-simple-pipeline/src/udf/preprocess.py b/examples/numalogic-simple-pipeline/src/udf/preprocess.py
@@ -3,7 +3,7 @@
 import uuid
 
 import numpy as np
-from numalogic.preprocess.transformer import LogTransformer
+from numalogic.transforms import LogTransformer
 from pynumaflow.function import Messages, Message, Datum
 
 from src.utils import Payload

diff --git a/examples/numalogic-simple-pipeline/src/udf/train.py b/examples/numalogic-simple-pipeline/src/udf/train.py
@@ -6,7 +6,7 @@
 from numalogic.models.autoencoder import AutoencoderTrainer
 from numalogic.models.autoencoder.variants import Conv1dAE
 from numalogic.models.threshold import StdDevThreshold
-from numalogic.preprocess.transformer import LogTransformer
+from numalogic.transforms.transformer import LogTransformer
 from numalogic.tools.data import TimeseriesDataModule
 from pynumaflow.function import Datum, Messages, Message
 

diff --git a/numalogic/__init__.py b/numalogic/__init__.py
@@ -15,11 +15,4 @@
 
 LOGGER = logging.getLogger(__name__)
 LOGGER.setLevel(logging.INFO)
-
-stream_handler = logging.StreamHandler()
-stream_handler.setLevel(logging.INFO)
-
-formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
-stream_handler.setFormatter(formatter)
-
-LOGGER.addHandler(stream_handler)
+LOGGER.addHandler(logging.NullHandler())
diff --git a/numalogic/base.py b/numalogic/base.py
@@ -0,0 +1,53 @@
+# Copyright 2022 The Numaproj Authors.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http:https://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base classes for all models and transforms."""
+
+
+from abc import ABCMeta
+
+import numpy.typing as npt
+import pytorch_lightning as pl
+from sklearn.base import TransformerMixin, BaseEstimator, OutlierMixin
+
+
+class BaseTransformer(TransformerMixin, BaseEstimator):
+ """Base class for all transformer classes."""
+
+ pass
+
+
+class StatelessTransformer(BaseTransformer):
+ """Base class for stateless transforms."""
+
+ def transform(self, input_: npt.NDArray, **__):
+ """Implement the transform method."""
+ raise NotImplementedError("transform method not implemented")
+
+ def fit(self, _: npt.NDArray):
+ """Fit method does nothing for stateless transforms."""
+ return self
+
+ def fit_transform(self, input_: npt.NDArray, _=None, **__):
+ """Return the result of the transform method."""
+ return self.transform(input_)
+
+
+class TorchModel(pl.LightningModule, metaclass=ABCMeta):
+ """Base class for all Pytorch based models."""
+
+ pass
+
+
+class BaseThresholdModel(OutlierMixin, BaseEstimator):
+ """Base class for all threshold models."""
+
+ pass
diff --git a/numalogic/config/factory.py b/numalogic/config/factory.py
@@ -39,7 +39,7 @@ class PreprocessFactory(_ObjectFactory):
  """Factory class to create preprocess instances."""
 
  from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
- from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
+ from numalogic.transforms import LogTransformer, StaticPowerTransformer, TanhScaler
 
  _CLS_MAP = {
  "StandardScaler": StandardScaler,
@@ -55,7 +55,7 @@ class PreprocessFactory(_ObjectFactory):
 class PostprocessFactory(_ObjectFactory):
  """Factory class to create postprocess instances."""
 
- from numalogic.postprocess import TanhNorm, ExpMovingAverage
+ from numalogic.transforms import TanhNorm, ExpMovingAverage
 
  _CLS_MAP = {"TanhNorm": TanhNorm, "ExpMovingAverage": ExpMovingAverage}
 

diff --git a/numalogic/models/autoencoder/base.py b/numalogic/models/autoencoder/base.py
@@ -10,15 +10,15 @@
 # limitations under the License.
 
 
-from abc import ABCMeta
 from typing import Any
 
-import pytorch_lightning as pl
 import torch.nn.functional as F
 from torch import Tensor, optim
 
+from numalogic.base import TorchModel
 
-class BaseAE(pl.LightningModule, metaclass=ABCMeta):
+
+class BaseAE(TorchModel):
  r"""Abstract Base class for all Pytorch based autoencoder models for time-series data.
 
  Args:

diff --git a/numalogic/models/forecast/variants/naive.py b/numalogic/models/forecast/variants/naive.py
@@ -4,7 +4,7 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler, FunctionTransformer
 
-from numalogic.postprocess import tanh_norm
+from numalogic.transforms._postprocess import tanh_norm
 
 
 class BaselineForecaster:

diff --git a/numalogic/models/threshold/_static.py b/numalogic/models/threshold/_static.py
@@ -10,11 +10,12 @@
 # limitations under the License.
 import numpy as np
 import numpy.typing as npt
-from sklearn.base import BaseEstimator
 from typing_extensions import Self
 
+from numalogic.base import BaseThresholdModel
 
-class StaticThreshold(BaseEstimator):
+
+class StaticThreshold(BaseThresholdModel):
  r"""Simple and stateless static thresholding as an estimator.
 
  Values more than upper_limit is considered an outlier,
@@ -65,7 +66,7 @@ def score_samples(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
  return x
 
 
-class SigmoidThreshold(BaseEstimator):
+class SigmoidThreshold(BaseThresholdModel):
  r"""Smooth and stateless static thesholding using sigmoid function as an estimator.
  The values produced.
 

diff --git a/numalogic/models/threshold/_std.py b/numalogic/models/threshold/_std.py
@@ -12,11 +12,12 @@
 
 import numpy as np
 from numpy.typing import NDArray
-from sklearn.base import BaseEstimator
 from typing_extensions import Self
 
+from numalogic.base import BaseThresholdModel
 
-class StdDevThreshold(BaseEstimator):
+
+class StdDevThreshold(BaseThresholdModel):
  r"""Threshold estimator that calculates based on the mean and the std deviation.
 
  Threshold = Mean + (std_factor * Std)
@@ -50,7 +51,7 @@ def std(self):
  def threshold(self):
  return self._threshold
 
- def fit(self, x_train: NDArray[float], y=None) -> Self:
+ def fit(self, x_train: NDArray[float], _=None) -> Self:
  """Fit the estimator on the training set."""
  self._std = np.std(x_train, axis=0)
  self._mean = np.mean(x_train, axis=0)

diff --git a/numalogic/registry/redis_registry.py b/numalogic/registry/redis_registry.py
@@ -5,7 +5,7 @@
 
 from redis.exceptions import RedisError
 
-from numalogic.registry import ArtifactManager, ArtifactData, ArtifactCache
+from numalogic.registry.artifact import ArtifactManager, ArtifactData, ArtifactCache
 from numalogic.registry._serialize import loads, dumps
 from numalogic.tools.exceptions import ModelKeyNotFound, RedisRegistryError
 from numalogic.tools.types import artifact_t, redis_client_t, KEYS, META_T, META_VT
@@ -154,7 +154,7 @@ def __save_artifact(
  production_key = self.__construct_production_key(key)
  pipe.set(name=production_key, value=new_version_key)
  _LOGGER.info(
- "Setting Production key : %d ,to this new key = %s", production_key, new_version_key
+ "Setting Production key : %s ,to this new key = %s", production_key, new_version_key
  )
  serialized_metadata = ""
  if metadata:

diff --git a/numalogic/tools/__init__.py b/numalogic/tools/__init__.py
@@ -1,9 +0,0 @@
-from numpy.typing import ArrayLike
-from sklearn.base import TransformerMixin, BaseEstimator
-
-
-class DataIndependentTransformers(TransformerMixin, BaseEstimator):
- """Base class for stateless transforms."""
-
- def fit(self, _: ArrayLike):
- return self

diff --git a/numalogic/tools/types.py b/numalogic/tools/types.py
@@ -12,16 +12,21 @@
 
 from typing import Union, TypeVar
 from collections.abc import Sequence
-
-from redis.client import AbstractRedis
 from sklearn.base import BaseEstimator
 from torch import nn
 
+
+try:
+ from redis.client import AbstractRedis
+except ImportError:
+ pass
+else:
+ redis_client_t = TypeVar("redis_client_t", bound=AbstractRedis, covariant=True)
+
 artifact_t = TypeVar("artifact_t", bound=Union[nn.Module, BaseEstimator], covariant=True)
 META_T = TypeVar("META_T", bound=dict[str, Union[str, float, int, list, dict]])
 META_VT = TypeVar("META_VT", str, int, float, list, dict)
 EXTRA_T = TypeVar("EXTRA_T", bound=dict[str, Union[str, list, dict]])
-redis_client_t = TypeVar("redis_client_t", bound=AbstractRedis, covariant=True)
 KEYS = TypeVar("KEYS", bound=Sequence[str], covariant=True)
 
 

diff --git a/numalogic/preprocess/__init__.py → numalogic/transforms/__init__.py b/numalogic/preprocess/__init__.py → numalogic/transforms/__init__.py
@@ -9,6 +9,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer, TanhScaler
+"""
+Module to provide timeseries transformations needed for preprocessing,
+feature engineering and postprocessing.
+"""
 
-__all__ = ["LogTransformer", "StaticPowerTransformer", "TanhScaler"]
+from numalogic.transforms._scaler import TanhScaler
+from numalogic.transforms._stateless import LogTransformer, StaticPowerTransformer
+from numalogic.transforms._movavg import ExpMovingAverage, expmov_avg_aggregator
+from numalogic.transforms._postprocess import TanhNorm, tanh_norm
+
+__all__ = [
+ "TanhScaler",
+ "LogTransformer",
+ "StaticPowerTransformer",
+ "ExpMovingAverage",
+ "expmov_avg_aggregator",
+ "TanhNorm",
+ "tanh_norm",
+]