feat: support for train transformers (#354)

Introduce train transformers. Note: train transformers are fitted first and then the preprocess transformers are fitted next. --------- Signed-off-by: s0nicboOm <[email protected]>
numaproj · Mar 29, 2024 · db2cc4f · db2cc4f
1 parent 377ada2
commit db2cc4f
Show file tree

Hide file tree

Showing 10 changed files with 12,934 additions and 465 deletions.
diff --git a/numalogic/config/_config.py b/numalogic/config/_config.py
@@ -94,8 +94,7 @@ class TrainerConf:
  retry_sec: int = 600 # 10 min
  batch_size: int = 64
  data_freq_sec: int = 60
- # TODO: Support trainer based transform models
- max_value_map: Optional[dict[str, float]] = None
+ transforms: Optional[list[ModelInfo]] = None
  pltrainer_conf: LightningTrainerConf = field(default_factory=LightningTrainerConf)
 
 

diff --git a/numalogic/transforms/_stateless.py b/numalogic/transforms/_stateless.py
@@ -10,7 +10,7 @@
 # limitations under the License.
 
 from collections.abc import Sequence
-from typing import Union, Optional
+from typing import Optional, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -78,27 +78,27 @@ def __init__(
  lower: Optional[Union[float, Sequence[float]]] = None,
  upper: Optional[Union[float, Sequence[float]]] = None,
  ):
- self._validate_args(lower, upper)
- self.lower = lower
- self.upper = upper
+ self.lower, self.upper = self._validate_args(lower, upper)
 
  @staticmethod
  def _validate_args(
- lower: Union[float, Sequence[float]], upper: Union[float, Sequence[float]]
- ) -> None:
+ lower: Optional[Union[float, Sequence[float]]],
+ upper: Optional[Union[float, Sequence[float]]],
+ ) -> Optional[tuple[Optional[Union[float, npt.NDArray]], Optional[Union[float, npt.NDArray]]]]:
  if lower is None and upper is None:
  raise ValueError("At least one of lower or upper should be provided.")
-
- if isinstance(lower, Sequence) and isinstance(upper, Sequence) and len(lower) != len(upper):
- raise ValueError("lower and upper should have the same length.")
+ if isinstance(lower, Sequence) and isinstance(upper, Sequence):
+ if len(lower) != len(upper):
+ raise ValueError("lower and upper should have the same length.")
+ lower, upper = np.asarray(lower, dtype=np.float32), np.asarray(upper, dtype=np.float32)
+ if upper is not None and lower is not None and np.any(lower > upper):
+ raise ValueError("lower value should be less than or equal to upper value")
+ return lower, upper
 
  def transform(self, x: npt.NDArray[float], **__) -> npt.NDArray[float]:
  _df = pd.DataFrame(x, dtype=np.float32)
- if (self.lower is not None) and (self.upper is not None):
- return _df.clip(lower=self.lower, upper=self.upper, axis=1).to_numpy(dtype=np.float32)
- if self.upper is not None:
- return _df.clip(upper=self.upper, axis=1).to_numpy(dtype=np.float32)
- return _df.clip(lower=self.lower, axis=1).to_numpy(dtype=np.float32)
+ _df = _df.clip(upper=self.upper, lower=self.lower, axis=1)
+ return _df.to_numpy(dtype=np.float32)
 
 
 class GaussianNoiseAdder(StatelessTransformer):

diff --git a/numalogic/udfs/trainer/_base.py b/numalogic/udfs/trainer/_base.py
@@ -12,13 +12,13 @@
 from torch.utils.data import DataLoader
 
 from numalogic.config import PreprocessFactory, ModelFactory, ThresholdFactory, RegistryFactory
-from numalogic.config._config import NumalogicConf
+from numalogic.config._config import NumalogicConf, ModelInfo
 from numalogic.models.autoencoder import TimeseriesTrainer
 from numalogic.tools.data import StreamingDataset
 from numalogic.tools.exceptions import ConfigNotFoundError, RedisRegistryError
 from numalogic.tools.types import redis_client_t, artifact_t, KEYS, KeyedArtifact
 from numalogic.udfs import NumalogicUDF
-from numalogic.udfs._config import PipelineConf, MLPipelineConf
+from numalogic.udfs._config import PipelineConf
 from numalogic.udfs._metrics import (
  REDIS_ERROR_COUNTER,
  INSUFFICIENT_DATA_COUNTER,
@@ -76,6 +76,7 @@ def compute(
  model: artifact_t,
  input_: npt.NDArray[float],
  preproc_clf: Optional[artifact_t] = None,
+ trainer_transform: Optional[artifact_t] = None,
  threshold_clf: Optional[artifact_t] = None,
  numalogic_cfg: Optional[NumalogicConf] = None,
  ) -> dict[str, KeyedArtifact]:
@@ -86,6 +87,7 @@ def compute(
  model: Model artifact
  input_: Input data
  preproc_clf: Preprocessing artifact
+ trainer_transform: trainer specific preprocessing artifacts
  threshold_clf: Thresholding artifact
  numalogic_cfg: Numalogic configuration
 
@@ -101,6 +103,9 @@ def compute(
  raise ConfigNotFoundError("Numalogic Trainer config not found!")
  dict_artifacts = {}
  trainer_cfg = numalogic_cfg.trainer
+ if trainer_transform:
+ input_ = trainer_transform.fit_transform(input_)
+
  if preproc_clf:
  input_ = preproc_clf.fit_transform(input_)
  dict_artifacts["preproc_clf"] = KeyedArtifact(
@@ -123,6 +128,7 @@ def compute(
 
  if threshold_clf:
  threshold_clf.fit(train_reconerr)
+ _LOGGER.info("Fit data using threshold model")
  dict_artifacts["threshold_clf"] = KeyedArtifact(
  dkeys=[numalogic_cfg.threshold.name],
  artifact=threshold_clf,
@@ -218,9 +224,7 @@ def exec(self, keys: list[str], datum: Datum) -> Messages:
  _LOGGER.info("%s - Data fetched, shape: %s", payload.uuid, df.shape)
 
  # Construct feature array
- x_train, nan_counter, inf_counter = self.get_feature_arr(
- df, _conf.metrics, max_value_map=_conf.numalogic_conf.trainer.max_value_map
- )
+ x_train, nan_counter, inf_counter = self.get_feature_arr(df, _conf.metrics)
  _add_summary(
  summary=NAN_SUMMARY,
  labels=_metric_label_values,
@@ -233,15 +237,17 @@ def exec(self, keys: list[str], datum: Datum) -> Messages:
  )
 
  # Initialize artifacts
- preproc_clf = self._construct_preproc_clf(_conf)
+ preproc_clf = self._construct_clf(_conf.numalogic_conf.preprocess)
+ trainer_transform = self._construct_clf(_conf.numalogic_conf.trainer.transforms)
  model = self._model_factory.get_instance(_conf.numalogic_conf.model)
  thresh_clf = self._thresh_factory.get_instance(_conf.numalogic_conf.threshold)
 
  # Train artifacts
  dict_artifacts = self.compute(
- model,
- x_train,
+ model=model,
+ input_=x_train,
  preproc_clf=preproc_clf,
+ trainer_transform=trainer_transform,
  threshold_clf=thresh_clf,
  numalogic_cfg=_conf.numalogic_conf,
  )
@@ -274,9 +280,11 @@ def exec(self, keys: list[str], datum: Datum) -> Messages:
  )
  return Messages(Message.to_drop())
 
- def _construct_preproc_clf(self, _conf: MLPipelineConf) -> Optional[artifact_t]:
+ def _construct_clf(self, _conf: Optional[list[ModelInfo]]) -> Optional[artifact_t]:
  preproc_clfs = []
- for _cfg in _conf.numalogic_conf.preprocess:
+ if not _conf:
+ return None
+ for _cfg in _conf:
  _clf = self._preproc_factory.get_instance(_cfg)
  preproc_clfs.append(_clf)
  if not preproc_clfs:
@@ -348,7 +356,6 @@ def get_feature_arr(
  raw_df: pd.DataFrame,
  metrics: list[str],
  fill_value: float = 0.0,
- max_value_map: Optional[dict[str, float]] = None,
  ) -> tuple[npt.NDArray[float], float, float]:
  """
  Get feature array from the raw dataframe.
@@ -369,16 +376,7 @@ def get_feature_arr(
  if col not in raw_df.columns:
  raw_df[col] = fill_value
  nan_counter += len(raw_df)
-
  feat_df = raw_df[metrics]
- if max_value_map:
- max_value_list = [max_value_map.get(col, np.nan) for col in metrics]
- feat_df.clip(upper=max_value_list, inplace=True)
- _LOGGER.info(
- "Replaced %s with max_value_map from the map with value of %s.",
- metrics,
- max_value_list,
- )
  nan_counter += raw_df.isna().sum().all()
  inf_counter = np.isinf(feat_df).sum().all()
  feat_df = feat_df.fillna(fill_value).replace([np.inf, -np.inf], fill_value)