feat: difference transform

Signed-off-by: Avik Basu <[email protected]>
numaproj · Jan 16, 2024 · 5ec5883 · 5ec5883
1 parent 46363d3
commit 5ec5883
Show file tree

Hide file tree

Showing 8 changed files with 63 additions and 11 deletions.
diff --git a/numalogic/config/factory.py b/numalogic/config/factory.py
@@ -49,6 +49,7 @@ class PreprocessFactory(_ObjectFactory):
  TanhScaler,
  DataClipper,
  GaussianNoiseAdder,
+ DifferenceTransform,
  )
 
  _CLS_MAP: ClassVar[dict] = {
@@ -61,6 +62,7 @@ class PreprocessFactory(_ObjectFactory):
  "TanhScaler": TanhScaler,
  "DataClipper": DataClipper,
  "GaussianNoiseAdder": GaussianNoiseAdder,
+ "DifferenceTransform": DifferenceTransform,
  }
 
  def get_pipeline_instance(self, objs_info: list[ModelInfo]):
@@ -88,6 +90,7 @@ class ThresholdFactory(_ObjectFactory):
 
  from numalogic.models.threshold import (
  StdDevThreshold,
+ AggStdDevThreshold,
  MahalanobisThreshold,
  RobustMahalanobisThreshold,
  StaticThreshold,
@@ -96,6 +99,7 @@ class ThresholdFactory(_ObjectFactory):
 
  _CLS_MAP: ClassVar[dict] = {
  "StdDevThreshold": StdDevThreshold,
+ "AggStdDevThreshold": AggStdDevThreshold,
  "StaticThreshold": StaticThreshold,
  "SigmoidThreshold": SigmoidThreshold,
  "MahalanobisThreshold": MahalanobisThreshold,

diff --git a/numalogic/models/threshold/__init__.py b/numalogic/models/threshold/__init__.py
@@ -1,9 +1,10 @@
-from numalogic.models.threshold._std import StdDevThreshold
+from numalogic.models.threshold._std import StdDevThreshold, AggStdDevThreshold
 from numalogic.models.threshold._mahalanobis import MahalanobisThreshold, RobustMahalanobisThreshold
 from numalogic.models.threshold._static import StaticThreshold, SigmoidThreshold
 
 __all__ = [
  "StdDevThreshold",
+ "AggStdDevThreshold",
  "StaticThreshold",
  "SigmoidThreshold",
  "MahalanobisThreshold",

diff --git a/numalogic/models/threshold/_mahalanobis.py b/numalogic/models/threshold/_mahalanobis.py
@@ -202,9 +202,10 @@ def __init__(
  self,
  max_outlier_prob: float = 0.1,
  max_inlier_percentile: Optional[float] = None,
+ support_fraction: Optional[float] = 0.7,
  ):
  super().__init__(max_outlier_prob)
- self._mcd = MinCovDet(store_precision=False)
+ self._mcd = MinCovDet(store_precision=False, support_fraction=support_fraction)
  if max_inlier_percentile and (not 75.0 <= max_inlier_percentile < 100.0):
  raise ValueError("max_inlier_percentile should be in range [75, 100)")
  self._max_inlier_percentile = max_inlier_percentile

diff --git a/numalogic/models/threshold/_std.py b/numalogic/models/threshold/_std.py
@@ -8,7 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from typing import Optional
 
 import numpy as np
 from numpy.typing import NDArray
@@ -72,3 +72,18 @@ def predict(self, x_test: NDArray[float]) -> NDArray[int]:
  def score_samples(self, x_test: NDArray[float]) -> NDArray[float]:
  """Returns an anomaly score array with the same shape as input."""
  return x_test / self.threshold
+
+
+class AggStdDevThreshold(StdDevThreshold):
+    """
+    Standard-deviation threshold that collapses per-feature scores into one
+    aggregate score per sample.
+
+    The per-feature anomaly scores are computed by the parent class and are
+    then averaged across the feature axis, optionally weighted per feature.
+
+    Args:
+    -------
+        feature_weights: optional weights used when averaging the per-feature
+            scores; when None every feature contributes equally.
+        std_factor: forwarded unchanged to StdDevThreshold (default: 3.0).
+        min_threshold: forwarded unchanged to StdDevThreshold (default: 0.0).
+    """
+
+    def __init__(
+        self,
+        feature_weights: Optional[list[float]] = None,
+        std_factor: float = 3.0,
+        min_threshold: float = 0.0,
+    ):
+        super().__init__(std_factor, min_threshold)
+        self.feature_weights = feature_weights
+
+    def score_samples(self, x_test: NDArray[float]) -> NDArray[float]:
+        """
+        Returns one aggregated anomaly score per sample.
+
+        The parent's per-feature scores are reduced along axis 1 with a
+        (weighted) average; the reduced axis is kept with length 1.
+        """
+        per_feature_scores = super().score_samples(x_test)
+        return np.average(
+            per_feature_scores,
+            axis=1,
+            weights=self.feature_weights,
+            keepdims=True,
+        )
diff --git a/numalogic/synthetic/anomalies.py b/numalogic/synthetic/anomalies.py
@@ -62,7 +62,10 @@ def __init__(
  self.anomaly_type = anomaly_type
  self.anomaly_ratio = anomaly_ratio
  self.anomaly_sign = anomaly_sign
- self.freq = ref_df.index.freq
+ try:
+ self.freq = ref_df.index.freq
+ except AttributeError:
+ self.freq = None
  self.mu, self.sigma = mu, sigma
 
  self.scaler = StandardScaler()
@@ -138,18 +141,22 @@ def _inject_global_anomalies(
  ).merge(anomaly_df, left_index=True, right_index=True)
 
  def _inject_contextual_anomalies(
- self, target_df: pd.DataFrame, cols: Sequence[str], impact=1
+ self,
+ target_df: pd.DataFrame,
+ cols: Sequence[str],
+ impact=1,
  ) -> pd.DataFrame:
  target_df = self._init_target_df(target_df, cols)
  anomaly_df = pd.DataFrame(index=target_df.index)
  anomaly_df["is_anomaly"] = 0
 
  for col in self.__injected_cols:
  tseries = target_df[col]
- sample = tseries[: -self.block_size].sample(1)
- idx_start = sample.index
- idx_end = idx_start + (self.block_size * self.freq)
- outlier_block = tseries[idx_start.values[0] : idx_end.values[0]]
+
+ idx_start = self._find_start_idx(None, target_df)
+ idx_end = idx_start + self.block_size
+
+ outlier_block = tseries[idx_start:idx_end]
 
  # Add gaussian noise to the data
  noise = self._rnd_gen.normal(self.mu, self.sigma, outlier_block.shape)
@@ -172,7 +179,7 @@ def _inject_contextual_anomalies(
  )
 
  anomaly_col = anomaly_df["is_anomaly"]
- anomaly_block = anomaly_col[idx_start.values[0] : idx_end.values[0]]
+ anomaly_block = anomaly_col[idx_start:idx_end]
  anomaly_block += self.add_impact_sign()
 
  return pd.DataFrame(

diff --git a/numalogic/tools/data.py b/numalogic/tools/data.py
@@ -83,7 +83,11 @@ def inverse_window_last_only(batched: Tensor) -> Tensor:
  -------
  A 2D tensor of shape: (new_batch, num_features)
  """
- output = batched[:, -1, :]
+ try:
+ output = batched[:, -1, :]
+ except IndexError:
+ batched = batched.unsqueeze(-1)
+ output = batched[:, -1, :]
  return torch.vstack((batched[0, :-1, :], output))
 
 

diff --git a/numalogic/transforms/__init__.py b/numalogic/transforms/__init__.py
@@ -20,6 +20,7 @@
  StaticPowerTransformer,
  DataClipper,
  GaussianNoiseAdder,
+ DifferenceTransform,
 )
 from numalogic.transforms._movavg import ExpMovingAverage, expmov_avg_aggregator
 from numalogic.transforms._postprocess import TanhNorm, tanh_norm
@@ -34,4 +35,5 @@
  "TanhNorm",
  "tanh_norm",
  "GaussianNoiseAdder",
+ "DifferenceTransform",
 ]
diff --git a/numalogic/transforms/_stateless.py b/numalogic/transforms/_stateless.py
@@ -28,6 +28,8 @@ class LogTransformer(StatelessTransformer):
  add_factor: float value to be added to the feature before taking log.
  """
 
+ __slots__ = ("add_factor",)
+
  def __init__(self, add_factor=2):
  self.add_factor = add_factor
 
@@ -112,6 +114,8 @@ class GaussianNoiseAdder(StatelessTransformer):
  seed: int value to be used as the random seed (default: 42).
  """
 
+ __slots__ = ("_rng", "_is_abs", "_scale")
+
  def __init__(self, scale: float = 1e-8, positive_only: bool = True, seed: int = 42):
  self._rng = np.random.default_rng(seed)
  self._is_abs = positive_only
@@ -122,3 +126,17 @@ def transform(self, x: npt.NDArray[float], **__) -> npt.NDArray[float]:
  if self._is_abs:
  noise = np.abs(noise)
  return x + noise
+
+
+class DifferenceTransform(StatelessTransformer):
+    """
+    Stateless transformer that replaces the input with its discrete difference
+    taken along the rows.
+
+    The first difference is applied ``order`` times. Differencing leaves the
+    leading ``order`` rows undefined (NaN); those rows are back-filled from the
+    first defined row, so the output has as many rows as the input.
+
+    Args:
+    -------
+        order: number of times the first difference is applied (default: 1).
+    """
+
+    __slots__ = ("order",)
+
+    def __init__(
+        self,
+        order: int = 1,
+    ):
+        self.order = order
+
+    def transform(self, input_: npt.NDArray, **__) -> npt.NDArray[np.float32]:
+        """
+        Returns the ``order``-th difference of ``input_`` as a float32 array.
+
+        If the input has no more than ``order`` rows there is no defined row
+        to back-fill from, and the result is all NaN.
+        """
+        diff_df = pd.DataFrame(input_)
+        # `order` used to be stored but ignored, so only a single difference
+        # was ever taken. Apply it `order` times; order=1 is unchanged.
+        for _ in range(self.order):
+            diff_df = diff_df.diff()
+        # Back-fill once at the end so every leading NaN row introduced by the
+        # repeated differencing takes the first fully defined row's values.
+        return diff_df.bfill().to_numpy(dtype=np.float32)