feat: Sigmoid threshold (#141)

* feat: Sigmoid threshold * chore: support passing run id while saving in mlflow Signed-off-by: Avik Basu <[email protected]>
numaproj · Mar 9, 2023 · ea01b44 · ea01b44
1 parent 9659cbd
commit ea01b44
Show file tree

Hide file tree

Showing 10 changed files with 744 additions and 536 deletions.
diff --git a/docs/quick-start.md b/docs/quick-start.md
@@ -20,7 +20,7 @@ pip install numalogic
 
 ## Numalogic as a Library
 
-Numalogic can be used as an independent library, and it provides various ML models and tools. Here, we are using a `AutoencoderPipeline`. Refer to [training section](autoencoders.md) for other available options. 
+Numalogic can be used as an independent library, and it provides various ML models and tools. Here, we are using the `AutoencoderTrainer`. Refer to [training section](autoencoders.md) for other available options. 
 
 In this example, the train data set has numbers ranging from 1-10. Whereas in the test data set, there are data points that go out of this range, which the algorithm should be able to detect as anomalies.
 

diff --git a/docs/threshold.md b/docs/threshold.md
@@ -6,31 +6,22 @@ It is a simple Estimator that extends BaseEstimator.
 Currently, the library supports `StdDevThreshold`. This takes in parameters `min_threshold` and `std_factor`. This model
 defines the threshold as `mean + std_factor * std`, clipped so that it is never below `min_threshold`.
 
-
-Fitting the threshold model
 ```python
-# preprocess step
-clf = LogTransformer()
-train_data = clf.fit_transform(X_train)
-test_data = clf.transform(X_test)
+import numpy as np
+from numalogic.models.threshold import StdDevThreshold
 
-# Fitting the Threshold model 
-thresh_clf = StdDevThreshold(std_factor=1.2)
-```
+# Generate positive random data
+x_train = np.abs(np.random.randn(1000, 3))
+x_test = np.abs(np.random.randn(30, 3))
 
-Train the model
-```python
-# Train the Autoencoder model and fit the model on train data
-ae_pl = AutoencoderPipeline(
- model=Conv1dAE(in_channels=1, enc_channels=4), seq_len=8, num_epochs=30
-)
-ae_pl.fit(X_train)
+# Here we want a threshold such that anything
+# outside 5 standard deviations from the mean will be anomalous.
+thresh_clf = StdDevThreshold(std_factor=5)
+thresh_clf.fit(x_train)
 
-# predict method returns the reconstruction error
-anomaly_score = ae_pl.predict(X_test)
-```
-Predicting score using the threshold model
-```python
-# Predict final anomaly score using threshold estimator
-anomaly_score = thresh_clf.predict(anomaly_score)
+# Let's get the predictions
+y_pred = thresh_clf.predict(x_test)
+
+# Anomaly scores are given by the score_samples method
+y_score = thresh_clf.score_samples(x_test)
 ```
diff --git a/numalogic/models/threshold/__init__.py b/numalogic/models/threshold/__init__.py
@@ -1,4 +1,4 @@
 from numalogic.models.threshold._std import StdDevThreshold
-from numalogic.models.threshold._static import StaticThreshold
+from numalogic.models.threshold._static import StaticThreshold, SigmoidThreshold
 
-__all__ = ["StdDevThreshold", "StaticThreshold"]
+__all__ = ["StdDevThreshold", "StaticThreshold", "SigmoidThreshold"]
diff --git a/numalogic/models/threshold/_static.py b/numalogic/models/threshold/_static.py
@@ -8,8 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+import numpy as np
 import numpy.typing as npt
 from sklearn.base import BaseEstimator
 from typing_extensions import Self
@@ -46,22 +45,70 @@ def fit(self, _: npt.NDArray[float]) -> Self:
  """Does not do anything. Only for API compatibility"""
  return self
 
- def predict(self, x_test: npt.NDArray[float]) -> npt.NDArray[int]:
+ def predict(self, x: npt.NDArray[float]) -> npt.NDArray[int]:
+ """
+ Returns an integer array of same shape as input.
+ 1 denotes anomaly.
+ """
+ y = x.copy()
+ y[x < self.upper_limit] = 0
+ y[x >= self.upper_limit] = 1
+ return y
+
+ def score_samples(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+ """
+ Returns an array of same shape as input
+ with values being anomaly scores.
+ """
+ x = x.copy()
+ x[x < self.upper_limit] = self.inlier_score
+ x[x >= self.upper_limit] = self.outlier_score
+ return x
+
+
+class SigmoidThreshold(BaseEstimator):
+ r"""
+ Smooth and stateless static thresholding using a sigmoid function as an estimator.
+ The values produced are smooth anomaly scores rather than a hard 0/1 step.
+
+ Score is given by:
+ score = score_limit / (1 + exp(-coeff * (x - upper_limit)))
+ where coeff = slope_factor * pi
+
+ Args:
+ upper_limit: is the desired threshold limit of x
+ slope_factor: determines the slope of the curve
+ score_limit: is the scalar multiplier for the score
+ e.g. a value of 10 means that the output score
+ will be between 0 and 10.
+ """
+ __slots__ = ("upper_limit", "coeff", "score_limit")
+
+ def __init__(self, upper_limit: float, slope_factor: int = 5, score_limit: int = 10):
+ self.upper_limit = float(upper_limit)
+ self.coeff = slope_factor * np.pi
+ self.score_limit = score_limit
+
+ def fit(self, _: npt.NDArray[float]) -> Self:
+ """Does not do anything. Only for API compatibility"""
+ return self
+
+ def predict(self, x: npt.NDArray[float]) -> npt.NDArray[int]:
  """
  Returns an integer array of same shape as input.
  1 denotes anomaly.
+
+ This is calculated as a hard threshold at upper limit.
  """
- y_test = x_test.copy()
- y_test[x_test < self.upper_limit] = 0
- y_test[x_test >= self.upper_limit] = 1
- return y_test
+ y = x.copy()
+ y[x < self.upper_limit] = 0
+ y[x >= self.upper_limit] = 1
+ return y
 
- def score_samples(self, x_test: npt.NDArray[float]) -> npt.NDArray[float]:
+ def score_samples(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
  """
  Returns an array of same shape as input
  with values being anomaly scores.
  """
- x_test = x_test.copy()
- x_test[x_test < self.upper_limit] = self.inlier_score
- x_test[x_test >= self.upper_limit] = self.outlier_score
- return x_test
+ x = x.copy()
+ y = 10 / (1 + np.exp(-self.coeff * (x - self.upper_limit)))
+ return y
diff --git a/numalogic/models/threshold/_std.py b/numalogic/models/threshold/_std.py
@@ -30,7 +30,7 @@ class StdDevThreshold(BaseEstimator):
  min_threshold: clip the threshold value to be above this value
  """
 
- def __init__(self, std_factor: float = 3.0, min_threshold: float = 0.1):
+ def __init__(self, std_factor: float = 3.0, min_threshold: float = 0.0):
  self.std_factor = std_factor
  self.min_threshold = min_threshold
 

diff --git a/numalogic/registry/mlflow_registry.py b/numalogic/registry/mlflow_registry.py
@@ -163,6 +163,7 @@ def save(
  skeys: Sequence[str],
  dkeys: Sequence[str],
  artifact: Artifact,
+ run_id: str = None,
  **metadata: str,
  ) -> Optional[ModelVersion]:
  """
@@ -171,14 +172,15 @@ def save(
  skeys: static key fields as list/tuple of strings
  dkeys: dynamic key fields as list/tuple of strings
  artifact: primary artifact to be saved
+ run_id: mlflow run id
  metadata: additional metadata surrounding the artifact that needs to be saved
 
  Returns:
  mlflow ModelVersion instance
  """
  model_key = self.construct_key(skeys, dkeys)
  try:
- mlflow.start_run()
+ mlflow.start_run(run_id=run_id)
  self.handler.log_model(artifact, "model", registered_model_name=model_key)
  if metadata:
  mlflow.log_params(metadata)