fix: Sparse AE for vanilla and conv (#199)

- Fix sparsity calculation for Vanilla and Convolutional AE
- Update benchmarks
- Improve streaming dataset to support slicing

Signed-off-by: Avik Basu <[email protected]>
numaproj · Jun 2, 2023 · a2b00c1 · a2b00c1
1 parent 5e69f5f
commit a2b00c1
Show file tree

Hide file tree

Showing 22 changed files with 305 additions and 108 deletions.
diff --git a/benchmarks/kpi/README.md b/benchmarks/kpi/README.md
@@ -12,12 +12,14 @@ The performance table is shown below, although note that the hyperparameters hav
 The hyperparams used are available inside the results directory under each algorithm.
 
 
-| KPI ID | KPI index | Algorithm | ROC-AUC |
-|--------------------------------------|-----------|---------------|---------|
-| 431a8542-c468-3988-a508-3afd06a218da | 14 | VanillaAE | 0.89 |
-| 431a8542-c468-3988-a508-3afd06a218da | 14 | Conv1dAE | 0.88 |
-| 431a8542-c468-3988-a508-3afd06a218da | 14 | LSTMAE | 0.86 |
-| 431a8542-c468-3988-a508-3afd06a218da | 14 | TransformerAE | 0.82 |
+| KPI ID | KPI index | Algorithm | ROC-AUC (test set) |
+|--------------------------------------|-----------|-----------------|--------------------|
+| 431a8542-c468-3988-a508-3afd06a218da | 14 | VanillaAE | 0.89 |
+| 431a8542-c468-3988-a508-3afd06a218da | 14 | Conv1dAE | 0.88 |
+| 431a8542-c468-3988-a508-3afd06a218da | 14 | LSTMAE | 0.86 |
+| 431a8542-c468-3988-a508-3afd06a218da | 14 | TransformerAE | 0.82 |
+| 431a8542-c468-3988-a508-3afd06a218da | 14 | SparseVanillaAE | 0.93 |
+| 431a8542-c468-3988-a508-3afd06a218da | 14 | SparseConv1dAE | 0.77 |
 
 
 Full credit to Zeyan Li et al. for constructing large-scale real world benchmark datasets for AIOps.

diff --git a/benchmarks/kpi/benchmark.ipynb b/benchmarks/kpi/benchmark.ipynb
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/hyperparams.json b/benchmarks/kpi/results/kpi_idx_14/sparseconv/hyperparams.json
@@ -0,0 +1,15 @@
+{
+ "BATCH_SIZE": 64,
+ "SPLIT_RATIOS": [0.5, 0.2, 0.3],
+ "TRAINER": {"accelerator": "cpu", "max_epochs": 50},
+ "MODEL": {
+ "name": "SparseConv1dAE",
+ "conf": {
+ "seq_len": 12,
+ "in_channels": 1,
+ "enc_channels": [8, 16, 32],
+ "enc_kernel_sizes": [3, 3, 3],
+ "weight_decay": 1e-6
+ }
+ }
+}
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/pr_curve_test.png b/benchmarks/kpi/results/kpi_idx_14/sparseconv/pr_curve_test.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/roc_test.png b/benchmarks/kpi/results/kpi_idx_14/sparseconv/roc_test.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/roc_val.png b/benchmarks/kpi/results/kpi_idx_14/sparseconv/roc_val.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/test.png b/benchmarks/kpi/results/kpi_idx_14/sparseconv/test.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/train.png b/benchmarks/kpi/results/kpi_idx_14/sparseconv/train.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparseconv/val.png b/benchmarks/kpi/results/kpi_idx_14/sparseconv/val.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/hyperparams.json b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/hyperparams.json
@@ -0,0 +1,20 @@
+{
+ "BATCH_SIZE": 64,
+ "SPLIT_RATIOS": [
+ 0.5,
+ 0.2,
+ 0.3
+ ],
+ "TRAINER": {
+ "accelerator": "cpu",
+ "max_epochs": 50
+ },
+ "MODEL": {
+ "name": "SparseVanillaAE",
+ "conf": {
+ "seq_len": 10,
+ "encoder_layersizes": [16, 32],
+ "decoder_layersizes": [32, 16]
+ }
+ }
+}
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/pr_curve_test.png b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/pr_curve_test.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/roc_test.png b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/roc_test.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/roc_val.png b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/roc_val.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/test.png b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/test.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/train.png b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/train.png
diff --git a/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/val.png b/benchmarks/kpi/results/kpi_idx_14/sparsevanilla/val.png
diff --git a/numalogic/models/autoencoder/trainer.py b/numalogic/models/autoencoder/trainer.py
@@ -20,12 +20,26 @@
 from torch import Tensor
 
 from numalogic.tools.callbacks import ProgressDetails
-from numalogic.tools.data import TimeseriesDataModule
+from numalogic.tools.data import inverse_window
 
 _LOGGER = logging.getLogger(__name__)
 
 
 class AutoencoderTrainer(Trainer):
+ r"""
+ A PyTorch Lightning Trainer for Autoencoder models.
+
+ Args:
+ max_epochs: The maximum number of epochs to train for. (default: 100)
+ logger: The logger to use. (default: False)
+ check_val_every_n_epoch: The number of epochs between validation checks. (default: 5)
+ enable_checkpointing: Whether to enable checkpointing. (default: False)
+ enable_progress_bar: Whether to enable the progress bar. (default: False)
+ enable_model_summary: Whether to enable the model summary. (default: False)
+ callbacks: A list of callbacks to use. (default: None)
+ **trainer_kw: Additional keyword arguments to pass to the Lightning Trainer.
+ """
+
  def __init__(
  self,
  max_epochs=100,
@@ -55,8 +69,17 @@ def __init__(
  )
 
  def predict(self, model: pl.LightningModule = None, unbatch=True, **kwargs) -> Tensor:
+ r"""
+ Predicts the output of the model.
+
+ Args:
+ model: The model to predict with. (default: None)
+ unbatch: Whether to inverse window the output. (default: True)
+ **kwargs: Additional keyword arguments to pass to the Lightning
+ trainer's predict method.
+ """
  recon_err = super().predict(model, **kwargs)
  recon_err = torch.vstack(recon_err)
  if unbatch:
- return TimeseriesDataModule.unbatch_sequences(recon_err)
+ return inverse_window(recon_err, method="keep_last")
  return recon_err
diff --git a/numalogic/models/autoencoder/variants/conv.py b/numalogic/models/autoencoder/variants/conv.py
@@ -16,6 +16,7 @@
 
 import torch
 from torch import nn, Tensor
+from torch.distributions import kl_divergence, Bernoulli
 from torch.nn.init import calculate_gain
 
 from numalogic.models.autoencoder.base import BaseAE
@@ -227,12 +228,12 @@ def __init__(
  if isinstance(enc_kernel_sizes, int):
  enc_kernel_sizes = [enc_kernel_sizes for _ in range(len(enc_channels))]
 
- elif isinstance(enc_kernel_sizes, (tuple, list)):
+ elif isinstance(enc_kernel_sizes, Sequence):
  assert len(enc_channels) == len(
  enc_kernel_sizes
  ), "enc_channels and enc_kernel_sizes should be of the same length"
  else:
- raise TypeError(f"Invalid enc_kernel_sizes type provided: {enc_kernel_sizes}")
+ raise TypeError(f"Invalid enc_kernel_sizes type provided: {type(enc_kernel_sizes)}")
 
  self.encoder = Encoder(
  num_channels=enc_channels,
@@ -301,8 +302,8 @@ class SparseConv1dAE(Conv1dAE):
  <https://web.stanford.edu/class/cs294a/sparseAutoencoder.pdf>
 
  Args:
- beta: regularization parameter (Defaults to 1e-3)
- rho: sparsity parameter value (Defaults to 0.05)
+ beta: Penalty factor (Defaults to 1e-3)
+ rho: Sparsity parameter value (Defaults to 0.05)
  **kwargs: Conv1dAE kwargs
  """
 
@@ -324,13 +325,20 @@ def kl_divergence(self, activations: Tensor) -> Tensor:
  """
  rho_hat = torch.mean(activations, dim=0)
  rho = torch.full(rho_hat.size(), self.rho, device=self.device)
- kl_loss = nn.KLDivLoss(reduction="sum")
- _dim = 0 if rho_hat.dim() == 1 else 1
- return kl_loss(torch.log_softmax(rho_hat, dim=_dim), torch.softmax(rho, dim=_dim))
+ kl_loss = kl_divergence(
+ Bernoulli(logits=torch.log(rho)), Bernoulli(logits=torch.log(rho_hat))
+ )
+ return torch.sum(torch.clamp(kl_loss, max=1.0))
 
- def _get_reconstruction_loss(self, batch):
+ def _get_reconstruction_loss(self, batch) -> Tensor:
  latent, recon = self.forward(batch)
  batch = batch.view(-1, self.in_channels, self.seq_len)
  loss = self.criterion(batch, recon)
  penalty = self.kl_divergence(latent)
- return loss + penalty
+ return loss + (self.beta * penalty)
+
+ def validation_step(self, batch: Tensor, batch_idx: int) -> Tensor:
+ recon = self.reconstruction(batch)
+ loss = self.criterion(batch, recon.view(-1, self.seq_len, self.in_channels))
+ self._total_val_loss += loss.detach().item()
+ return loss
diff --git a/numalogic/models/autoencoder/variants/vanilla.py b/numalogic/models/autoencoder/variants/vanilla.py
@@ -14,6 +14,7 @@
 
 import torch
 from torch import nn, Tensor
+from torch.distributions import kl_divergence, Bernoulli
 
 from numalogic.models.autoencoder.base import BaseAE
 from numalogic.tools.exceptions import LayerSizeMismatchError
@@ -68,7 +69,7 @@ def _construct_layers(self, layersizes: Sequence[int]) -> nn.ModuleList:
  [
  nn.Linear(start_layersize, layersizes[-1]),
  nn.BatchNorm1d(self.n_features),
- nn.LeakyReLU(),
+ nn.ReLU(),
  ]
  )
  return layers
@@ -216,8 +217,8 @@ class SparseVanillaAE(VanillaAE):
  <https://web.stanford.edu/class/cs294a/sparseAutoencoder.pdf>
 
  Args:
- beta: regularization parameter (Defaults to 1e-3)
- rho: sparsity parameter value (Defaults to 0.05)
+ beta: Regularization factor (Defaults to 1e-3)
+ rho: Sparsity parameter value (Defaults to 0.05)
  **kwargs: VanillaAE kwargs
  """
 
@@ -239,13 +240,21 @@ def kl_divergence(self, activations: Tensor) -> Tensor:
  """
  rho_hat = torch.mean(activations, dim=0)
  rho = torch.full(rho_hat.size(), self.rho, device=self.device)
- kl_loss = nn.KLDivLoss(reduction="sum")
- _dim = 0 if rho_hat.dim() == 1 else 1
- return kl_loss(torch.log_softmax(rho_hat, dim=_dim), torch.softmax(rho, dim=_dim))
+ kl_loss = kl_divergence(
+ Bernoulli(logits=torch.log(rho)), Bernoulli(logits=torch.log(rho_hat))
+ )
+ return torch.sum(torch.clamp(kl_loss, max=1.0))
 
- def _get_reconstruction_loss(self, batch):
+ def _get_reconstruction_loss(self, batch: Tensor) -> Tensor:
  latent, recon = self.forward(batch)
  x = batch.view(-1, self.n_features, self.seq_len)
  loss = self.criterion(x, recon)
  penalty = self.kl_divergence(latent)
- return loss + penalty
+ return loss + (self.beta * penalty)
+
+ def validation_step(self, batch: Tensor, batch_idx: int) -> Tensor:
+ recon = self.reconstruction(batch)
+ recon = recon.view(-1, self.seq_len, self.n_features)
+ loss = self.criterion(batch, recon)
+ self._total_val_loss += loss.detach().item()
+ return loss
diff --git a/numalogic/tools/data.py b/numalogic/tools/data.py
@@ -10,7 +10,7 @@
 # limitations under the License.
 
 import logging
-from typing import Optional
+from typing import Optional, Union
 from collections.abc import Generator, Iterator
 
 import numpy as np
@@ -26,6 +26,59 @@
 _LOGGER = logging.getLogger(__name__)
 
 
+def inverse_window(batched: Tensor, method="keep_last") -> Tensor:
+ r"""
+ Utility method to transform a 3D tensor of shape: (batch_size, seq_len, num_features)
+ back into a shape of (new_batch, num_features).
+
+ Args:
+ batched: A 3D tensor of shape: (batch_size, seq_len, num_features)
+ method: The method to use for the inverse transformation. (default: "keep_last")
+ Valid methods are: "keep_last", "keep_first"
+ Returns:
+ A 2D tensor of shape: (new_batch, num_features)
+ """
+ if method == "keep_last":
+ return inverse_window_last_only(batched)
+ if method == "keep_first":
+ return inverse_window_first_only(batched)
+ raise ValueError(f"Invalid method: {method}")
+
+
+def inverse_window_first_only(batched: Tensor) -> Tensor:
+ r"""
+ Utility method to transform a 3D tensor of shape: (batch_size, seq_len, num_features)
+ back into a shape of (new_batch, num_features).
+
+ Note: This is an approximate inverse transformation as only the
+ first element in seq_len is used for the first (new_batch - seq_len + 1) rows.
+
+ Args:
+ batched: A 3D tensor of shape: (batch_size, seq_len, num_features)
+ Returns:
+ A 2D tensor of shape: (new_batch, num_features)
+ """
+ output = batched[:, 0, :]
+ return torch.vstack((output, batched[-1, 1::]))
+
+
+def inverse_window_last_only(batched: Tensor) -> Tensor:
+ r"""
+ Utility method to transform a 3D tensor of shape: (batch_size, seq_len, num_features)
+ back into a shape of (new_batch, num_features).
+
+ Note: This is an approximate inverse transformation as only the
+ last element in seq_len is used for the last (new_batch - seq_len + 1) rows.
+
+ Args:
+ batched: A 3D tensor of shape: (batch_size, seq_len, num_features)
+ Returns:
+ A 2D tensor of shape: (new_batch, num_features)
+ """
+ output = batched[:, -1, :]
+ return torch.vstack((batched[0, :-1, :], output))
+
+
 class StreamingDataset(IterableDataset):
  r"""
  An iterable Dataset designed for streaming time series input.
@@ -93,10 +146,19 @@ def __len__(self) -> int:
  """
  return len(self._data) - self._seq_len + 1
 
- def __getitem__(self, idx: int) -> npt.NDArray[float]:
+ def __getitem__(self, idx: Union[int, slice]) -> npt.NDArray[float]:
  r"""
  Retrieves a sequence from the input data at the specified index.
  """
+ if isinstance(idx, slice):
+ if idx.step is not None:
+ raise ValueError("Slice with step is not supported in StreamingDataset")
+ output = []
+ start = idx.start or 0
+ stop = idx.stop or len(self)
+ for i in range(start, stop - self._seq_len + 1):
+ output.append(self._data[i : (i + self._seq_len)])
+ return np.stack(output)
  if idx >= len(self):
  raise IndexError(f"{idx} out of bound!")
  return self._data[idx : idx + self._seq_len]
@@ -161,7 +223,7 @@ def val_dataloader(self) -> Optional[EVAL_DATALOADERS]:
  def unbatch_sequences(batched: Tensor) -> Tensor:
  r"""
  Utility method to transform a 3D tensor of shape: (batch_size, seq_len, num_features)
- back into a shape of (new_batch, num_feautres).
+ back into a shape of (new_batch, num_features).
 
  Note: This is an approximate inverse transformation as only the
  first element in seq_len is used for the first (new_batch - seq_len - 1) rows.

diff --git a/tests/models/autoencoder/variants/test_conv.py b/tests/models/autoencoder/variants/test_conv.py
@@ -108,7 +108,7 @@ def test_conv1d_err(self):
  seq_len=SEQ_LEN,
  in_channels=self.X_train.shape[1],
  enc_channels=[8, 16, 4],
- enc_kernel_sizes={3, 3, 3},
+ enc_kernel_sizes={5, 3, 1},
  dec_activation="random",
  )
 

diff --git a/tests/tools/test_data.py b/tests/tools/test_data.py
@@ -8,7 +8,7 @@
 from torch.utils.data import DataLoader
 
 from numalogic._constants import TESTS_DIR
-from numalogic.tools.data import StreamingDataset, TimeseriesDataModule
+from numalogic.tools.data import StreamingDataset, TimeseriesDataModule, inverse_window
 from numalogic.tools.exceptions import InvalidDataShapeError
 
 ROOT_DIR = os.path.join(TESTS_DIR, "resources", "data")
@@ -96,7 +96,20 @@ def test_datamodule_err(self):
  with self.assertRaises(ValueError):
  TimeseriesDataModule(SEQ_LEN, self.train_data, val_split_ratio=1.2)
 
- def test_unbatch_sequences(self):
+
+class TestInverseWindow(unittest.TestCase):
+ train_data = None
+ test_data = None
+ m = None
+ n = None
+
+ @classmethod
+ def setUpClass(cls) -> None:
+ cls.n = 3
+ cls.train_data = RNG.random((100, cls.n))
+ cls.test_data = RNG.random((20, cls.n))
+
+ def test_inverse_window(self):
  ratio = 0.2
  datamodule = TimeseriesDataModule(
  SEQ_LEN, self.train_data, val_split_ratio=ratio, batch_size=256
@@ -106,19 +119,23 @@ def test_unbatch_sequences(self):
  val_size = int(ratio * len(self.train_data))
 
  for batch in datamodule.train_dataloader():
- unbatched = datamodule.unbatch_sequences(batch)
+ unbatched = inverse_window(batch, method="keep_first")
  self.assertTupleEqual(self.train_data[:-val_size].shape, unbatched.shape)
  self.assertAlmostEqual(
  torch.mean(unbatched).item(), np.mean(self.train_data[:-val_size]), places=5
  )
 
  for batch in datamodule.val_dataloader():
- unbatched = datamodule.unbatch_sequences(batch)
+ unbatched = inverse_window(batch, method="keep_last")
  self.assertTupleEqual(self.train_data[-val_size:].shape, unbatched.shape)
  self.assertAlmostEqual(
  torch.mean(unbatched).item(), np.mean(self.train_data[-val_size:]), places=5
  )
 
+ def test_inverse_window_err(self):
+ with self.assertRaises(ValueError):
+ inverse_window(torch.tensor([1, 2, 3]), method="invalid_method")
+
 
 if __name__ == "__main__":
  unittest.main()