examples: pipeline using block (#216)
Signed-off-by: Avik Basu <[email protected]>
ab93 committed Jun 17, 2023
1 parent 203c100 commit 4831180
Showing 27 changed files with 4,360 additions and 0 deletions.
File renamed without changes.
45 changes: 45 additions & 0 deletions examples/block_pipeline/Dockerfile
@@ -0,0 +1,45 @@
####################################################################################################
# builder: install needed dependencies
####################################################################################################

FROM python:3.10-slim-bullseye AS builder

ENV PYTHONFAULTHANDLER=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONHASHSEED=random \
    PIP_NO_CACHE_DIR=on \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    PYSETUP_PATH="/opt/pysetup" \
    VENV_PATH="/opt/pysetup/.venv"

ENV PATH="$VENV_PATH/bin:$PATH"

RUN apt-get update \
    && apt-get install --no-install-recommends -y \
        curl \
        wget \
        # deps for building python deps
        build-essential \
    && apt-get install -y git \
    && apt-get clean && rm -rf /var/lib/apt/lists/* \
    \
    # install dumb-init
    && wget -O /dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 \
    && chmod +x /dumb-init

####################################################################################################
# udf: used for running the udf vertices
####################################################################################################
FROM builder AS udf

WORKDIR $PYSETUP_PATH
COPY ./requirements.txt ./
RUN pip3 install -r requirements.txt

ADD . /app
WORKDIR /app

ENTRYPOINT ["/dumb-init", "--"]

EXPOSE 5000
96 changes: 96 additions & 0 deletions examples/block_pipeline/numa-pl.yaml
@@ -0,0 +1,96 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
  name: redis-isbs # change it
spec:
  redis:
    native:
      version: 7.0.11
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app.kubernetes.io/component: isbsvc
                    numaflow.numaproj.io/isbsvc-name: redis-isbs # Change it
                topologyKey: topology.kubernetes.io/zone
              weight: 100
      persistence:
        accessMode: ReadWriteOnce
        volumeSize: 1Gi
      settings:
        redis: |
          maxmemory 4096mb
---
apiVersion: numaflow.numaproj.io/v1alpha1
kind: Pipeline
metadata:
  name: blocks
spec:
  watermark:
    disabled: false
  limits:
    readBatchSize: 10
    bufferMaxLength: 500
    bufferUsageLimit: 100
  vertices:
    - name: in
      source:
        http: {}
    - name: inference
      scale:
        min: 1
      udf:
        container:
          image: blockpl:v0.0.8
          env:
            - name: REDIS_AUTH
              valueFrom:
                secretKeyRef:
                  name: isbsvc-redis-isbs-redis-auth
                  key: redis-password
          args:
            - python
            - server.py
            - inference
    - name: train
      scale:
        min: 1
      udf:
        container:
          image: blockpl:v0.0.8
          env:
            - name: REDIS_AUTH
              valueFrom:
                secretKeyRef:
                  name: isbsvc-redis-isbs-redis-auth
                  key: redis-password
          args:
            - python
            - server.py
            - train
    - name: out
      scale:
        min: 1
      sink:
        log: {}
  edges:
    - from: in
      to: inference
    - from: inference
      to: train
      conditions:
        tags:
          operator: or
          values:
            - train
    - from: inference
      to: out
      conditions:
        tags:
          operator: or
          values:
            - out
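For reference, the `in` vertex is an HTTP source, so data can be pushed into the pipeline with a plain POST once the vertex pod is reachable. A minimal sketch, assuming the source pod has been port-forwarded to localhost:8443 and that the `/vertices/in` path follows Numaflow's HTTP source convention (both are assumptions about your setup, not part of this commit):

import json
import ssl
import urllib.request

# 12 sample readings: matches the inference UDF's default seq_len of 12
payload = json.dumps(
    {"data": [1.0, 1.2, 0.9, 1.1, 1.0, 5.0, 1.0, 0.8, 1.1, 1.0, 0.9, 1.2]}
).encode()

req = urllib.request.Request(
    "https://localhost:8443/vertices/in",  # assumed port-forwarded `in` vertex
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
# the source serves a self-signed certificate, so skip verification in this sketch
ctx = ssl._create_unverified_context()
print(urllib.request.urlopen(req, context=ctx).status)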
4 changes: 4 additions & 0 deletions examples/block_pipeline/requirements.txt
@@ -0,0 +1,4 @@
cachetools>5.2,<6.0
numalogic[redis,numaflow] @ git+https://github.com/numaproj/numalogic.git@main
# ../../../numalogic[redis,numaflow]  # for local testing
pytorch-lightning>2.0,<3.0
20 changes: 20 additions & 0 deletions examples/block_pipeline/server.py
@@ -0,0 +1,20 @@
import sys

from pynumaflow.function import Server
from src import Inference, Train


if __name__ == "__main__":
    if len(sys.argv) != 2:
        raise ValueError("Please provide a step name")

    step = sys.argv[1]
    if step == "inference":
        step_handler = Inference()
    elif step == "train":
        step_handler = Train()
    else:
        raise ValueError(f"Invalid step provided: {step}")

    grpc_server = Server(map_handler=step_handler)
    grpc_server.start()
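As a design note, the if/elif dispatch above could equally be written as a lookup table, which keeps the entrypoint flat as more steps are added. An equivalent sketch using the same handlers:

import sys

from pynumaflow.function import Server

from src import Inference, Train

# map step names to handler classes; a new step only needs a new entry here
HANDLERS = {"inference": Inference, "train": Train}

if __name__ == "__main__":
    if len(sys.argv) != 2:
        raise ValueError("Please provide a step name")
    try:
        step_handler = HANDLERS[sys.argv[1]]()
    except KeyError:
        raise ValueError(f"Invalid step provided: {sys.argv[1]}") from None
    Server(map_handler=step_handler).start()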
19 changes: 19 additions & 0 deletions examples/block_pipeline/src/__init__.py
@@ -0,0 +1,19 @@
import logging

from src.inference import Inference
from src.train import Train

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)

formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
stream_handler.setFormatter(formatter)

LOGGER.addHandler(stream_handler)


__all__ = ["Inference", "Train"]
64 changes: 64 additions & 0 deletions examples/block_pipeline/src/inference.py
@@ -0,0 +1,64 @@
import json
import logging

import numpy as np
from numalogic.blocks import (
    BlockPipeline,
    PreprocessBlock,
    NNBlock,
    ThresholdBlock,
    PostprocessBlock,
)
from numalogic.models.autoencoder.variants import SparseVanillaAE
from numalogic.models.threshold import StdDevThreshold
from numalogic.numaflow import NumalogicUDF
from numalogic.registry import RedisRegistry
from numalogic.tools.exceptions import RedisRegistryError
from numalogic.transforms import TanhNorm
from pynumaflow.function import Messages, Datum, Message
from sklearn.preprocessing import StandardScaler

from src.utils import RedisClient

_LOGGER = logging.getLogger(__name__)


class Inference(NumalogicUDF):
    """UDF to run ML inference on the input data."""

    def __init__(self, seq_len: int = 12, num_series: int = 1):
        super().__init__()
        self.seq_len = seq_len
        self.n_features = num_series
        self.registry = RedisRegistry(client=RedisClient().get_client())

    def exec(self, keys: list[str], datum: Datum) -> Messages:
        # Load json data
        series = json.loads(datum.value)["data"]

        block_pl = BlockPipeline(
            PreprocessBlock(StandardScaler()),
            NNBlock(
                SparseVanillaAE(seq_len=self.seq_len, n_features=self.n_features), self.seq_len
            ),
            ThresholdBlock(StdDevThreshold()),
            PostprocessBlock(TanhNorm()),
            registry=self.registry,
        )

        # Load the model from the registry
        try:
            block_pl.load(skeys=["blockpl"], dkeys=["sparsevanillae"])
        except RedisRegistryError as warn:
            _LOGGER.warning("Error loading block pipeline: %r", warn)
            return Messages(Message(value=b"", tags=["train"]))

        # Run inference
        try:
            output = block_pl(np.asarray(series).reshape(-1, self.n_features))
        except Exception as err:
            _LOGGER.error("Error running block pipeline: %r", err)
            return Messages(Message.to_drop())

        anomaly_score = np.mean(output)
        return Messages(Message(tags=["out"], value=json.dumps({"score": anomaly_score}).encode()))
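The reshape in exec assumes the payload carries a flat list of univariate readings: with the defaults seq_len=12 and num_series=1, a 12-element list becomes a (12, 1) array, i.e. 12 timesteps of a single feature. A standalone illustration of that shape contract:

import numpy as np

series = [1.0, 1.2, 0.9, 1.1, 1.0, 5.0, 1.0, 0.8, 1.1, 1.0, 0.9, 1.2]
arr = np.asarray(series).reshape(-1, 1)  # 12 timesteps, 1 feature
print(arr.shape)  # (12, 1)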
64 changes: 64 additions & 0 deletions examples/block_pipeline/src/train.py
@@ -0,0 +1,64 @@
import logging

import pandas as pd
from cachetools import TTLCache
from numalogic.blocks import (
    BlockPipeline,
    PreprocessBlock,
    NNBlock,
    ThresholdBlock,
    PostprocessBlock,
)
from numalogic.models.autoencoder.variants import SparseVanillaAE
from numalogic.models.threshold import StdDevThreshold
from numalogic.numaflow import NumalogicUDF
from numalogic.registry import RedisRegistry
from numalogic.transforms import TanhNorm
from pynumaflow.function import Datum, Messages, Message
from sklearn.preprocessing import StandardScaler

from src.utils import RedisClient, TRAIN_DATA_PATH

_LOGGER = logging.getLogger(__name__)


class Train(NumalogicUDF):
    """UDF to train the model and save it to the registry."""

    ttl_cache = TTLCache(maxsize=16, ttl=60)

    def __init__(self, seq_len: int = 12, num_series: int = 1):
        super().__init__()
        self.seq_len = seq_len
        self.n_features = num_series
        self.registry = RedisRegistry(client=RedisClient().get_client())
        self._model_key = "sparsevanillae"

    def exec(self, keys: list[str], datum: Datum) -> Messages:
        """Train the model and save it to the registry."""
        # Check if a training message has been received very recently
        if self._model_key in self.ttl_cache:
            return Messages(Message.to_drop())
        self.ttl_cache[self._model_key] = self._model_key

        # Load training data
        data = pd.read_csv(TRAIN_DATA_PATH, index_col=None)

        # Define the block pipeline
        block_pl = BlockPipeline(
            PreprocessBlock(StandardScaler()),
            NNBlock(
                SparseVanillaAE(seq_len=self.seq_len, n_features=self.n_features), self.seq_len
            ),
            ThresholdBlock(StdDevThreshold()),
            PostprocessBlock(TanhNorm()),
            registry=self.registry,
        )
        block_pl.fit(data)

        # Save the model to the registry
        block_pl.save(skeys=["blockpl"], dkeys=["sparsevanillae"])
        _LOGGER.info("Model saved to registry")

        # Train vertex is the last vertex in the pipeline
        return Messages(Message.to_drop())
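The class-level TTLCache acts as a debounce: the first train message for a model key proceeds, and any duplicates arriving within the 60-second TTL are dropped. The same pattern in isolation, as a sketch independent of Numaflow:

from cachetools import TTLCache

_cache = TTLCache(maxsize=16, ttl=60)

def should_train(model_key: str = "sparsevanillae") -> bool:
    """Return True at most once per TTL window for a given key."""
    if model_key in _cache:
        return False
    _cache[model_key] = model_key
    return True

assert should_train() is True   # first trigger goes through
assert should_train() is False  # duplicates within 60s are suppressed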
42 changes: 42 additions & 0 deletions examples/block_pipeline/src/utils.py
@@ -0,0 +1,42 @@
import logging
import os

from redis.sentinel import Sentinel

from numalogic.tools.types import Singleton, redis_client_t

_LOGGER = logging.getLogger(__name__)
_DIR = os.path.dirname(__file__)
_ROOT_DIR = os.path.split(_DIR)[0]
TRAIN_DATA_PATH = os.path.join(_ROOT_DIR, "resources/train_data.csv")

AUTH = os.getenv("REDIS_AUTH")
HOST = os.getenv("REDIS_HOST", default="isbsvc-redis-isbs-redis-svc.default.svc")
PORT = os.getenv("REDIS_PORT", default="26379")
MASTERNAME = os.getenv("REDIS_MASTER_NAME", default="mymaster")


class RedisClient(metaclass=Singleton):
    """Singleton class to manage redis client."""

    _client: redis_client_t = None

    def __init__(self):
        if not self._client:
            self.set_client()

    def set_client(self) -> None:
        sentinel_args = {
            "sentinels": [(HOST, PORT)],
            "socket_timeout": 0.1,
        }
        _LOGGER.info("Connecting to redis sentinel: %s, %s, %s", sentinel_args, MASTERNAME, AUTH)
        sentinel = Sentinel(
            **sentinel_args,
            sentinel_kwargs=dict(password=AUTH),
            password=AUTH,
        )
        self._client = sentinel.master_for(MASTERNAME)

    def get_client(self) -> redis_client_t:
        return self._client
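Since RedisClient is built on numalogic's Singleton metaclass, repeated instantiations share one sentinel-backed connection. A usage sketch, assuming a reachable sentinel and the REDIS_* environment variables above:

from src.utils import RedisClient

c1 = RedisClient()
c2 = RedisClient()
assert c1 is c2  # the Singleton metaclass returns the same instance

client = c1.get_client()         # plain redis-py client from sentinel.master_for()
client.set("healthcheck", "ok")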
6 changes: 6 additions & 0 deletions examples/multi_udf/.dockerignore
@@ -0,0 +1,6 @@
.git/
__pycache__/
**/__pycache__/
*.py[cod]
*$py.class
.idea/
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.