feat: Add example for Optimum integration, fix docs, CI
shadeMe committed Mar 4, 2024
1 parent 7a1e118 commit fd432a3
Showing 9 changed files with 136 additions and 97 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/optimum.yml
@@ -52,9 +52,9 @@ jobs:
if: matrix.python-version == '3.9' && runner.os == 'Linux'
run: hatch run lint:all

# - name: Generate docs
# if: matrix.python-version == '3.9' && runner.os == 'Linux'
# run: hatch run docs
- name: Generate docs
if: matrix.python-version == '3.9' && runner.os == 'Linux'
run: hatch run docs

- name: Run tests
run: hatch run cov
1 change: 1 addition & 0 deletions README.md
@@ -43,6 +43,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the details
| [nvidia-haystack](integrations/nvidia/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/nvidia-haystack.svg?color=orange)](https://pypi.org/project/nvidia-haystack) | [![Test / nvidia](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml) |
| [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) |
| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
| [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) |
| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |
| [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
20 changes: 7 additions & 13 deletions integrations/optimum/README.md
@@ -1,30 +1,24 @@
# optimum

[![PyPI - Version](https://img.shields.io/pypi/v/optimum.svg)](https://pypi.org/project/optimum-haystack)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum.svg)](https://pypi.org/project/optimum-haystack)
[![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack)

-----
---

Component to embed strings and Documents using models loaded with the HuggingFace Optimum library. This component is designed to run inference seamlessly using the high-speed ONNX Runtime.

**Table of Contents**

- [Installation](#installation)
- [License](#license)
- [optimum](#optimum)
- [Installation](#installation)
- [License](#license)

## Installation

To use the ONNX runtime for CPU, use the CPU version:
```console
pip install optimum-haystack[cpu]
pip install optimum-haystack
```

For using the GPU runtimes:
```console
pip install optimum-haystack[gpu]
```


## License

`optimum-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
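
For a CPU-only quick start after `pip install optimum-haystack`, a minimal sketch along the following lines should work. `warm_up()` and the `embedding` output key follow the docstrings and example added in this commit; the execution provider defaulting to `CPUExecutionProvider` is an assumption:

```python
from haystack_integrations.components.embedders.optimum import OptimumTextEmbedder

# Minimal CPU sketch: no optimizer/quantizer settings, so no working_dir is
# required. The execution provider is assumed to default to "CPUExecutionProvider".
embedder = OptimumTextEmbedder(model="intfloat/e5-base-v2")
embedder.warm_up()  # loads/exports the model; run() raises RuntimeError without this

result = embedder.run(text="How fast is ONNX Runtime?")
print(len(result["embedding"]))
```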
34 changes: 34 additions & 0 deletions integrations/optimum/example/example.py
@@ -0,0 +1,34 @@
# This example requires GPU support to execute.

from haystack import Pipeline

from haystack_integrations.components.embedders.optimum import (
    OptimumTextEmbedder,
    OptimumEmbedderPooling,
    OptimumEmbedderOptimizationConfig,
    OptimumEmbedderOptimizationMode,
)

pipeline = Pipeline()
# Embed text with an ONNX-exported model, graph-optimized at the O4 level
# (which enables fp16 mixed precision) for GPU execution via CUDA.
embedder = OptimumTextEmbedder(
    model="intfloat/e5-base-v2",
    normalize_embeddings=True,
    onnx_execution_provider="CUDAExecutionProvider",
    optimizer_settings=OptimumEmbedderOptimizationConfig(
        mode=OptimumEmbedderOptimizationMode.O4,
        for_gpu=True,
    ),
    working_dir="/tmp/optimum",  # stores intermediate optimization artifacts
    pooling_mode=OptimumEmbedderPooling.MEAN,
)
pipeline.add_component("embedder", embedder)

results = pipeline.run(
    {
        "embedder": {
            "text": "Ex profunditate antiquae doctrinae, Ad caelos supra semper, Hoc incantamentum evoco, draco apparet, Incantamentum iam transactum est"
        },
    }
)

print(results["embedder"]["embedding"])
@@ -7,7 +7,7 @@

class OptimumEmbedderOptimizationMode(Enum):
"""
[ONNX Optimization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization.html)
[ONNX Optimization modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization.html)
supported by the Optimum Embedders.
"""

@@ -70,41 +70,42 @@ def __init__(
The [execution provider](https://onnxruntime.ai/docs/execution-providers/)
to use for ONNX models.
Note: Using the TensorRT execution provider
TensorRT requires building its inference engine ahead of inference, which takes some time due to the model
optimization and node fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime
provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We
recommend setting these two provider options via the `model_kwargs` parameter when using the TensorRT
execution provider. The usage is as follows:
```python
embedder = OptimumDocumentEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    onnx_execution_provider="TensorrtExecutionProvider",
    model_kwargs={
        "provider_options": {
            "trt_engine_cache_enable": True,
            "trt_engine_cache_path": "tmp/trt_cache",
        }
    },
)
```
Note: Using the TensorRT execution provider
TensorRT requires building its inference engine ahead of inference,
which takes some time due to the model optimization and node fusion.
To avoid rebuilding the engine every time the model is loaded, ONNX
Runtime provides a pair of options to save the engine: `trt_engine_cache_enable`
and `trt_engine_cache_path`. We recommend setting these two provider
options via the `model_kwargs` parameter when using the TensorRT execution provider.
The usage is as follows:
```python
embedder = OptimumDocumentEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    onnx_execution_provider="TensorrtExecutionProvider",
    model_kwargs={
        "provider_options": {
            "trt_engine_cache_enable": True,
            "trt_engine_cache_path": "tmp/trt_cache",
        }
    },
)
```
:param pooling_mode:
The pooling mode to use. When `None`, pooling mode will be inferred from the model config.
:param model_kwargs:
Dictionary containing additional keyword arguments to pass to the model.
In case of duplication, these kwargs override `model`, `onnx_execution_provider`
and `token` initialization parameters.
:param working_dir:
The directory to use for storing intermediate files
generated during model optimization/quantization.
Required for optimization and quantization.
:param optimizer_settings:
Configuration for Optimum Embedder Optimization.
If `None`, no additional optimization is applied.
:param quantizer_settings:
Configuration for Optimum Embedder Quantization.
If `None`, no quantization is applied.
:param working_dir:
The directory to use for storing intermediate files
generated during model optimization/quantization. Required
for optimization and quantization.
:param optimizer_settings:
Configuration for Optimum Embedder Optimization.
If `None`, no additional optimization is applied.
:param quantizer_settings:
Configuration for Optimum Embedder Quantization.
If `None`, no quantization is applied.
:param batch_size:
Number of Documents to encode at once.
:param progress_bar:
@@ -199,6 +200,10 @@ def run(self, documents: List[Document]):
A list of Documents to embed.
:returns:
The updated Documents with their embeddings.
:raises RuntimeError:
If the component was not initialized.
:raises TypeError:
If the input is not a list of Documents.
"""
if not self._initialized:
msg = "The embedding model has not been loaded. Please call warm_up() before running."
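
To illustrate the newly documented `:raises:` behavior, a hedged sketch of standalone usage outside a `Pipeline`. It assumes `OptimumDocumentEmbedder` is exported from the same module as `OptimumTextEmbedder` and that `run()` returns the embedded documents under a `documents` key:

```python
from haystack import Document
from haystack_integrations.components.embedders.optimum import OptimumDocumentEmbedder

embedder = OptimumDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2")
embedder.warm_up()  # skipping this triggers the RuntimeError documented above

docs = [Document(content="ONNX Runtime speeds up embedding inference.")]
result = embedder.run(documents=docs)  # passing a bare string raises TypeError
print(result["documents"][0].embedding[:4])  # "documents" key is an assumption
```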
@@ -46,57 +46,58 @@ def __init__(
quantizer_settings: Optional[OptimumEmbedderQuantizationConfig] = None,
):
"""
Create an OptimumTextEmbedder component.
Create an OptimumTextEmbedder component.
:param model:
A string representing the model id on HF Hub.
:param token:
The HuggingFace token to use as HTTP bearer authorization.
:param prefix:
A string to add to the beginning of each text.
:param suffix:
A string to add to the end of each text.
:param normalize_embeddings:
Whether to normalize the embeddings to unit length.
:param onnx_execution_provider:
The [execution provider](https://onnxruntime.ai/docs/execution-providers/)
to use for ONNX models.
Note: Using the TensorRT execution provider
TensorRT requires building its inference engine ahead of inference, which takes some time due to the model
optimization and node fusion. To avoid rebuilding the engine every time the model is loaded, ONNX Runtime
provides a pair of options to save the engine: `trt_engine_cache_enable` and `trt_engine_cache_path`. We
recommend setting these two provider options via the `model_kwargs` parameter when using the TensorRT
execution provider. The usage is as follows:
```python
embedder = OptimumDocumentEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    onnx_execution_provider="TensorrtExecutionProvider",
    model_kwargs={
        "provider_options": {
            "trt_engine_cache_enable": True,
            "trt_engine_cache_path": "tmp/trt_cache",
        }
    },
)
```
:param pooling_mode:
The pooling mode to use. When `None`, pooling mode will be inferred from the model config.
:param model_kwargs:
Dictionary containing additional keyword arguments to pass to the model.
In case of duplication, these kwargs override `model`, `onnx_execution_provider`
and `token` initialization parameters.
:param working_dir:
The directory to use for storing intermediate files
generated during model optimization/quantization.
Required for optimization and quantization.
:param optimizer_settings:
Configuration for Optimum Embedder Optimization.
If `None`, no additional optimization is applied.
:param quantizer_settings:
Configuration for Optimum Embedder Quantization.
If `None`, no quantization is applied.
A string representing the model id on HF Hub.
:param token:
The HuggingFace token to use as HTTP bearer authorization.
:param prefix:
A string to add to the beginning of each text.
:param suffix:
A string to add to the end of each text.
:param normalize_embeddings:
Whether to normalize the embeddings to unit length.
:param onnx_execution_provider:
The [execution provider](https://onnxruntime.ai/docs/execution-providers/)
to use for ONNX models.
Note: Using the TensorRT execution provider
TensorRT requires building its inference engine ahead of inference,
which takes some time due to the model optimization and node fusion.
To avoid rebuilding the engine every time the model is loaded, ONNX
Runtime provides a pair of options to save the engine: `trt_engine_cache_enable`
and `trt_engine_cache_path`. We recommend setting these two provider
options via the `model_kwargs` parameter when using the TensorRT execution provider.
The usage is as follows:
```python
embedder = OptimumDocumentEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    onnx_execution_provider="TensorrtExecutionProvider",
    model_kwargs={
        "provider_options": {
            "trt_engine_cache_enable": True,
            "trt_engine_cache_path": "tmp/trt_cache",
        }
    },
)
```
:param pooling_mode:
The pooling mode to use. When `None`, pooling mode will be inferred from the model config.
:param model_kwargs:
Dictionary containing additional keyword arguments to pass to the model.
In case of duplication, these kwargs override `model`, `onnx_execution_provider`
and `token` initialization parameters.
:param working_dir:
The directory to use for storing intermediate files
generated during model optimization/quantization. Required
for optimization and quantization.
:param optimizer_settings:
Configuration for Optimum Embedder Optimization.
If `None`, no additional optimization is applied.
:param quantizer_settings:
Configuration for Optimum Embedder Quantization.
If `None`, no quantization is applied.
"""
params = _EmbedderParams(
model=model,
@@ -161,6 +162,10 @@ def run(self, text: str):
The text to embed.
:returns:
The embeddings of the text.
:raises RuntimeError:
If the component was not initialized.
:raises TypeError:
If the input is not a string.
"""
if not self._initialized:
msg = "The embedding model has not been loaded. Please call warm_up() before running."
@@ -3,7 +3,7 @@

class OptimumEmbedderPooling(Enum):
"""
Pooling Modes supported by the Optimum Embedders.
Pooling modes supported by the Optimum Embedders.
"""

#: Perform CLS Pooling on the output of the embedding model
@@ -7,7 +7,7 @@

class OptimumEmbedderQuantizationMode(Enum):
"""
[Dynamic Quantization Modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization)
[Dynamic Quantization modes](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization)
supported by the Optimum Embedders.
"""

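
As a companion to the optimization example above, a hedged sketch of enabling dynamic quantization. `OptimumEmbedderQuantizationConfig` taking a `mode` keyword and the `AVX2` enum member are assumptions; check `OptimumEmbedderQuantizationMode` for the modes actually available:

```python
from haystack_integrations.components.embedders.optimum import (
    OptimumEmbedderQuantizationConfig,
    OptimumEmbedderQuantizationMode,
    OptimumTextEmbedder,
)

# Dynamic quantization sketch: AVX2 is assumed to be one of the supported
# modes; a working_dir is required whenever quantization is enabled.
embedder = OptimumTextEmbedder(
    model="intfloat/e5-base-v2",
    working_dir="/tmp/optimum",
    quantizer_settings=OptimumEmbedderQuantizationConfig(
        mode=OptimumEmbedderQuantizationMode.AVX2,
    ),
)
embedder.warm_up()
```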
