
Commit

fix: Fix deserialization of pipelines that contain LLMEvaluator subclasses

shadeMe committed Jun 19, 2024
1 parent 96cda5d commit 86ca7b9
Showing 5 changed files with 102 additions and 8 deletions.
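In short: a Pipeline that contained a `ContextRelevanceEvaluator` or `FaithfulnessEvaluator` could be dumped but not loaded back, apparently because the subclasses inherited their serialization from `LLMEvaluator`, whose init parameters (`instructions`, `inputs`, `outputs`, ...) do not match the subclasses' own `__init__` signatures. The commit gives each subclass its own `@component` registration and a `to_dict` that serializes exactly its own constructor arguments. A minimal sketch of the round trip being fixed, mirroring the tests added below (assumes an `OPENAI_API_KEY` in the environment, which the evaluator reads by default):

# Sketch: round-tripping a pipeline that contains an LLMEvaluator subclass.
# Assumes OPENAI_API_KEY is set; ContextRelevanceEvaluator reads it by default.
from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator

pipeline = Pipeline()
pipeline.add_component("evaluator", ContextRelevanceEvaluator())

# Before this commit, loads() raised during deserialization because the
# serialized init_parameters did not match the subclass's __init__.
restored = Pipeline.loads(pipeline.dumps())
assert restored is not None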
24 changes: 20 additions & 4 deletions haystack/components/evaluators/context_relevance.py
@@ -6,9 +6,8 @@

from numpy import mean as np_mean

-from haystack import default_from_dict
+from haystack import component, default_from_dict, default_to_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
-from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Private global variable for default examples to include in the prompt if the user does not provide any examples
@@ -34,6 +33,7 @@
]


+@component
class ContextRelevanceEvaluator(LLMEvaluator):
"""
Evaluator that checks if a provided context is relevant to the question.
@@ -115,7 +115,7 @@ def __init__(
self.api = api
self.api_key = api_key

-super().__init__(
+super(ContextRelevanceEvaluator, self).__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
@@ -141,7 +141,7 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
- `individual_scores`: A list of context relevance scores for each input question.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
"""
-result = super().run(questions=questions, contexts=contexts)
+result = super(ContextRelevanceEvaluator, self).run(questions=questions, contexts=contexts)

# calculate average statement relevance score per query
for idx, res in enumerate(result["results"]):
@@ -159,6 +159,22 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]

return result

+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        :returns:
+            A dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api=self.api,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            examples=self.examples,
+            progress_bar=self.progress_bar,
+            raise_on_failure=self.raise_on_failure,
+        )

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
"""
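A side note on the `super(ContextRelevanceEvaluator, self).__init__(...)` spelling above: presumably it is needed because Haystack's `@component` decorator rebuilds the class object it decorates, while zero-argument `super()` resolves through the compiler-provided `__class__` cell, which still points at the pre-decoration class. The explicit two-argument form instead looks the (rebound) name up at call time. A standalone sketch of the pitfall; `rebuilding_decorator` is a hypothetical stand-in, not Haystack's actual decorator:

# Sketch: why zero-argument super() can break under a class decorator
# that returns a *new* class object. rebuilding_decorator is hypothetical.

def rebuilding_decorator(cls):
    # Recreate the class, as some registering decorators do.
    return type(cls.__name__, cls.__bases__, dict(cls.__dict__))

class Base:
    def __init__(self):
        self.ready = True

@rebuilding_decorator
class Broken(Base):
    def __init__(self):
        # The __class__ cell here still refers to the pre-decoration class,
        # so super() sees a mismatched (class, instance) pair and raises.
        super().__init__()

@rebuilding_decorator
class Fixed(Base):
    def __init__(self):
        # The explicit form resolves the rebound module-level name instead.
        super(Fixed, self).__init__()

try:
    Broken()
except TypeError as err:
    print(f"zero-argument super() failed: {err}")

assert Fixed().ready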
26 changes: 22 additions & 4 deletions haystack/components/evaluators/faithfulness.py
@@ -6,9 +6,8 @@

from numpy import mean as np_mean

-from haystack import default_from_dict
+from haystack import component, default_from_dict, default_to_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
-from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Default examples to include in the prompt if the user does not provide any examples
@@ -46,6 +45,7 @@
]


+@component
class FaithfulnessEvaluator(LLMEvaluator):
"""
Evaluator that checks if a generated answer can be inferred from the provided contexts.
@@ -130,7 +130,7 @@ def __init__(
self.api = api
self.api_key = api_key

-super().__init__(
+super(FaithfulnessEvaluator, self).__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
@@ -158,7 +158,9 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
-result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+result = super(FaithfulnessEvaluator, self).run(
+    questions=questions, contexts=contexts, predicted_answers=predicted_answers
+)

# calculate average statement faithfulness score per query
for idx, res in enumerate(result["results"]):
@@ -176,6 +178,22 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers

return result

+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        :returns:
+            A dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api=self.api,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            examples=self.examples,
+            progress_bar=self.progress_bar,
+            raise_on_failure=self.raise_on_failure,
+        )

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FaithfulnessEvaluator":
"""
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fix the deserialization of pipelines containing evaluator components that were subclasses of `LLMEvaluator`.
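The fix above boils down to a serialization contract: `default_to_dict` records the concrete class path plus the keyword arguments that `default_from_dict` later feeds back into that class's `__init__`, so each component must serialize exactly the parameters its own `__init__` accepts. A minimal sketch of that contract; `Greeter` is a hypothetical class used only for illustration:

# Sketch of the to_dict/from_dict contract the overrides in this commit
# satisfy. Greeter is hypothetical; the two helpers are Haystack's.
from typing import Any, Dict

from haystack import default_from_dict, default_to_dict

class Greeter:
    def __init__(self, greeting: str = "hello"):
        self.greeting = greeting

    def to_dict(self) -> Dict[str, Any]:
        # Serialize only keyword arguments that __init__ accepts.
        return default_to_dict(self, greeting=self.greeting)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Greeter":
        return default_from_dict(cls, data)

data = Greeter("hi").to_dict()
# -> {"type": "__main__.Greeter", "init_parameters": {"greeting": "hi"}}
assert Greeter.from_dict(data).greeting == "hi"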
26 changes: 26 additions & 0 deletions test/components/evaluators/test_context_relevance_evaluator.py
@@ -8,6 +8,7 @@

import pytest

+from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils.auth import Secret

@@ -71,6 +72,27 @@ def test_init_with_parameters(self):
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = ContextRelevanceEvaluator(
+            api="openai",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            examples=[{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+            raise_on_failure=False,
+            progress_bar=False,
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+                "progress_bar": False,
+                "raise_on_failure": False,
+            },
+        }
+
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

@@ -87,6 +109,10 @@ def test_from_dict(self, monkeypatch):
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]

+        pipeline = Pipeline()
+        pipeline.add_component("evaluator", component)
+        assert pipeline.loads(pipeline.dumps())
+
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
30 changes: 30 additions & 0 deletions test/components/evaluators/test_faithfulness_evaluator.py
@@ -8,6 +8,7 @@
import numpy as np
import pytest

+from haystack import Pipeline
from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils.auth import Secret

@@ -91,6 +92,31 @@ def test_init_with_parameters(self):
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = FaithfulnessEvaluator(
+            api="openai",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
+            raise_on_failure=False,
+            progress_bar=False,
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
+                "progress_bar": False,
+                "raise_on_failure": False,
+            },
+        }
+
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

@@ -111,6 +137,10 @@ def test_from_dict(self, monkeypatch):
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
]

+        pipeline = Pipeline()
+        pipeline.add_component("evaluator", component)
+        assert pipeline.loads(pipeline.dumps())
+
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = FaithfulnessEvaluator()
