
Commit

fix: Fix deserialization of pipelines that contain LLMEvaluator subclasses

shadeMe committed Jun 19, 2024
1 parent 96cda5d commit 86ca7b9
Showing 5 changed files with 102 additions and 8 deletions.
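In short: a Pipeline that contained a `ContextRelevanceEvaluator` or `FaithfulnessEvaluator` could be dumped but not loaded back, apparently because the subclasses inherited their serialization from `LLMEvaluator`, whose init parameters (`instructions`, `inputs`, `outputs`, ...) do not match the subclasses' own `__init__` signatures. The commit gives each subclass its own `@component` registration and a `to_dict` that serializes exactly its own constructor arguments. A minimal sketch of the round trip being fixed, mirroring the tests added below (assumes an `OPENAI_API_KEY` in the environment, which the evaluator reads by default):

# Sketch: round-tripping a pipeline that contains an LLMEvaluator subclass.
# Assumes OPENAI_API_KEY is set; ContextRelevanceEvaluator reads it by default.
from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator

pipeline = Pipeline()
pipeline.add_component("evaluator", ContextRelevanceEvaluator())

# Before this commit, loads() raised during deserialization because the
# serialized init_parameters did not match the subclass's __init__.
restored = Pipeline.loads(pipeline.dumps())
assert restored is not None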
24 changes: 20 additions & 4 deletions haystack/components/evaluators/context_relevance.py
@@ -6,9 +6,8 @@

from numpy import mean as np_mean

-from haystack import default_from_dict
+from haystack import component, default_from_dict, default_to_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
-from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Private global variable for default examples to include in the prompt if the user does not provide any examples
@@ -34,6 +33,7 @@
]


+@component
class ContextRelevanceEvaluator(LLMEvaluator):
"""
Evaluator that checks if a provided context is relevant to the question.
@@ -115,7 +115,7 @@ def __init__(
self.api = api
self.api_key = api_key

-super().__init__(
+super(ContextRelevanceEvaluator, self).__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
@@ -141,7 +141,7 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
- `individual_scores`: A list of context relevance scores for each input question.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
"""
-result = super().run(questions=questions, contexts=contexts)
+result = super(ContextRelevanceEvaluator, self).run(questions=questions, contexts=contexts)

# calculate average statement relevance score per query
for idx, res in enumerate(result["results"]):
@@ -159,6 +159,22 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]

return result

+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        :returns:
+            A dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api=self.api,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            examples=self.examples,
+            progress_bar=self.progress_bar,
+            raise_on_failure=self.raise_on_failure,
+        )

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
"""
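A side note on the `super(ContextRelevanceEvaluator, self).__init__(...)` spelling above: presumably it is needed because Haystack's `@component` decorator rebuilds the class object it decorates, while zero-argument `super()` resolves through the compiler-provided `__class__` cell, which still points at the pre-decoration class. The explicit two-argument form instead looks the (rebound) name up at call time. A standalone sketch of the pitfall; `rebuilding_decorator` is a hypothetical stand-in, not Haystack's actual decorator:

# Sketch: why zero-argument super() can break under a class decorator
# that returns a *new* class object. rebuilding_decorator is hypothetical.

def rebuilding_decorator(cls):
    # Recreate the class, as some registering decorators do.
    return type(cls.__name__, cls.__bases__, dict(cls.__dict__))

class Base:
    def __init__(self):
        self.ready = True

@rebuilding_decorator
class Broken(Base):
    def __init__(self):
        # The __class__ cell here still refers to the pre-decoration class,
        # so super() sees a mismatched (class, instance) pair and raises.
        super().__init__()

@rebuilding_decorator
class Fixed(Base):
    def __init__(self):
        # The explicit form resolves the rebound module-level name instead.
        super(Fixed, self).__init__()

try:
    Broken()
except TypeError as err:
    print(f"zero-argument super() failed: {err}")

assert Fixed().ready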
26 changes: 22 additions & 4 deletions haystack/components/evaluators/faithfulness.py
@@ -6,9 +6,8 @@

from numpy import mean as np_mean

-from haystack import default_from_dict
+from haystack import component, default_from_dict, default_to_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
-from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Default examples to include in the prompt if the user does not provide any examples
@@ -46,6 +45,7 @@
]


+@component
class FaithfulnessEvaluator(LLMEvaluator):
"""
Evaluator that checks if a generated answer can be inferred from the provided contexts.
@@ -130,7 +130,7 @@ def __init__(
self.api = api
self.api_key = api_key

-super().__init__(
+super(FaithfulnessEvaluator, self).__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
@@ -158,7 +158,9 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
-result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+result = super(FaithfulnessEvaluator, self).run(
+    questions=questions, contexts=contexts, predicted_answers=predicted_answers
+)

# calculate average statement faithfulness score per query
for idx, res in enumerate(result["results"]):
@@ -176,6 +178,22 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers

return result

+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        :returns:
+            A dictionary with serialized data.
+        """
+        return default_to_dict(
+            self,
+            api=self.api,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            examples=self.examples,
+            progress_bar=self.progress_bar,
+            raise_on_failure=self.raise_on_failure,
+        )

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FaithfulnessEvaluator":
"""
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Fix the deserialization of pipelines containing evaluator components that were subclasses of `LLMEvaluator`.
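The fix above boils down to a serialization contract: `default_to_dict` records the concrete class path plus the keyword arguments that `default_from_dict` later feeds back into that class's `__init__`, so each component must serialize exactly the parameters its own `__init__` accepts. A minimal sketch of that contract; `Greeter` is a hypothetical class used only for illustration:

# Sketch of the to_dict/from_dict contract the overrides in this commit
# satisfy. Greeter is hypothetical; the two helpers are Haystack's.
from typing import Any, Dict

from haystack import default_from_dict, default_to_dict

class Greeter:
    def __init__(self, greeting: str = "hello"):
        self.greeting = greeting

    def to_dict(self) -> Dict[str, Any]:
        # Serialize only keyword arguments that __init__ accepts.
        return default_to_dict(self, greeting=self.greeting)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Greeter":
        return default_from_dict(cls, data)

data = Greeter("hi").to_dict()
# -> {"type": "__main__.Greeter", "init_parameters": {"greeting": "hi"}}
assert Greeter.from_dict(data).greeting == "hi"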
26 changes: 26 additions & 0 deletions test/components/evaluators/test_context_relevance_evaluator.py
@@ -8,6 +8,7 @@

import pytest

+from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils.auth import Secret

@@ -71,6 +72,27 @@ def test_init_with_parameters(self):
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = ContextRelevanceEvaluator(
+            api="openai",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            examples=[{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+            raise_on_failure=False,
+            progress_bar=False,
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
+                "progress_bar": False,
+                "raise_on_failure": False,
+            },
+        }
+
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

@@ -87,6 +109,10 @@ def test_from_dict(self, monkeypatch):
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]

+        pipeline = Pipeline()
+        pipeline.add_component("evaluator", component)
+        assert pipeline.loads(pipeline.dumps())
+
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
30 changes: 30 additions & 0 deletions test/components/evaluators/test_faithfulness_evaluator.py
@@ -8,6 +8,7 @@
import numpy as np
import pytest

+from haystack import Pipeline
from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils.auth import Secret

@@ -91,6 +92,31 @@ def test_init_with_parameters(self):
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = FaithfulnessEvaluator(
+            api="openai",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
+            raise_on_failure=False,
+            progress_bar=False,
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "examples": [
+                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+                ],
+                "progress_bar": False,
+                "raise_on_failure": False,
+            },
+        }
+
def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

@@ -111,6 +137,10 @@ def test_from_dict(self, monkeypatch):
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
]

+        pipeline = Pipeline()
+        pipeline.add_component("evaluator", component)
+        assert pipeline.loads(pipeline.dumps())
+
def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = FaithfulnessEvaluator()
