fix: Fix deserialization of pipelines that contain LLMEvaluator subclasses #7891

Merged 1 commit on Jun 19, 2024
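
For context, a minimal sketch of the round trip this PR fixes (not taken from the PR description). It assumes an `OPENAI_API_KEY` environment variable is available, since the evaluators' default constructors read it:

```python
from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator

# Build a pipeline that contains an LLMEvaluator subclass; the default
# constructor resolves its API key from the OPENAI_API_KEY env var.
pipeline = Pipeline()
pipeline.add_component("evaluator", ContextRelevanceEvaluator())

# Serialize to YAML and load it back. Before this fix, deserializing a
# pipeline that contained an LLMEvaluator subclass failed; with the
# subclass-specific to_dict added here, the round trip succeeds.
restored = pipeline.loads(pipeline.dumps())
assert restored is not None
```

The change registers each subclass with `@component` directly and adds a subclass-level `to_dict`, so the serialized `init_parameters` match what the subclass constructor accepts; previously the inherited base-class serialization appears to have recorded parameters the subclasses' `__init__` does not take.
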
24 changes: 20 additions & 4 deletions haystack/components/evaluators/context_relevance.py
@@ -6,9 +6,8 @@

from numpy import mean as np_mean

from haystack import default_from_dict
from haystack import component, default_from_dict, default_to_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Private global variable for default examples to include in the prompt if the user does not provide any examples
@@ -34,6 +33,7 @@
]


@component
class ContextRelevanceEvaluator(LLMEvaluator):
"""
Evaluator that checks if a provided context is relevant to the question.
@@ -115,7 +115,7 @@ def __init__(
self.api = api
self.api_key = api_key

super().__init__(
super(ContextRelevanceEvaluator, self).__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
@@ -141,7 +141,7 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
- `individual_scores`: A list of context relevance scores for each input question.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
"""
result = super().run(questions=questions, contexts=contexts)
result = super(ContextRelevanceEvaluator, self).run(questions=questions, contexts=contexts)

# calculate average statement relevance score per query
for idx, res in enumerate(result["results"]):
@@ -159,6 +159,22 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]

return result

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.

:returns:
A dictionary with serialized data.
"""
return default_to_dict(
self,
api=self.api,
api_key=self.api_key.to_dict() if self.api_key else None,
examples=self.examples,
progress_bar=self.progress_bar,
raise_on_failure=self.raise_on_failure,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
"""
26 changes: 22 additions & 4 deletions haystack/components/evaluators/faithfulness.py
@@ -6,9 +6,8 @@

from numpy import mean as np_mean

from haystack import default_from_dict
from haystack import component, default_from_dict, default_to_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Default examples to include in the prompt if the user does not provide any examples
@@ -46,6 +45,7 @@
]


@component
class FaithfulnessEvaluator(LLMEvaluator):
"""
Evaluator that checks if a generated answer can be inferred from the provided contexts.
@@ -130,7 +130,7 @@ def __init__(
self.api = api
self.api_key = api_key

super().__init__(
super(FaithfulnessEvaluator, self).__init__(
instructions=self.instructions,
inputs=self.inputs,
outputs=self.outputs,
@@ -158,7 +158,9 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
result = super(FaithfulnessEvaluator, self).run(
questions=questions, contexts=contexts, predicted_answers=predicted_answers
)

# calculate average statement faithfulness score per query
for idx, res in enumerate(result["results"]):
@@ -176,6 +178,22 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers

return result

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.

:returns:
A dictionary with serialized data.
"""
return default_to_dict(
self,
api=self.api,
api_key=self.api_key.to_dict() if self.api_key else None,
examples=self.examples,
progress_bar=self.progress_bar,
raise_on_failure=self.raise_on_failure,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "FaithfulnessEvaluator":
"""
@@ -0,0 +1,4 @@
---
fixes:
- |
Fix the deserialization of pipelines containing evaluator components that were subclasses of `LLMEvaluator`.
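
As a usage note (not part of the PR), the new `to_dict` methods also let the evaluators round-trip directly, which is what pipeline serialization builds on. A minimal sketch, using a placeholder `OPENAI_API_KEY` as the tests below do:

```python
import os

from haystack.components.evaluators import FaithfulnessEvaluator

# Placeholder key so the default Secret.from_env_var can resolve,
# mirroring the monkeypatched environment variable in the tests.
os.environ.setdefault("OPENAI_API_KEY", "test-api-key")

evaluator = FaithfulnessEvaluator(progress_bar=False, raise_on_failure=False)

# to_dict now records only arguments FaithfulnessEvaluator.__init__
# accepts (api, api_key, examples, progress_bar, raise_on_failure),
# so from_dict can rebuild the component.
restored = FaithfulnessEvaluator.from_dict(evaluator.to_dict())
assert isinstance(restored, FaithfulnessEvaluator)
```
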
26 changes: 26 additions & 0 deletions test/components/evaluators/test_context_relevance_evaluator.py
@@ -8,6 +8,7 @@

import pytest

from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils.auth import Secret

@@ -71,6 +72,27 @@ def test_init_with_parameters(self):
{"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

def test_to_dict_with_parameters(self, monkeypatch):
monkeypatch.setenv("ENV_VAR", "test-api-key")
component = ContextRelevanceEvaluator(
api="openai",
api_key=Secret.from_env_var("ENV_VAR"),
examples=[{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
raise_on_failure=False,
progress_bar=False,
)
data = component.to_dict()
assert data == {
"type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
"init_parameters": {
"api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
"api": "openai",
"examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
"progress_bar": False,
"raise_on_failure": False,
},
}

def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

@@ -87,6 +109,10 @@ def test_from_dict(self, monkeypatch):
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]

pipeline = Pipeline()
pipeline.add_component("evaluator", component)
assert pipeline.loads(pipeline.dumps())

def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator()
30 changes: 30 additions & 0 deletions test/components/evaluators/test_faithfulness_evaluator.py
@@ -8,6 +8,7 @@
import numpy as np
import pytest

from haystack import Pipeline
from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils.auth import Secret

@@ -91,6 +92,31 @@ def test_init_with_parameters(self):
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

def test_to_dict_with_parameters(self, monkeypatch):
monkeypatch.setenv("ENV_VAR", "test-api-key")
component = FaithfulnessEvaluator(
api="openai",
api_key=Secret.from_env_var("ENV_VAR"),
examples=[
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
],
raise_on_failure=False,
progress_bar=False,
)
data = component.to_dict()
assert data == {
"type": "haystack.components.evaluators.faithfulness.FaithfulnessEvaluator",
"init_parameters": {
"api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
"api": "openai",
"examples": [
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
],
"progress_bar": False,
"raise_on_failure": False,
},
}

def test_from_dict(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

@@ -111,6 +137,10 @@ def test_from_dict(self, monkeypatch):
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
]

pipeline = Pipeline()
pipeline.add_component("evaluator", component)
assert pipeline.loads(pipeline.dumps())

def test_run_calculates_mean_score(self, monkeypatch):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = FaithfulnessEvaluator()