fix: Correct parameter name for faithfulness evaluator and fix component names passed to PipelinePair #13

Merged
merged 1 commit on Jun 14, 2024
fix: Correct parameter name for faithfulness evaluator and fix component names passed to `PipelinePair`
shadeMe committed Jun 14, 2024
commit 59d5ac8e6c1a5342c0258a9d65c4e8d33ec95f61
100 changes: 75 additions & 25 deletions haystack_experimental/evaluation/harness/rag/harness.py
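Note: the sketch below is not part of the diff; it only illustrates the component-name half of the fix under assumed names. The `ComponentMetadata` dataclass, the `rag_components` dictionary, the enum values, and the component names "retriever" and "generator" are illustrative stand-ins, not identifiers taken verbatim from the repository. The point it shows: passing the `RAGExpectedComponent` enum values to `PipelinePair`'s `included_first_outputs` only works if the wrapped pipeline's components happen to use those exact names, whereas passing the registered component names always matches the real pipeline.

from dataclasses import dataclass
from enum import Enum
from typing import Dict


class RAGExpectedComponent(Enum):
    QUERY_PROCESSOR = "query_processor"
    DOCUMENT_RETRIEVER = "document_retriever"
    RESPONSE_GENERATOR = "response_generator"


@dataclass
class ComponentMetadata:
    # Name of the component as registered in the user's pipeline.
    name: str
    # Maps logical output names to the component's real output sockets.
    output_mapping: Dict[str, str]


# Hypothetical user pipeline: its components are named "retriever" and
# "generator", not "document_retriever"/"response_generator".
rag_components = {
    RAGExpectedComponent.DOCUMENT_RETRIEVER: ComponentMetadata(
        name="retriever", output_mapping={"retrieved_documents": "documents"}
    ),
    RAGExpectedComponent.RESPONSE_GENERATOR: ComponentMetadata(
        name="generator", output_mapping={"replies": "replies"}
    ),
}

# Before the fix: the enum values were handed to PipelinePair, so it looked
# for components literally named "document_retriever"/"response_generator".
included_first_outputs_before = {
    RAGExpectedComponent.DOCUMENT_RETRIEVER.value,
    RAGExpectedComponent.RESPONSE_GENERATOR.value,
}

# After the fix: the registered component names are used instead.
included_first_outputs_after = {
    rag_components[RAGExpectedComponent.DOCUMENT_RETRIEVER].name,
    rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name,
}

print(included_first_outputs_before)  # {'document_retriever', 'response_generator'}
print(included_first_outputs_after)   # {'retriever', 'generator'}

The other half of the fix is visible directly in the diff below: the generator's replies are now exposed to the evaluation pipeline as `predicted_answers` rather than `responses`, and the queries are forwarded as `questions`, matching the input names expected by the faithfulness evaluator.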
@@ -25,7 +25,9 @@
)


class RAGEvaluationHarness(EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput]):
class RAGEvaluationHarness(
EvaluationHarness[RAGEvaluationInput, RAGEvaluationOverrides, RAGEvaluationOutput]
):
"""
Evaluation harness for evaluating RAG pipelines.
"""
@@ -167,7 +169,9 @@ def _lookup_component_output(
output_name = mapping[output_name]
return outputs[name][output_name]

def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverrides]) -> PipelinePair:
def _generate_eval_run_pipelines(
self, overrides: Optional[RAGEvaluationOverrides]
) -> PipelinePair:
if overrides is None:
rag_overrides = None
eval_overrides = None
@@ -178,7 +182,9 @@ def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverride
if eval_overrides is not None:
for metric in eval_overrides.keys():
if metric not in self.metrics:
raise ValueError(f"Cannot override parameters of unused evaluation metric '{metric.value}'")
raise ValueError(
f"Cannot override parameters of unused evaluation metric '{metric.value}'"
)

eval_overrides = {k.value: v for k, v in eval_overrides.items()} # type: ignore

@@ -193,18 +199,26 @@ def _generate_eval_run_pipelines(self, overrides: Optional[RAGEvaluationOverride
x
),
included_first_outputs={
RAGExpectedComponent.DOCUMENT_RETRIEVER.value,
RAGExpectedComponent.RESPONSE_GENERATOR.value,
self.rag_components[RAGExpectedComponent.DOCUMENT_RETRIEVER].name,
self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name,
},
)

def _aggregate_rag_outputs(self, outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Dict[str, Any]]:
def _aggregate_rag_outputs(
self, outputs: List[Dict[str, Dict[str, Any]]]
) -> Dict[str, Dict[str, Any]]:
aggregate = aggregate_batched_pipeline_outputs(outputs)

# We only care about the first response from the generator.
generator_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].name
replies_output_name = self.rag_components[RAGExpectedComponent.RESPONSE_GENERATOR].output_mapping["replies"]
aggregate[generator_name][replies_output_name] = [r[0] for r in aggregate[generator_name][replies_output_name]]
generator_name = self.rag_components[
RAGExpectedComponent.RESPONSE_GENERATOR
].name
replies_output_name = self.rag_components[
RAGExpectedComponent.RESPONSE_GENERATOR
].output_mapping["replies"]
aggregate[generator_name][replies_output_name] = [
r[0] for r in aggregate[generator_name][replies_output_name]
]

return aggregate

@@ -247,7 +261,10 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:
RAGExpectedComponent.DOCUMENT_RETRIEVER,
"retrieved_documents",
),
"responses": (RAGExpectedComponent.RESPONSE_GENERATOR, "replies"),
"predicted_answers": (
RAGExpectedComponent.RESPONSE_GENERATOR,
"replies",
),
},
}

@@ -266,9 +283,15 @@ def _map_rag_eval_pipeline_io(self) -> Dict[str, List[str]]:

return outputs_to_inputs

def _prepare_rag_pipeline_inputs(self, inputs: RAGEvaluationInput) -> List[Dict[str, Dict[str, Any]]]:
query_embedder_name = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].name
query_embedder_text_input = self.rag_components[RAGExpectedComponent.QUERY_PROCESSOR].input_mapping["query"]
def _prepare_rag_pipeline_inputs(
self, inputs: RAGEvaluationInput
) -> List[Dict[str, Dict[str, Any]]]:
query_embedder_name = self.rag_components[
RAGExpectedComponent.QUERY_PROCESSOR
].name
query_embedder_text_input = self.rag_components[
RAGExpectedComponent.QUERY_PROCESSOR
].input_mapping["query"]

if inputs.additional_rag_inputs is not None:
# Ensure that the query embedder input is not provided as additional input.
@@ -284,14 +307,22 @@ def _prepare_rag_pipeline_inputs(self, inputs: RAGEvaluationInput) -> List[Dict[
rag_inputs = deepcopy(inputs.additional_rag_inputs)
if query_embedder_name not in rag_inputs:
rag_inputs[query_embedder_name] = {}
rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(inputs.queries)
rag_inputs[query_embedder_name][query_embedder_text_input] = deepcopy(
inputs.queries
)
else:
rag_inputs = {query_embedder_name: {query_embedder_text_input: deepcopy(inputs.queries)}}
rag_inputs = {
query_embedder_name: {
query_embedder_text_input: deepcopy(inputs.queries)
}
}

separate_rag_inputs = deaggregate_batched_pipeline_inputs(rag_inputs)
return separate_rag_inputs

def _prepare_eval_pipeline_additional_inputs(self, inputs: RAGEvaluationInput) -> Dict[str, Dict[str, Any]]:
def _prepare_eval_pipeline_additional_inputs(
self, inputs: RAGEvaluationInput
) -> Dict[str, Dict[str, Any]]:
eval_inputs: Dict[str, Dict[str, List[Any]]] = {}

for metric in self.metrics:
@@ -302,18 +333,30 @@ def _prepare_eval_pipeline_additional_inputs(self, inputs: RAGEvaluationInput) -
RAGEvaluationMetric.DOCUMENT_RECALL_MULTI_HIT,
):
if inputs.ground_truth_documents is None:
raise ValueError(f"Ground truth documents required for metric '{metric.value}'.")
raise ValueError(
f"Ground truth documents required for metric '{metric.value}'."
)
if len(inputs.ground_truth_documents) != len(inputs.queries):
raise ValueError("Length of ground truth documents should match the number of queries.")
raise ValueError(
"Length of ground truth documents should match the number of queries."
)

eval_inputs[metric.value] = {"ground_truth_documents": inputs.ground_truth_documents}
eval_inputs[metric.value] = {
"ground_truth_documents": inputs.ground_truth_documents
}
elif metric == RAGEvaluationMetric.SEMANTIC_ANSWER_SIMILARITY:
if inputs.ground_truth_answers is None:
raise ValueError(f"Ground truth answers required for metric '{metric.value}'.")
raise ValueError(
f"Ground truth answers required for metric '{metric.value}'."
)
if len(inputs.ground_truth_answers) != len(inputs.queries):
raise ValueError("Length of ground truth answers should match the number of queries.")
raise ValueError(
"Length of ground truth answers should match the number of queries."
)

eval_inputs[metric.value] = {"ground_truth_answers": inputs.ground_truth_answers}
eval_inputs[metric.value] = {
"ground_truth_answers": inputs.ground_truth_answers
}
elif metric == RAGEvaluationMetric.ANSWER_FAITHFULNESS:
eval_inputs[metric.value] = {"questions": inputs.queries}

@@ -326,13 +369,20 @@ def _validate_rag_components(
):
for e in RAGExpectedComponent:
if e not in components:
raise ValueError(f"RAG evaluation harness requires metadata for the '{e.value}' component.")
raise ValueError(
f"RAG evaluation harness requires metadata for the '{e.value}' component."
)

pipeline_outputs = pipeline.outputs(include_components_with_connected_outputs=True)
pipeline_outputs = pipeline.outputs(
include_components_with_connected_outputs=True
)
pipeline_inputs = pipeline.inputs(include_components_with_connected_inputs=True)

for component, metadata in components.items():
if metadata.name not in pipeline_outputs or metadata.name not in pipeline_inputs: