feat: Add new LLMEvaluator component (#7401)
* draft llm evaluator

* docstrings

* flexible inputs; validate inputs and outputs

* add tests

* add release note

* remove example

* docstrings

* make outputs parameter optional. default:

* validate init parameters

* linting

* remove mention of binary scores from template

* make examples and outputs params non-optional

* removed leftover from optional outputs param

* simplify building examples section for template

* validate inputs and outputs in examples are dict with str as key

* fix pylint too-many-boolean-expressions

* increase test coverage
julian-risch committed Mar 25, 2024
1 parent 42b587a commit bfd0d3e
Showing 4 changed files with 635 additions and 1 deletion.
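
For orientation, below is a minimal usage sketch of the component introduced by this commit. The metric, input names, and example data are illustrative assumptions and are not taken from the commit itself; only the constructor parameters, the environment-variable requirement, and the shape of the output follow the code in the diff.

```python
from typing import List

from haystack.components.evaluators import LLMEvaluator

# Illustrative metric: does the response answer the question? (names and data are assumptions)
evaluator = LLMEvaluator(
    instructions="Does the response answer the question? Answer with yes or no.",
    inputs=[("questions", List[str]), ("responses", List[str])],
    outputs=["score"],
    examples=[
        {
            "inputs": {"questions": "Who created Python?", "responses": "Guido van Rossum."},
            "outputs": {"score": 1},
        },
        {
            "inputs": {"questions": "Who created Python?", "responses": "Football is popular."},
            "outputs": {"score": 0},
        },
    ],
)

# Requires OPENAI_API_KEY to be set in the environment (the default API is "openai").
result = evaluator.run(
    questions=["Who created Python?"],
    responses=["Python was created by Guido van Rossum."],
)
# Expected shape (values depend on the LLM): {"results": [{"score": 1, "name": "llm"}]}
```
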
3 changes: 2 additions & 1 deletion haystack/components/evaluators/__init__.py
@@ -1,3 +1,4 @@
from .answer_exact_match import AnswerExactMatchEvaluator
from .llm_evaluator import LLMEvaluator

__all__ = ["AnswerExactMatchEvaluator"]
__all__ = ["AnswerExactMatchEvaluator", "LLMEvaluator"]
298 changes: 298 additions & 0 deletions haystack/components/evaluators/llm_evaluator.py
@@ -0,0 +1,298 @@
import json
from typing import Any, Dict, List, Tuple, Type

from haystack import component, default_from_dict, default_to_dict
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret, deserialize_secrets_inplace


@component
class LLMEvaluator:
    """
    Uses an LLM to evaluate inputs based on a prompt containing instructions and examples.

    The default API requires an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY".
    The inputs are lists that are user-defined depending on the desired metric.
    The output is a dictionary with a key `results` containing a list of evaluation results.
    Each result is a dictionary with user-defined keys and values of either 0 (FALSE) or 1 (TRUE).

    Usage example:
    ```python
    from typing import List
    from haystack.components.evaluators import LLMEvaluator

    evaluator = LLMEvaluator(
        instructions="Is this answer problematic for children?",
        inputs=[("responses", List[str])],
        outputs=["score"],
        # `examples` is a required parameter; the entries below are illustrative only.
        examples=[
            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
        ],
    )
    RESPONSES = [
        "Football is the most popular sport with around 4 billion followers worldwide",
        "Python language was created by Guido van Rossum.",
    ]
    results = evaluator.run(responses=RESPONSES)
    ```
    """

    def __init__(
        self,
        instructions: str,
        inputs: List[Tuple[str, Type[List]]],
        outputs: List[str],
        examples: List[Dict[str, Any]],
        *,
        api: str = "openai",
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
    ):
        """
        Creates an instance of LLMEvaluator.

        :param instructions:
            The prompt instructions to use for evaluation.
            Should be a question about the inputs that can be answered with yes or no.
        :param inputs:
            The inputs that the component expects as incoming connections and that it evaluates.
            Each input is a tuple of an input name and input type. Input types must be lists.
        :param outputs:
            Output names of the evaluation results. They correspond to keys in the output dictionary.
            A typical choice is a single key "score".
        :param examples:
            Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
            `outputs` parameters.
            Each example is a dictionary with keys "inputs" and "outputs".
            They contain the input and output as dictionaries respectively.
        :param api:
            The API to use for calling an LLM through a Generator.
            Supported APIs: "openai".
        :param api_key:
            The API key.
        """
        self.validate_init_parameters(inputs, outputs, examples)

        self.instructions = instructions
        self.inputs = inputs
        self.outputs = outputs
        self.examples = examples
        self.api = api
        self.api_key = api_key

        if api == "openai":
            self.generator = OpenAIGenerator(api_key=api_key)
        else:
            raise ValueError(f"Unsupported API: {api}")

        template = self.prepare_template()
        self.builder = PromptBuilder(template=template)

        component.set_input_types(self, **dict(inputs))

    def validate_init_parameters(
        self, inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]]
    ):
        """
        Validate the init parameters.

        :param inputs:
            The inputs to validate.
        :param outputs:
            The outputs to validate.
        :param examples:
            The examples to validate.
        :raises ValueError:
            If the inputs are not a list of tuples with a string and a type of list.
            If the outputs are not a list of strings.
            If the examples are not a list of dictionaries.
            If any example does not have keys "inputs" and "outputs" with values that are dictionaries with
            string keys.
        """
        # Validate inputs
        if (
            not isinstance(inputs, list)
            or not all(isinstance(input, tuple) for input in inputs)
            or not all(isinstance(input[0], str) and input[1] is not list and len(input) == 2 for input in inputs)
        ):
            msg = (
                f"LLM evaluator expects inputs to be a list of tuples. Each tuple must contain an input name and "
                f"type of list but received {inputs}."
            )
            raise ValueError(msg)

        # Validate outputs
        if not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs):
            msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}."
            raise ValueError(msg)

        # Validate examples are lists of dicts
        if not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples):
            msg = f"LLM evaluator expects examples to be a list of dictionaries but received {examples}."
            raise ValueError(msg)

        # Validate each example
        for example in examples:
            if (
                {"inputs", "outputs"} != example.keys()
                or not all(isinstance(example[param], dict) for param in ["inputs", "outputs"])
                or not all(isinstance(key, str) for param in ["inputs", "outputs"] for key in example[param])
            ):
                msg = (
                    f"LLM evaluator expects each example to have keys `inputs` and `outputs` with values that are "
                    f"dictionaries with str keys but received {example}."
                )
                raise ValueError(msg)
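    # Illustrative values (not part of the commit) showing what the checks above accept or reject:
    # Valid:   inputs=[("responses", List[str])], outputs=["score"],
    #          examples=[{"inputs": {"responses": "..."}, "outputs": {"score": 1}}]
    # Invalid: inputs=["responses"]           -> not a list of (name, type) tuples
    # Invalid: examples=[{"inputs": {...}}]   -> missing the "outputs" key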

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run the LLM evaluator.

        :param inputs:
            The input values to evaluate. The keys are the input names and the values are lists of input values.
        :returns:
            A dictionary with a single `results` entry that contains a list of results.
            Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
            and the evaluation results as the values.
        """
        self.validate_input_parameters(dict(self.inputs), inputs)

        # inputs is a dictionary with keys being input names and values being a list of input values
        # We need to iterate through the lists in parallel for all keys of the dictionary
        input_names, values = inputs.keys(), list(zip(*inputs.values()))
        list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]

        results = []
        for input_names_to_values in list_of_input_names_to_values:
            prompt = self.builder.run(**input_names_to_values)
            result = self.generator.run(prompt=prompt["prompt"])

            self.validate_outputs(expected=self.outputs, received=result["replies"][0])
            parsed_result = json.loads(result["replies"][0])
            parsed_result["name"] = "llm"
            results.append(parsed_result)

        return {"results": results}

    def prepare_template(self) -> str:
        """
        Combine instructions, inputs, outputs, and examples into one prompt template with the following format:
        Instructions:
        <instructions>
        Generate the response in JSON format with the following keys:
        <list of output keys>
        Consider the instructions and the examples below to determine those values.
        Examples:
        <examples>
        Inputs:
        <inputs>
        Outputs:

        :returns:
            The prompt template.
        """
        inputs_section = (
            "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
        )

        examples_section = "\n".join(
            [
                "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"])
                for example in self.examples
            ]
        )
        return (
            f"Instructions:\n"
            f"{self.instructions}\n\n"
            f"Generate the response in JSON format with the following keys:\n"
            f"{json.dumps(self.outputs)}\n"
            f"Consider the instructions and the examples below to determine those values.\n\n"
            f"Examples:\n"
            f"{examples_section}\n\n"
            f"Inputs:\n"
            f"{inputs_section}\n"
            f"Outputs:\n"
        )
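    # Illustrative rendered template (before PromptBuilder fills the Jinja placeholder) for
    # inputs=[("responses", List[str])], outputs=["score"], and one example -- assumed values,
    # not part of the commit:
    #
    #   Instructions:
    #   Is this answer problematic for children?
    #
    #   Generate the response in JSON format with the following keys:
    #   ["score"]
    #   Consider the instructions and the examples below to determine those values.
    #
    #   Examples:
    #   Inputs:
    #   {"responses": "Damn, this is straight outta hell!!!"}
    #   Outputs:
    #   {"score": 1}
    #
    #   Inputs:
    #   {"responses": {{ responses }}}
    #   Outputs: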

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        :returns:
            The serialized component as a dictionary.
        """
        return default_to_dict(
            self,
            instructions=self.instructions,
            inputs=self.inputs,
            outputs=self.outputs,
            examples=self.examples,
            api=self.api,
            api_key=self.api_key.to_dict(),
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator":
        """
        Deserialize this component from a dictionary.

        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
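    # Illustrative to_dict() output, following the usual default_to_dict structure; exact
    # values and the secret's serialized form are assumptions, not part of the commit:
    # {
    #     "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
    #     "init_parameters": {
    #         "instructions": "...", "inputs": [...], "outputs": ["score"], "examples": [...],
    #         "api": "openai", "api_key": {...},
    #     },
    # }
    # LLMEvaluator.from_dict(data) rebuilds the component after restoring the secret in place.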

    @staticmethod
    def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None:
        """
        Validate the input parameters.

        :param expected:
            The expected input parameters.
        :param received:
            The received input parameters.
        :raises ValueError:
            If not all expected inputs are present in the received inputs.
            If the received inputs are not lists or have different lengths.
        """
        # Validate that all expected inputs are present in the received inputs
        for param in expected.keys():
            if param not in received:
                msg = f"LLM evaluator expected input parameter '{param}' but received only {received.keys()}."
                raise ValueError(msg)

        # Validate that all received inputs are lists
        if not all(isinstance(input, list) for input in received.values()):
            msg = f"LLM evaluator expects all input values to be lists but received {[type(input) for input in received.values()]}."
            raise ValueError(msg)

        # Validate that all received inputs are of the same length
        inputs = received.values()
        length = len(next(iter(inputs)))
        if not all(len(input) == length for input in inputs):
            msg = (
                f"LLM evaluator expects all input lists to have the same length but received {inputs} with lengths "
                f"{[len(input) for input in inputs]}."
            )
            raise ValueError(msg)

    @staticmethod
    def validate_outputs(expected: List[str], received: str) -> None:
        """
        Validate the output.

        :param expected:
            Names of expected outputs.
        :param received:
            Names of received outputs.
        :raises ValueError:
            If not all expected outputs are present in the received outputs.
        """
        parsed_output = json.loads(received)
        if not all(output in parsed_output for output in expected):
            msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}."
            raise ValueError(msg)
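    # Illustrative check by validate_outputs (assumed replies, not part of the commit):
    # for expected=["score"], received='{"score": 1}' passes, while received='{"rating": 1}'
    # raises ValueError because the "score" key is missing from the parsed JSON.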
4 changes: 4 additions & 0 deletions releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml
@@ -0,0 +1,4 @@
---
features:
- |
Add a new LLMEvaluator component that leverages LLMs through the OpenAI API to evaluate pipelines.