refactor!: UptrainEvaluator contexts are now passed as nested lists. (deepset-ai#343)

* refactor!: Contexts are now passed as nested lists.

The evaluator will automatically flatten them. This change lets other evaluators that take nested lists for contexts (like DeepEval) share the same basic interface; a usage sketch follows the changed-files summary below.

* Update example

* Update docstring
shadeMe committed Feb 6, 2024
1 parent 6d18bc4 commit 32b6cc6
Showing 4 changed files with 35 additions and 19 deletions.
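To make the breaking change concrete, here is a minimal sketch of running the evaluator with the new input shape. The import paths, class names, and constructor arguments are assumed from the rest of the UpTrain integration rather than taken from this diff; the only thing this commit actually changes is the shape of 'contexts'.

# Illustrative sketch only; import paths and constructor arguments are assumptions.
# An OpenAI API key is assumed to be available in the environment for the default backend.
from haystack import Pipeline
from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

pipeline = Pipeline()
pipeline.add_component("evaluator", UpTrainEvaluator(metric=UpTrainMetric.CONTEXT_RELEVANCE))

results = pipeline.run(
    {
        "evaluator": {
            "questions": [
                "Which is the most popular global sport?",
                "Who created the Python language?",
            ],
            # One list of context strings per question; the evaluator flattens
            # each inner list itself before handing it to UpTrain.
            "contexts": [
                ["Football is the world's most popular sport ..."],
                ["Python was created by Guido van Rossum in the late 1980s ..."],
            ],
        }
    }
)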
8 changes: 6 additions & 2 deletions integrations/uptrain/example/example.py
@@ -8,8 +8,12 @@
     "Who created the Python language?",
 ]
 CONTEXTS = [
-    "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.",
-    "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects.",
+    [
+        "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people."
+    ],
+    [
+        "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
+    ],
 ]
 RESPONSES = [
     "Football is the most popular sport with around 4 billion followers worldwide",
@@ -81,7 +81,7 @@ def run(self, **inputs) -> Dict[str, Any]:
         # UpTrainMetric class' documentation for more details.
         output = pipeline.run({"evaluator": {
             "questions": ["question"],
-            "contexts": ["context"],
+            "contexts": [["context", "another context"]],
             "responses": ["response"]
         }})
         ```
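For existing pipelines, migrating to the new interface amounts to wrapping each context in its own list; a question may now also carry several supporting passages. A minimal before/after of the run inputs, using the same placeholder values as the docstring above:

# Before this commit: one context string per question.
old_inputs = {"questions": ["question"], "contexts": ["context"], "responses": ["response"]}

# After this commit: one list of context strings per question.
new_inputs = {"questions": ["question"], "contexts": [["context", "another context"]], "responses": ["response"]}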
@@ -15,11 +15,11 @@ class UpTrainMetric(Enum):
     """
 
     #: Context relevance.
-    #: Inputs - `questions: List[str], contexts: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]]`
     CONTEXT_RELEVANCE = "context_relevance"
 
     #: Factual accuracy.
-    #: Inputs - `questions: List[str], contexts: List[str], responses: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     FACTUAL_ACCURACY = "factual_accuracy"
 
     #: Response relevance.
@@ -31,11 +31,11 @@ class UpTrainMetric(Enum):
     RESPONSE_COMPLETENESS = "response_completeness"
 
     #: Response completeness with respect to context.
-    #: Inputs - `questions: List[str], contexts: List[str], responses: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     RESPONSE_COMPLETENESS_WRT_CONTEXT = "response_completeness_wrt_context"
 
     #: Response consistency.
-    #: Inputs - `questions: List[str], contexts: List[str], responses: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     RESPONSE_CONSISTENCY = "response_consistency"
 
     #: Response conciseness.
@@ -174,8 +174,8 @@ def _validate_input_elements(**kwargs):
                     f"got '{type(collection).__name__}' instead"
                 )
                 raise ValueError(msg)
-            elif not all(isinstance(x, str) for x in collection):
-                msg = f"UpTrain evaluator expects inputs to be of type 'str' in '{k}'"
+            elif not all(isinstance(x, str) for x in collection) and not all(isinstance(x, list) for x in collection):
+                msg = f"UpTrain evaluator expects inputs to be of type 'str' or 'list' in '{k}'"
                 raise ValueError(msg)
 
         same_length = len({len(x) for x in kwargs.values()}) == 1
@@ -190,21 +190,28 @@ def validate_input_parameters(metric: UpTrainMetric, expected: Dict[str, Any], r
                 msg = f"UpTrain evaluator expected input parameter '{param}' for metric '{metric}'"
                 raise ValueError(msg)
 
+    @staticmethod
+    def _convert_contexts(contexts: List[List[str]]) -> List[str]:
+        if not all(isinstance(x, list) for x in contexts):
+            msg = "UpTrain evaluator expected 'contexts' to be a nested list of strings"
+            raise ValueError(msg)
+        return ["\n\n".join(c) for c in contexts]
+
     @staticmethod
     def question_context_response(
-        questions: List[str], contexts: List[str], responses: List[str]
+        questions: List[str], contexts: List[List[str]], responses: List[str]
     ) -> Iterable[Dict[str, str]]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
-        for q, c, r in zip(questions, contexts, responses):  # type: ignore
+        for q, c, r in zip(questions, InputConverters._convert_contexts(contexts), responses):  # type: ignore
             yield {"question": q, "context": c, "response": r}
 
     @staticmethod
     def question_context(
         questions: List[str],
-        contexts: List[str],
+        contexts: List[List[str]],
     ) -> Iterable[Dict[str, str]]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts)
-        for q, c in zip(questions, contexts):  # type: ignore
+        for q, c in zip(questions, InputConverters._convert_contexts(contexts)):  # type: ignore
             yield {"question": q, "context": c}
 
     @staticmethod
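The new '_convert_contexts' helper is what keeps the rest of the UpTrain plumbing unchanged: each question's list of passages is joined into a single context string before it reaches UpTrain. Below is a tiny standalone sketch of that behaviour, re-implementing the one-liner from the diff rather than importing the private helper:

from typing import List

def convert_contexts(contexts: List[List[str]]) -> List[str]:
    # Mirror of the helper added above: concatenate each question's passages
    # into one string, separated by a blank line.
    return ["\n\n".join(c) for c in contexts]

assert convert_contexts([["passage a", "passage b"], ["passage c"]]) == [
    "passage a\n\npassage b",
    "passage c",
]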
17 changes: 11 additions & 6 deletions integrations/uptrain/tests/test_evaluator.py
@@ -14,8 +14,12 @@
     "Who created the Python language?",
 ]
 DEFAULT_CONTEXTS = [
-    "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.",
-    "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects.",
+    [
+        "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact.",
+        "Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.",
+    ],
+    [
+        "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
+    ],
 ]
 DEFAULT_RESPONSES = [
     "Football is the most popular sport with around 4 billion followers worldwide",
@@ -238,24 +243,24 @@ def test_evaluator_invalid_inputs(os_environ_get, metric, inputs, error_string,
 @pytest.mark.parametrize(
     "metric, inputs, expected_outputs, metric_params",
     [
-        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": ["q1"], "contexts": ["c1"]}, [[(None, 0.5, "1")]], None),
+        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": ["q1"], "contexts": [["c1"]]}, [[(None, 0.5, "1")]], None),
         (
             UpTrainMetric.FACTUAL_ACCURACY,
-            {"questions": ["q2"], "contexts": ["c2"], "responses": ["r2"]},
+            {"questions": ["q2"], "contexts": [["c2"]], "responses": ["r2"]},
             [[(None, 1.0, "2")]],
             None,
         ),
         (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": ["q3"], "responses": ["r3"]}, [[(None, 1.0, "3")]], None),
         (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": ["q4"], "responses": ["r4"]}, [[(None, 0.5, "4")]], None),
         (
             UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT,
-            {"questions": ["q5"], "contexts": ["c5"], "responses": ["r5"]},
+            {"questions": ["q5"], "contexts": [["c5"]], "responses": ["r5"]},
             [[(None, 1.0, "5")]],
             None,
         ),
         (
             UpTrainMetric.RESPONSE_CONSISTENCY,
-            {"questions": ["q6"], "contexts": ["c6"], "responses": ["r6"]},
+            {"questions": ["q6"], "contexts": [["c6"]], "responses": ["r6"]},
             [[(None, 0.9, "6")]],
             None,
         ),