refactor!: UptrainEvaluator contexts are now passed as nested lists. (deepset-ai#343)

* refactor!: Contexts are now passed as nested lists.

The evaluator will automatically flatten them. This change lets other evaluators that take nested lists for contexts (like DeepEval) share the same basic interface; a usage sketch follows the changed-files summary below.

* Update example

* Update docstring
shadeMe committed Feb 6, 2024
1 parent 6d18bc4 commit 32b6cc6
Showing 4 changed files with 35 additions and 19 deletions.
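To make the breaking change concrete, here is a minimal sketch of running the evaluator with the new input shape. The import paths, class names, and constructor arguments are assumed from the rest of the UpTrain integration rather than taken from this diff; the only thing this commit actually changes is the shape of 'contexts'.

# Illustrative sketch only; import paths and constructor arguments are assumptions.
# An OpenAI API key is assumed to be available in the environment for the default backend.
from haystack import Pipeline
from haystack_integrations.components.evaluators.uptrain import UpTrainEvaluator, UpTrainMetric

pipeline = Pipeline()
pipeline.add_component("evaluator", UpTrainEvaluator(metric=UpTrainMetric.CONTEXT_RELEVANCE))

results = pipeline.run(
    {
        "evaluator": {
            "questions": [
                "Which is the most popular global sport?",
                "Who created the Python language?",
            ],
            # One list of context strings per question; the evaluator flattens
            # each inner list itself before handing it to UpTrain.
            "contexts": [
                ["Football is the world's most popular sport ..."],
                ["Python was created by Guido van Rossum in the late 1980s ..."],
            ],
        }
    }
)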
8 changes: 6 additions & 2 deletions integrations/uptrain/example/example.py
@@ -8,8 +8,12 @@
     "Who created the Python language?",
 ]
 CONTEXTS = [
-    "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.",
-    "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects.",
+    [
+        "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people."
+    ],
+    [
+        "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
+    ],
 ]
 RESPONSES = [
     "Football is the most popular sport with around 4 billion followers worldwide",
@@ -81,7 +81,7 @@ def run(self, **inputs) -> Dict[str, Any]:
         # UpTrainMetric class' documentation for more details.
         output = pipeline.run({"evaluator": {
             "questions": ["question"],
-            "contexts": ["context"],
+            "contexts": [["context", "another context"]],
             "responses": ["response"]
         }})
         ```
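For existing pipelines, migrating to the new interface amounts to wrapping each context in its own list; a question may now also carry several supporting passages. A minimal before/after of the run inputs, using the same placeholder values as the docstring above:

# Before this commit: one context string per question.
old_inputs = {"questions": ["question"], "contexts": ["context"], "responses": ["response"]}

# After this commit: one list of context strings per question.
new_inputs = {"questions": ["question"], "contexts": [["context", "another context"]], "responses": ["response"]}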
@@ -15,11 +15,11 @@ class UpTrainMetric(Enum):
     """
 
     #: Context relevance.
-    #: Inputs - `questions: List[str], contexts: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]]`
     CONTEXT_RELEVANCE = "context_relevance"
 
     #: Factual accuracy.
-    #: Inputs - `questions: List[str], contexts: List[str], responses: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     FACTUAL_ACCURACY = "factual_accuracy"
 
     #: Response relevance.
@@ -31,11 +31,11 @@ class UpTrainMetric(Enum):
     RESPONSE_COMPLETENESS = "response_completeness"
 
     #: Response completeness with respect to context.
-    #: Inputs - `questions: List[str], contexts: List[str], responses: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     RESPONSE_COMPLETENESS_WRT_CONTEXT = "response_completeness_wrt_context"
 
     #: Response consistency.
-    #: Inputs - `questions: List[str], contexts: List[str], responses: List[str]`
+    #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     RESPONSE_CONSISTENCY = "response_consistency"
 
     #: Response conciseness.
@@ -174,8 +174,8 @@ def _validate_input_elements(**kwargs):
                     f"got '{type(collection).__name__}' instead"
                 )
                 raise ValueError(msg)
-            elif not all(isinstance(x, str) for x in collection):
-                msg = f"UpTrain evaluator expects inputs to be of type 'str' in '{k}'"
+            elif not all(isinstance(x, str) for x in collection) and not all(isinstance(x, list) for x in collection):
+                msg = f"UpTrain evaluator expects inputs to be of type 'str' or 'list' in '{k}'"
                 raise ValueError(msg)
 
         same_length = len({len(x) for x in kwargs.values()}) == 1
@@ -190,21 +190,28 @@ def validate_input_parameters(metric: UpTrainMetric, expected: Dict[str, Any], r
                 msg = f"UpTrain evaluator expected input parameter '{param}' for metric '{metric}'"
                 raise ValueError(msg)
 
+    @staticmethod
+    def _convert_contexts(contexts: List[List[str]]) -> List[str]:
+        if not all(isinstance(x, list) for x in contexts):
+            msg = "UpTrain evaluator expected 'contexts' to be a nested list of strings"
+            raise ValueError(msg)
+        return ["\n\n".join(c) for c in contexts]
+
     @staticmethod
     def question_context_response(
-        questions: List[str], contexts: List[str], responses: List[str]
+        questions: List[str], contexts: List[List[str]], responses: List[str]
     ) -> Iterable[Dict[str, str]]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
-        for q, c, r in zip(questions, contexts, responses):  # type: ignore
+        for q, c, r in zip(questions, InputConverters._convert_contexts(contexts), responses):  # type: ignore
             yield {"question": q, "context": c, "response": r}
 
     @staticmethod
     def question_context(
         questions: List[str],
-        contexts: List[str],
+        contexts: List[List[str]],
     ) -> Iterable[Dict[str, str]]:
         InputConverters._validate_input_elements(questions=questions, contexts=contexts)
-        for q, c in zip(questions, contexts):  # type: ignore
+        for q, c in zip(questions, InputConverters._convert_contexts(contexts)):  # type: ignore
             yield {"question": q, "context": c}
 
     @staticmethod
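The new '_convert_contexts' helper is what keeps the rest of the UpTrain plumbing unchanged: each question's list of passages is joined into a single context string before it reaches UpTrain. Below is a tiny standalone sketch of that behaviour, re-implementing the one-liner from the diff rather than importing the private helper:

from typing import List

def convert_contexts(contexts: List[List[str]]) -> List[str]:
    # Mirror of the helper added above: concatenate each question's passages
    # into one string, separated by a blank line.
    return ["\n\n".join(c) for c in contexts]

assert convert_contexts([["passage a", "passage b"], ["passage c"]]) == [
    "passage a\n\npassage b",
    "passage c",
]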
17 changes: 11 additions & 6 deletions integrations/uptrain/tests/test_evaluator.py
@@ -14,8 +14,12 @@
     "Who created the Python language?",
 ]
 DEFAULT_CONTEXTS = [
-    "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.",
-    "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects.",
+    [
+        "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact.",
+        "Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people.",
+    ],
+    [
+        "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
+    ],
 ]
 DEFAULT_RESPONSES = [
     "Football is the most popular sport with around 4 billion followers worldwide",
@@ -238,24 +243,24 @@ def test_evaluator_invalid_inputs(os_environ_get, metric, inputs, error_string,
 @pytest.mark.parametrize(
     "metric, inputs, expected_outputs, metric_params",
     [
-        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": ["q1"], "contexts": ["c1"]}, [[(None, 0.5, "1")]], None),
+        (UpTrainMetric.CONTEXT_RELEVANCE, {"questions": ["q1"], "contexts": [["c1"]]}, [[(None, 0.5, "1")]], None),
         (
             UpTrainMetric.FACTUAL_ACCURACY,
-            {"questions": ["q2"], "contexts": ["c2"], "responses": ["r2"]},
+            {"questions": ["q2"], "contexts": [["c2"]], "responses": ["r2"]},
             [[(None, 1.0, "2")]],
             None,
         ),
         (UpTrainMetric.RESPONSE_RELEVANCE, {"questions": ["q3"], "responses": ["r3"]}, [[(None, 1.0, "3")]], None),
         (UpTrainMetric.RESPONSE_COMPLETENESS, {"questions": ["q4"], "responses": ["r4"]}, [[(None, 0.5, "4")]], None),
         (
             UpTrainMetric.RESPONSE_COMPLETENESS_WRT_CONTEXT,
-            {"questions": ["q5"], "contexts": ["c5"], "responses": ["r5"]},
+            {"questions": ["q5"], "contexts": [["c5"]], "responses": ["r5"]},
             [[(None, 1.0, "5")]],
             None,
         ),
         (
             UpTrainMetric.RESPONSE_CONSISTENCY,
-            {"questions": ["q6"], "contexts": ["c6"], "responses": ["r6"]},
+            {"questions": ["q6"], "contexts": [["c6"]], "responses": ["r6"]},
             [[(None, 0.9, "6")]],
             None,
         ),