Fix YAML pipeline paths in docker-compose.yml (#2335)

* Rename YAML files in docker-compose files
* Make read_pipeline_config_from_yaml fail on wrong path
* Validate indexing config in rest api
* Update Documentation & Code Style
* Add note about autocompletion of YAML

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
deepset-ai · Mar 21, 2022 · 5454d57
1 parent 853c360
commit 5454d57
Show file tree

Hide file tree

Showing 10 changed files with 35 additions and 10 deletions.
diff --git a/docker-compose-gpu.yml b/docker-compose-gpu.yml
@@ -23,7 +23,7 @@ services:
  environment:
  # See rest_api/pipelines.yaml for configurations of Search & Indexing Pipeline.
  - DOCUMENTSTORE_PARAMS_HOST=elasticsearch
- - PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines_dpr.yaml
+ - PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines_dpr.haystack-pipeline.yml
  - CONCURRENT_REQUEST_PER_WORKER
  depends_on:
  - elasticsearch

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -14,7 +14,7 @@ services:
  environment:
  # See rest_api/pipelines.yaml for configurations of Search & Indexing Pipeline.
  - DOCUMENTSTORE_PARAMS_HOST=elasticsearch
- - PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines.yaml
+ - PIPELINE_YAML_PATH=/home/user/rest_api/pipeline/pipelines.haystack-pipeline.yml
  - CONCURRENT_REQUEST_PER_WORKER
  depends_on:
  - elasticsearch

diff --git a/docs/_src/api/api/evaluation.md b/docs/_src/api/api/evaluation.md
@@ -123,7 +123,7 @@ Print the evaluation results
 #### semantic\_answer\_similarity
 
 ```python
-def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2") -> Tuple[List[float], List[float]]
+def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True) -> Tuple[List[float], List[float]]
 ```
 
 Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
@@ -137,6 +137,9 @@ Returns per QA pair a) the similarity of the most likely prediction (top 1) to a
 - `gold_labels`: Labels as list of multiple possible answers per question
 - `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string
 pointing to downloadable models.
+- `batch_size`: Number of prediction label pairs to encode at once.
+- `use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
 
 **Returns**:
 

diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md
@@ -466,6 +466,9 @@ If you use custom cross encoders please make sure they work with sentence_transf
 - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
 - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
 - Large model for German only: "deepset/gbert-large-sts"
+- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
+- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
 - `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
 This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
 If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.

diff --git a/haystack/nodes/evaluator/evaluator.py b/haystack/nodes/evaluator/evaluator.py
@@ -394,7 +394,7 @@ def semantic_answer_similarity(
  gold_labels: List[List[str]],
  sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
  batch_size: int = 32,
- use_gpu: bool = True
+ use_gpu: bool = True,
 ) -> Tuple[List[float], List[float]]:
  """
  Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
@@ -416,8 +416,8 @@ def semantic_answer_similarity(
  cross_encoder_used = False
  if config.architectures is not None:
  cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
- 
- device = None if use_gpu else 'cpu'
+
+ device = None if use_gpu else "cpu"
 
  # Compute similarities
  top_1_sas = []

diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py
@@ -768,8 +768,11 @@ def eval(
  gold_labels = df["gold_answers"].values
  predictions = [[a] for a in df["answer"].values]
  sas, _ = semantic_answer_similarity(
- predictions=predictions, gold_labels=gold_labels, sas_model_name_or_path=sas_model_name_or_path,
- batch_size=sas_batch_size, use_gpu=sas_use_gpu
+ predictions=predictions,
+ gold_labels=gold_labels,
+ sas_model_name_or_path=sas_model_name_or_path,
+ batch_size=sas_batch_size,
+ use_gpu=sas_use_gpu,
  )
  df["sas"] = sas
 

diff --git a/haystack/pipelines/config.py b/haystack/pipelines/config.py
@@ -69,7 +69,13 @@ def get_component_definitions(pipeline_config: Dict[str, Any], overwrite_with_en
  return component_definitions
 
 
-def read_pipeline_config_from_yaml(path: Path):
+def read_pipeline_config_from_yaml(path: Path) -> Dict[str, Any]:
+ """
+ Parses YAML files into Python objects.
+ Fails if the file does not exist.
+ """
+ if not os.path.isfile(path):
+ raise FileNotFoundError(f"Not found: {path}")
  with open(path, "r", encoding="utf-8") as stream:
  return yaml.safe_load(stream)
 

diff --git a/rest_api/controller/file_upload.py b/rest_api/controller/file_upload.py
@@ -12,7 +12,12 @@
 
 from haystack.pipelines.base import Pipeline
 from haystack.errors import PipelineConfigError
-from haystack.pipelines.config import get_component_definitions, get_pipeline_definition, read_pipeline_config_from_yaml
+from haystack.pipelines.config import (
+ get_component_definitions,
+ get_pipeline_definition,
+ read_pipeline_config_from_yaml,
+ validate_config,
+)
 from rest_api.config import PIPELINE_YAML_PATH, FILE_UPLOAD_PATH, INDEXING_PIPELINE_NAME
 from rest_api.controller.utils import as_form
 
@@ -22,6 +27,7 @@
 
 try:
  pipeline_config = read_pipeline_config_from_yaml(Path(PIPELINE_YAML_PATH))
+ validate_config(pipeline_config)
  pipeline_definition = get_pipeline_definition(pipeline_config=pipeline_config, pipeline_name=INDEXING_PIPELINE_NAME)
  component_definitions = get_component_definitions(
  pipeline_config=pipeline_config, overwrite_with_env_variables=True

diff --git a/rest_api/pipeline/pipelines.haystack-pipeline.yml b/rest_api/pipeline/pipelines.haystack-pipeline.yml