Commit

adding results
davidsbatista committed May 30, 2024
1 parent b0dea0b commit 993514a
Showing 174 changed files with 9,279 additions and 8 deletions.
47 changes: 47 additions & 0 deletions evaluations/README.md
@@ -0,0 +1,47 @@
# Evaluations

| Dataset and Evaluation | Colab |
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| RAG over ARAGOG dataset | <a href="https://colab.research.google.com/github/deepset-ai/haystack-evaluation/blob/main/evaluations/evaluation_aragog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> |



## ARAGOG

This dataset is based on the paper [Advanced Retrieval Augmented Generation Output Grading (ARAGOG)](https://arxiv.org/pdf/2404.01037).
It is a collection of arXiv papers on Transformers and Large Language Models, all in PDF format.

The dataset contains:
- 13 PDF papers
- 107 questions and answers, generated with the assistance of GPT-4 and validated/corrected by humans

It has human annotations for the following metrics:
- [ContextRelevance](https://docs.haystack.deepset.ai/docs/contextrelevanceevaluator)
- [Faithfulness](https://docs.haystack.deepset.ai/docs/faithfulnessevaluator)
- [Semantic Answer Similarity](https://docs.haystack.deepset.ai/docs/sasevaluator)

Check the [RAG over ARAGOG dataset notebook](aragog_evaluation.ipynb) for an example.
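
For a quick orientation, here is a minimal sketch of how these three metrics can be computed with Haystack's evaluator components (this is not the notebook's exact code): the questions, contexts, and answers are made-up placeholders, the SAS model choice is arbitrary, and the two LLM-based evaluators assume an OpenAI API key in `OPENAI_API_KEY`.

```python
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    FaithfulnessEvaluator,
    SASEvaluator,
)

# Placeholder inputs; in the notebook they come from the ARAGOG ground truth
# and from running the RAG pipeline over the indexed papers.
questions = ["What is the purpose of positional encodings in the Transformer?"]
ground_truth_answers = ["They inject information about the order of the tokens."]
retrieved_contexts = [["Since the model contains no recurrence, positional encodings are added ..."]]
predicted_answers = ["They give the model information about token order."]

# LLM-based evaluators (use OpenAI by default and read OPENAI_API_KEY from the environment).
context_relevance = ContextRelevanceEvaluator()
faithfulness = FaithfulnessEvaluator()

# Embedding-based evaluator; warm_up() loads the sentence-transformers model.
sas = SASEvaluator(model="sentence-transformers/all-MiniLM-L6-v2")
sas.warm_up()

results = {
    "context_relevance": context_relevance.run(questions=questions, contexts=retrieved_contexts),
    "faithfulness": faithfulness.run(
        questions=questions, contexts=retrieved_contexts, predicted_answers=predicted_answers
    ),
    "sas": sas.run(ground_truth_answers=ground_truth_answers, predicted_answers=predicted_answers),
}

for name, result in results.items():
    print(name, result["score"])  # each evaluator also returns per-sample "individual_scores"
```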


---

## SQuAD dataset

SQuAD (Stanford Question Answering Dataset) is a collection of questions and answers based on Wikipedia articles.
It is typically used for training and evaluating models for extractive question answering.

The dataset contains:
- 490 Wikipedia articles in text format
- 98k questions whose answers are spans in the articles

It contains human annotations suitable for the following metrics:
- [Answer Exact Match](https://docs.haystack.deepset.ai/docs/answerexactmatchevaluator)
- [DocumentMRR](https://docs.haystack.deepset.ai/docs/documentmrrevaluator)
- [DocumentMAP](https://docs.haystack.deepset.ai/docs/documentmapevaluator)
- [DocumentRecall](https://docs.haystack.deepset.ai/docs/documentrecallevaluator)
- [Semantic Answer Similarity](https://docs.haystack.deepset.ai/docs/sasevaluator)


Check the [RAG over SQuAD notebook](squad_rag_evaluation.ipynb) and the [Extractive QA over SQuAD notebook](squad_extractive_qa_evaluation.ipynb) for examples.
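
As a rough sketch (again, not the notebooks' exact code), the document- and answer-level metrics above can be computed directly from the ground truth and the pipeline outputs with Haystack's evaluator components; the documents and answers below are placeholders.

```python
from haystack import Document
from haystack.components.evaluators import (
    AnswerExactMatchEvaluator,
    DocumentMAPEvaluator,
    DocumentMRREvaluator,
    DocumentRecallEvaluator,
)

# Placeholders standing in for the SQuAD annotations and the retriever/reader outputs.
ground_truth_docs = [[Document(content="Normandy is a region in France.")]]
retrieved_docs = [[
    Document(content="Normandy is a region in France."),
    Document(content="The Normans were originally Norse raiders."),
]]
ground_truth_answers = ["France"]
predicted_answers = ["France"]

document_evaluators = {
    "doc_mrr": DocumentMRREvaluator(),
    "doc_map": DocumentMAPEvaluator(),
    "doc_recall": DocumentRecallEvaluator(),  # single-hit recall by default
}
for name, evaluator in document_evaluators.items():
    result = evaluator.run(
        ground_truth_documents=ground_truth_docs, retrieved_documents=retrieved_docs
    )
    print(name, result["score"])

exact_match = AnswerExactMatchEvaluator().run(
    ground_truth_answers=ground_truth_answers, predicted_answers=predicted_answers
)
print("exact_match", exact_match["score"])
```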
12 changes: 6 additions & 6 deletions evaluation_arago.py → evaluations/evaluation_aragog.py
@@ -110,19 +110,19 @@ def parameter_tuning(questions, answers):
     out_path.mkdir(exist_ok=True)
 
     for embedding_model in embedding_models:
-        for top_k in top_k_values:
-            for chunk_size in chunk_sizes:
+        for chunk_size in chunk_sizes:
+            print("Indexing documents")
+            doc_store = indexing(embedding_model, chunk_size)
+            for top_k in top_k_values:
                 name_params = f"{embedding_model.split('/')[-1]}__top_k:{top_k}__chunk_size:{chunk_size}"
                 print(name_params)
-                print("Indexing documents")
-                doc_store = indexing(embedding_model, chunk_size)
                 print("Running RAG pipeline")
                 retrieved_contexts, predicted_answers = run_basic_rag(doc_store, questions, embedding_model, top_k)
                 print(f"Running evaluation")
                 results, inputs = run_evaluation(questions, answers, retrieved_contexts, predicted_answers, embedding_model)
                 eval_results = EvaluationRunResult(run_name=name_params, inputs=inputs, results=results)
-                eval_results.score_report().to_csv(f"{out_path}/score_report_{name_params}.csv")
-                eval_results.to_pandas().to_csv(f"{out_path}/detailed_{name_params}.csv")
+                eval_results.score_report().to_csv(f"{out_path}/score_report_{name_params}.csv", index=False)
+                eval_results.to_pandas().to_csv(f"{out_path}/detailed_{name_params}.csv", index=False)
 
 
 def main():
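
The refactored loop above builds each document store once per embedding model and chunk size (instead of rebuilding it for every top_k value) and writes one score report CSV per parameter combination, now without the pandas index column. As a hypothetical follow-up, not part of this commit and assuming a `results/` output directory, the reports could be collected into a single comparison table:

```python
from pathlib import Path

import pandas as pd

# Assumed location of the score_report_*.csv files written by parameter_tuning();
# the exact columns depend on EvaluationRunResult.score_report().
out_path = Path("results")

reports = []
for csv_file in sorted(out_path.glob("score_report_*.csv")):
    df = pd.read_csv(csv_file)
    # Remember which parameter combination produced this report.
    df["run"] = csv_file.stem.removeprefix("score_report_")
    reports.append(df)

comparison = pd.concat(reports, ignore_index=True)
print(comparison)
```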
@@ -77,13 +77,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "c79d6283-c639-474e-bb19-0be485ce1df9",
    "metadata": {},
    "outputs": [],
    "source": [
     "from tqdm import tqdm\n",
-    "from architectures.extractive_qa import get_extractive_qa_pipeline\n",
+    "from architectures.basic_rag import basic_rag\n",
     "\n",
     "def run_extractive_qa(doc_store, questions, embedding_model, top_k_retriever):\n",
     "\n",
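
The notebook hunk above edits the cell that defines `run_extractive_qa`, the helper that answers questions with spans extracted from retrieved documents. Independent of the repo's `architectures` module, a self-contained sketch of span extraction with Haystack's `ExtractiveReader` looks roughly like this; the model name and documents are only examples.

```python
from haystack import Document
from haystack.components.readers import ExtractiveReader

# Example documents standing in for whatever the retriever returns.
docs = [
    Document(content="The Eiffel Tower is located in Paris, France."),
    Document(content="The Colosseum is located in Rome, Italy."),
]

# Any extractive QA model from the Hugging Face Hub works here.
reader = ExtractiveReader(model="deepset/roberta-base-squad2")
reader.warm_up()

result = reader.run(query="Where is the Eiffel Tower?", documents=docs, top_k=2)
for answer in result["answers"]:
    # Each answer carries the extracted span (data) and a confidence score;
    # the reader may also include a "no answer" entry with data=None.
    print(answer.data, answer.score)
```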
