
Commit

cleaning and fixing aragog script and adding squad rag notebook
davidsbatista committed May 30, 2024
1 parent 993514a commit 62c3f6c
Showing 2 changed files with 298 additions and 16 deletions.
66 changes: 50 additions & 16 deletions evaluations/evaluation_aragog.py
@@ -1,5 +1,7 @@
import argparse
import json
import os
import random
from pathlib import Path
from typing import Tuple, List

@@ -17,12 +19,15 @@
from tqdm import tqdm

from architectures.basic_rag import basic_rag
from utils import timeit
from utils.utils import timeit

base_path = "../datasets/ARAGOG/"


@timeit
def indexing(embedding_model: str, chunk_size: int):
files_path = "datasets/ARAGOG/papers_for_questions"
full_path = Path(base_path)
files_path = full_path / "papers_for_questions"
document_store = InMemoryDocumentStore()
pipeline = Pipeline()
pipeline.add_component("converter", PyPDFToDocument())
@@ -34,14 +39,14 @@ def indexing(embedding_model: str, chunk_size: int):
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")
pdf_files = [files_path+"/"+f_name for f_name in os.listdir(files_path)]
pdf_files = [full_path / "papers_for_questions" / f_name for f_name in os.listdir(files_path)]
pipeline.run({"converter": {"sources": pdf_files}})

return document_store


def read_question_answers() -> Tuple[List[str], List[str]]:
with open("datasets/ARAGOG/eval_questions.json", "r") as f:
with open(base_path + "eval_questions.json", "r") as f:
data = json.load(f)
questions = data["questions"]
answers = data["ground_truths"]
@@ -75,23 +80,33 @@ def run_basic_rag(doc_store, sample_questions, embedding_model, top_k):

@timeit
def run_evaluation(sample_questions, sample_answers, retrieved_contexts, predicted_answers, embedding_model):
context_relevance = ContextRelevanceEvaluator(raise_on_failure=False)
faithfulness = FaithfulnessEvaluator(raise_on_failure=False)
sas = SASEvaluator(model=embedding_model)
sas.warm_up()
eval_pipeline = Pipeline()
eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator(raise_on_failure=False))
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator(raise_on_failure=False))
eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))

eval_pipeline_results = eval_pipeline.run(
{
"context_relevance": {"questions": sample_questions, "contexts": retrieved_contexts},
"faithfulness": {
"questions": sample_questions, "contexts": retrieved_contexts, "predicted_answers": predicted_answers
},
"sas": {"predicted_answers": predicted_answers, "ground_truth_answers": sample_answers},
}
)

results = {
"context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
"faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
"sas": sas.run(predicted_answers, sample_answers),
"context_relevance": eval_pipeline_results['context_relevance'],
"faithfulness": eval_pipeline_results['faithfulness'],
"sas": eval_pipeline_results['sas']
}

inputs = {'questions': sample_questions, "true_answers": sample_answers, "predicted_answers": predicted_answers}
inputs = {'questions': sample_questions, 'true_answers': sample_answers, 'predicted_answers': predicted_answers}

return results, inputs


def parameter_tuning(questions, answers):
def parameter_tuning(questions, answers, out_path: str):
"""
Run the basic RAG model with different parameters, and evaluate the results.
@@ -106,12 +121,12 @@ def parameter_tuning(questions, answers):
chunk_sizes = [64, 128, 256]

# create results directory if it does not exist using Pathlib
out_path = Path("aragog_results")
out_path = Path(out_path)
out_path.mkdir(exist_ok=True)

for embedding_model in embedding_models:
for chunk_size in chunk_sizes:
print("Indexing documents")
print(f"Indexing documents with {embedding_model} model with a chunk_size={chunk_size}")
doc_store = indexing(embedding_model, chunk_size)
for top_k in top_k_values:
name_params = f"{embedding_model.split('/')[-1]}__top_k:{top_k}__chunk_size:{chunk_size}"
@@ -125,9 +140,28 @@ def parameter_tuning(questions, answers):
eval_results.to_pandas().to_csv(f"{out_path}/detailed_{name_params}.csv", index=False)


def create_args():
parser = argparse.ArgumentParser(description='Run the ARAGOG dataset evaluation on a RAG pipeline')
parser.add_argument(
'--output_dir',
type=str,
help='The output directory for the results',
required=True
)
parser.add_argument('--sample', type=int, help='The number of questions to sample')
return parser.parse_args()


def main():
args = create_args()
questions, answers = read_question_answers()
parameter_tuning(questions, answers)

if args.sample:
random.seed(42)
# sample question/answer pairs together so each sampled question keeps its matching answer
sampled_pairs = random.sample(list(zip(questions, answers)), args.sample)
questions, answers = [q for q, _ in sampled_pairs], [a for _, a in sampled_pairs]

parameter_tuning(questions, answers, args.output_dir)


if __name__ == '__main__':
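With these changes the script takes its output directory from --output_dir and can optionally evaluate only a random subset of questions via --sample. A minimal way to exercise the new entry point, sketched under the assumption that it is run from the evaluations/ directory (so the relative ../datasets/ARAGOG/ path resolves) and that the results should go to a directory named aragog_results:

# equivalent to: python evaluation_aragog.py --output_dir aragog_results --sample 10
import random

questions, answers = read_question_answers()
random.seed(42)
pairs = random.sample(list(zip(questions, answers)), 10)  # keep each question paired with its answer
questions, answers = [q for q, _ in pairs], [a for _, a in pairs]
parameter_tuning(questions, answers, "aragog_results")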
248 changes: 248 additions & 0 deletions evaluations/evaluation_squad_rag.ipynb
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "3066ab55-72ac-4669-b303-8482877149f2",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import random\n",
"\n",
"from haystack import Document\n",
"\n",
"base_path = \"datasets/SQuAD-2.0/transformed_squad/\"\n",
"\n",
"def load_transformed_squad():\n",
" with open(base_path+\"questions.jsonl\", \"r\") as f:\n",
" questions = [json.loads(x) for x in f.readlines()]\n",
" for idx, question in enumerate(questions):\n",
" question[\"query_id\"] = f\"query_{idx}\"\n",
"\n",
" def create_document(text: str, name: str):\n",
" return Document(content=text, meta={\"name\": name})\n",
"\n",
" documents = []\n",
" for root, dirs, files in os.walk(base_path+\"articles\"):\n",
" for article in files:\n",
" with open(f\"{root}/{article}\", \"r\") as f:\n",
" article_text = f.read()\n",
" documents.append(create_document(article_text, article.replace(\".txt\", \"\")))\n",
"\n",
" return questions, documents"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b0efeffd-0256-470b-8467-2680795aa326",
"metadata": {},
"outputs": [],
"source": [
"from haystack import Pipeline, Document\n",
"from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
"from haystack.document_stores.types import DuplicatePolicy\n",
"from haystack.components.evaluators.document_recall import RecallMode\n",
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.components.preprocessors import DocumentSplitter\n",
"from haystack.components.writers import DocumentWriter\n",
"\n",
"def indexing(documents, embedding_model, chunk_size):\n",
" document_store = InMemoryDocumentStore()\n",
" doc_splitter = DocumentSplitter(split_by=\"sentence\", split_length=chunk_size)\n",
" doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n",
" doc_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)\n",
" ingestion_pipe = Pipeline()\n",
" ingestion_pipe.add_component(instance=doc_splitter, name=\"doc_splitter\")\n",
" ingestion_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n",
" ingestion_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n",
" ingestion_pipe.connect(\"doc_splitter.documents\", \"doc_embedder.documents\")\n",
" ingestion_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n",
" ingestion_pipe.run({\"doc_splitter\": {\"documents\": documents}})\n",
"\n",
" return document_store"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "257fb0bb-3d80-4b3c-b748-00ffa78e825a",
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"from architectures.basic_rag import basic_rag\n",
"\n",
"def run_basic_rag(doc_store, questions, embedding_model, top_k):\n",
"\n",
" rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)\n",
"\n",
" # predicted data\n",
" retrieved_docs = []\n",
" retrieved_contexts = []\n",
" predicted_answers = []\n",
"\n",
" for q in tqdm(questions):\n",
" response = rag.run(\n",
" data={\"query_embedder\": {\"text\": q},\n",
" \"prompt_builder\": {\"question\": q},\n",
" \"answer_builder\": {\"query\": q}}\n",
" )\n",
"\n",
" # gather response data\n",
" retrieved_docs.append(response[\"answer_builder\"][\"answers\"][0].documents)\n",
" retrieved_contexts.append([doc.content for doc in response[\"answer_builder\"][\"answers\"][0].documents])\n",
" predicted_answers.append(response[\"answer_builder\"][\"answers\"][0].data)\n",
"\n",
" return retrieved_docs, predicted_answers, retrieved_contexts\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e2a545b6-4fee-4277-b6dd-88675529ebb9",
"metadata": {},
"outputs": [],
"source": [
"def run_evaluation(embedding_model, ground_truth_docs, retrieved_docs, questions, predicted_answers, ground_truth_answers):\n",
" eval_pipeline = Pipeline()\n",
" eval_pipeline.add_component(\"doc_mrr\", DocumentMRREvaluator())\n",
" eval_pipeline.add_component(\"doc_map\", DocumentMAPEvaluator())\n",
" eval_pipeline.add_component(\"doc_recall_single_hit\", DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT))\n",
" eval_pipeline.add_component(\"doc_recall_multi_hit\", DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT))\n",
" eval_pipeline.add_component(\"answer_exact\", AnswerExactMatchEvaluator())\n",
" eval_pipeline.add_component(\"sas\", SASEvaluator(model=embedding_model))\n",
"\n",
" # get the original documents from the retrieved documents which were split\n",
" original_retrieved_docs = []\n",
" for doc in retrieved_docs:\n",
" original_docs = []\n",
" for split_doc in doc:\n",
" for original_doc in ground_truth_docs:\n",
" if split_doc.meta[\"name\"] == original_doc[0].meta[\"name\"]:\n",
" original_docs.append(original_doc[0])\n",
" original_retrieved_docs.append(original_docs)\n",
"\n",
" eval_pipeline_results = eval_pipeline.run(\n",
" {\n",
" \"doc_mrr\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs},\n",
" \"sas\": {\"predicted_answers\": predicted_answers, \"ground_truth_answers\": ground_truth_answers},\n",
" \"answer_exact\": {\"predicted_answers\": predicted_answers, \"ground_truth_answers\": ground_truth_answers},\n",
" \"doc_map\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs},\n",
" \"doc_recall_single_hit\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs},\n",
" \"doc_recall_multi_hit\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs}\n",
" }\n",
" )\n",
"\n",
" results = {\n",
" \"doc_mrr\": eval_pipeline_results['doc_mrr'],\n",
" \"sas\": eval_pipeline_results['sas'],\n",
" \"doc_map\": eval_pipeline_results['doc_map'],\n",
" \"doc_recall_single_hit\": eval_pipeline_results['doc_recall_single_hit'],\n",
" \"doc_recall_multi_hit\": eval_pipeline_results['doc_recall_multi_hit']\n",
" }\n",
"\n",
" inputs = {'questions': questions, 'true_answers': ground_truth_answers, 'predicted_answers': predicted_answers}\n",
"\n",
" return results, inputs\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5843fc9a-1100-4363-9bef-08039d85ed58",
"metadata": {},
"outputs": [],
"source": [
"def parameter_tuning(queries, documents):\n",
" \"\"\"\n",
" Run the basic RAG model with different parameters, and evaluate the results.\n",
"\n",
" The parameters to be tuned are: embedding model, top_k, and chunk_size.\n",
" \"\"\"\n",
" embedding_models = {\n",
" \"sentence-transformers/all-MiniLM-L6-v2\",\n",
" \"sentence-transformers/msmarco-distilroberta-base-v2\",\n",
" \"sentence-transformers/all-mpnet-base-v2\"\n",
" }\n",
" top_k_values = [1, 2, 3]\n",
" chunk_sizes = [5, 10, 15]\n",
"\n",
" # create results directory if it does not exist using Pathlib\n",
" out_path = Path(\"squad_results\")\n",
" out_path.mkdir(exist_ok=True)\n",
"\n",
" questions = []\n",
" ground_truth_answers = []\n",
" ground_truth_docs = []\n",
" for sample in queries:\n",
" questions.append(sample[\"question\"])\n",
" ground_truth_answers.append(sample[\"answers\"][\"text\"][0])\n",
" ground_truth_docs.append([doc for doc in documents if doc.meta[\"name\"] == sample[\"document\"]])\n",
"\n",
" for embedding_model in embedding_models:\n",
" for top_k in top_k_values:\n",
" for chunk_size in chunk_sizes:\n",
" name_params = f\"{embedding_model.split('/')[-1]}__top_k:{top_k}__chunk_size:{chunk_size}\"\n",
" print(name_params)\n",
" print(\"Indexing documents\")\n",
" doc_store = indexing(documents, embedding_model, chunk_size)\n",
" print(\"Running RAG pipeline\")\n",
" retrieved_docs, predicted_answers, retrieved_contexts = run_basic_rag(\n",
" doc_store, questions, embedding_model, top_k\n",
" )\n",
" print(f\"Running evaluation\")\n",
" results, inputs = run_evaluation(\n",
" embedding_model, ground_truth_docs, retrieved_docs, questions, predicted_answers,\n",
" ground_truth_answers\n",
" )\n",
" eval_results = EvaluationRunResult(run_name=name_params, inputs=inputs, results=results)\n",
" eval_results.score_report().to_csv(f\"{out_path}/score_report_{name_params}.csv\")\n",
" eval_results.to_pandas().to_csv(f\"{out_path}/detailed_{name_params}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "df58ac50-035c-4efb-8de3-7f32b3e1a27f",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"all_queries, documents = load_transformed_squad()\n",
"queries = random.sample(all_queries, 100) # take a sample of 100 questions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0593452-176b-4ba2-b923-3e5bef45d9c9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
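For every combination of embedding model, top_k, and chunk_size, the notebook writes an aggregate score_report_<params>.csv and a per-question detailed_<params>.csv into squad_results/. A small follow-up snippet for comparing runs after the grid search finishes; this is only a sketch, it assumes pandas is installed, and the exact columns depend on what EvaluationRunResult.score_report() emits:

import glob

import pandas as pd

# gather the aggregate score reports written by parameter_tuning()
frames = []
for path in sorted(glob.glob("squad_results/score_report_*.csv")):
    report = pd.read_csv(path)
    # tag each report with the parameter string encoded in its file name
    report["run"] = path.split("score_report_")[-1].removesuffix(".csv")
    frames.append(report)

summary = pd.concat(frames) if frames else pd.DataFrame()
print(summary)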
