
Commit

cleaning and fixing aragog script and adding squad rag notebook
davidsbatista committed May 30, 2024
1 parent 993514a commit 62c3f6c
Showing 2 changed files with 298 additions and 16 deletions.
66 changes: 50 additions & 16 deletions evaluations/evaluation_aragog.py
@@ -1,5 +1,7 @@
import argparse
import json
import os
import random
from pathlib import Path
from typing import Tuple, List

@@ -17,12 +19,15 @@
from tqdm import tqdm

from architectures.basic_rag import basic_rag
from utils import timeit
from utils.utils import timeit

base_path = "../datasets/ARAGOG/"


@timeit
def indexing(embedding_model: str, chunk_size: int):
files_path = "datasets/ARAGOG/papers_for_questions"
full_path = Path(base_path)
files_path = full_path / "papers_for_questions"
document_store = InMemoryDocumentStore()
pipeline = Pipeline()
pipeline.add_component("converter", PyPDFToDocument())
@@ -34,14 +39,14 @@ def indexing(embedding_model: str, chunk_size: int):
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")
pdf_files = [files_path+"/"+f_name for f_name in os.listdir(files_path)]
pdf_files = [full_path / "papers_for_questions" / f_name for f_name in os.listdir(files_path)]
pipeline.run({"converter": {"sources": pdf_files}})

return document_store


def read_question_answers() -> Tuple[List[str], List[str]]:
with open("datasets/ARAGOG/eval_questions.json", "r") as f:
with open(base_path + "eval_questions.json", "r") as f:
data = json.load(f)
questions = data["questions"]
answers = data["ground_truths"]
@@ -75,23 +80,33 @@ def run_basic_rag(doc_store, sample_questions, embedding_model, top_k):

@timeit
def run_evaluation(sample_questions, sample_answers, retrieved_contexts, predicted_answers, embedding_model):
context_relevance = ContextRelevanceEvaluator(raise_on_failure=False)
faithfulness = FaithfulnessEvaluator(raise_on_failure=False)
sas = SASEvaluator(model=embedding_model)
sas.warm_up()
eval_pipeline = Pipeline()
eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator(raise_on_failure=False))
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator(raise_on_failure=False))
eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))

eval_pipeline_results = eval_pipeline.run(
{
"context_relevance": {"questions": sample_questions, "contexts": retrieved_contexts},
"faithfulness": {
"questions": sample_questions, "contexts": retrieved_contexts, "predicted_answers": predicted_answers
},
"sas": {"predicted_answers": predicted_answers, "ground_truth_answers": sample_answers},
}
)

results = {
"context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
"faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
"sas": sas.run(predicted_answers, sample_answers),
"context_relevance": eval_pipeline_results['context_relevance'],
"faithfulness": eval_pipeline_results['faithfulness'],
"sas": eval_pipeline_results['sas']
}

inputs = {'questions': sample_questions, "true_answers": sample_answers, "predicted_answers": predicted_answers}
inputs = {'questions': sample_questions, 'true_answers': sample_answers, 'predicted_answers': predicted_answers}

return results, inputs


def parameter_tuning(questions, answers):
def parameter_tuning(questions, answers, out_path: str):
"""
Run the basic RAG model with different parameters, and evaluate the results.
@@ -106,12 +121,12 @@ def parameter_tuning(questions, answers):
chunk_sizes = [64, 128, 256]

# create results directory if it does not exist using Pathlib
out_path = Path("aragog_results")
out_path = Path(out_path)
out_path.mkdir(exist_ok=True)

for embedding_model in embedding_models:
for chunk_size in chunk_sizes:
print("Indexing documents")
print(f"Indexing documents with {embedding_model} model with a chunk_size={chunk_size}")
doc_store = indexing(embedding_model, chunk_size)
for top_k in top_k_values:
name_params = f"{embedding_model.split('/')[-1]}__top_k:{top_k}__chunk_size:{chunk_size}"
@@ -125,9 +140,28 @@ def parameter_tuning(questions, answers):
eval_results.to_pandas().to_csv(f"{out_path}/detailed_{name_params}.csv", index=False)


def create_args():
parser = argparse.ArgumentParser(description='Run the ARAGOG dataset evaluation on a RAG pipeline')
parser.add_argument(
'--output_dir',
type=str,
help='The output directory for the results',
required=True
)
parser.add_argument('--sample', type=int, help='The number of questions to sample')
return parser.parse_args()


def main():
args = create_args()
questions, answers = read_question_answers()
parameter_tuning(questions, answers)

if args.sample:
random.seed(42)
# sample question/answer pairs together so each sampled question keeps its matching answer
sampled_pairs = random.sample(list(zip(questions, answers)), args.sample)
questions, answers = [q for q, _ in sampled_pairs], [a for _, a in sampled_pairs]

parameter_tuning(questions, answers, args.output_dir)


if __name__ == '__main__':
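With these changes the script takes its output directory from --output_dir and can optionally evaluate only a random subset of questions via --sample. A minimal way to exercise the new entry point, sketched under the assumption that it is run from the evaluations/ directory (so the relative ../datasets/ARAGOG/ path resolves) and that the results should go to a directory named aragog_results:

# equivalent to: python evaluation_aragog.py --output_dir aragog_results --sample 10
import random

questions, answers = read_question_answers()
random.seed(42)
pairs = random.sample(list(zip(questions, answers)), 10)  # keep each question paired with its answer
questions, answers = [q for q, _ in pairs], [a for _, a in pairs]
parameter_tuning(questions, answers, "aragog_results")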
248 changes: 248 additions & 0 deletions evaluations/evaluation_squad_rag.ipynb
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "3066ab55-72ac-4669-b303-8482877149f2",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import random\n",
"\n",
"from haystack import Document\n",
"\n",
"base_path = \"datasets/SQuAD-2.0/transformed_squad/\"\n",
"\n",
"def load_transformed_squad():\n",
" with open(base_path+\"questions.jsonl\", \"r\") as f:\n",
" questions = [json.loads(x) for x in f.readlines()]\n",
" for idx, question in enumerate(questions):\n",
" question[\"query_id\"] = f\"query_{idx}\"\n",
"\n",
" def create_document(text: str, name: str):\n",
" return Document(content=text, meta={\"name\": name})\n",
"\n",
" documents = []\n",
" for root, dirs, files in os.walk(base_path+\"articles\"):\n",
" for article in files:\n",
" with open(f\"{root}/{article}\", \"r\") as f:\n",
" article_text = f.read()\n",
" documents.append(create_document(article_text, article.replace(\".txt\", \"\")))\n",
"\n",
" return questions, documents"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b0efeffd-0256-470b-8467-2680795aa326",
"metadata": {},
"outputs": [],
"source": [
"from haystack import Pipeline, Document\n",
"from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
"from haystack.document_stores.types import DuplicatePolicy\n",
"from haystack.components.evaluators.document_recall import RecallMode\n",
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.components.preprocessors import DocumentSplitter\n",
"from haystack.components.writers import DocumentWriter\n",
"\n",
"def indexing(documents, embedding_model, chunk_size):\n",
" document_store = InMemoryDocumentStore()\n",
" doc_splitter = DocumentSplitter(split_by=\"sentence\", split_length=chunk_size)\n",
" doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n",
" doc_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)\n",
" ingestion_pipe = Pipeline()\n",
" ingestion_pipe.add_component(instance=doc_splitter, name=\"doc_splitter\")\n",
" ingestion_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n",
" ingestion_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n",
" ingestion_pipe.connect(\"doc_splitter.documents\", \"doc_embedder.documents\")\n",
" ingestion_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n",
" ingestion_pipe.run({\"doc_splitter\": {\"documents\": documents}})\n",
"\n",
" return document_store"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "257fb0bb-3d80-4b3c-b748-00ffa78e825a",
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"from architectures.basic_rag import basic_rag\n",
"\n",
"def run_basic_rag(doc_store, questions, embedding_model, top_k):\n",
"\n",
" rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)\n",
"\n",
" # predicted data\n",
" retrieved_docs = []\n",
" retrieved_contexts = []\n",
" predicted_answers = []\n",
"\n",
" for q in tqdm(questions):\n",
" response = rag.run(\n",
" data={\"query_embedder\": {\"text\": q},\n",
" \"prompt_builder\": {\"question\": q},\n",
" \"answer_builder\": {\"query\": q}}\n",
" )\n",
"\n",
" # gather response data\n",
" retrieved_docs.append(response[\"answer_builder\"][\"answers\"][0].documents)\n",
" retrieved_contexts.append([doc.content for doc in response[\"answer_builder\"][\"answers\"][0].documents])\n",
" predicted_answers.append(response[\"answer_builder\"][\"answers\"][0].data)\n",
"\n",
" return retrieved_docs, predicted_answers, retrieved_contexts\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e2a545b6-4fee-4277-b6dd-88675529ebb9",
"metadata": {},
"outputs": [],
"source": [
"def run_evaluation(embedding_model, ground_truth_docs, retrieved_docs, questions, predicted_answers, ground_truth_answers):\n",
" eval_pipeline = Pipeline()\n",
" eval_pipeline.add_component(\"doc_mrr\", DocumentMRREvaluator())\n",
" eval_pipeline.add_component(\"doc_map\", DocumentMAPEvaluator())\n",
" eval_pipeline.add_component(\"doc_recall_single_hit\", DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT))\n",
" eval_pipeline.add_component(\"doc_recall_multi_hit\", DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT))\n",
" eval_pipeline.add_component(\"answer_exact\", AnswerExactMatchEvaluator())\n",
" eval_pipeline.add_component(\"sas\", SASEvaluator(model=embedding_model))\n",
"\n",
" # get the original documents from the retrieved documents which were split\n",
" original_retrieved_docs = []\n",
" for doc in retrieved_docs:\n",
" original_docs = []\n",
" for split_doc in doc:\n",
" for original_doc in ground_truth_docs:\n",
" if split_doc.meta[\"name\"] == original_doc[0].meta[\"name\"]:\n",
" original_docs.append(original_doc[0])\n",
" original_retrieved_docs.append(original_docs)\n",
"\n",
" eval_pipeline_results = eval_pipeline.run(\n",
" {\n",
" \"doc_mrr\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs},\n",
" \"sas\": {\"predicted_answers\": predicted_answers, \"ground_truth_answers\": ground_truth_answers},\n",
" \"answer_exact\": {\"predicted_answers\": predicted_answers, \"ground_truth_answers\": ground_truth_answers},\n",
" \"doc_map\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs},\n",
" \"doc_recall_single_hit\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs},\n",
" \"doc_recall_multi_hit\": {\"ground_truth_documents\": ground_truth_docs, \"retrieved_documents\": original_retrieved_docs}\n",
" }\n",
" )\n",
"\n",
" results = {\n",
" \"doc_mrr\": eval_pipeline_results['doc_mrr'],\n",
" \"sas\": eval_pipeline_results['sas'],\n",
" \"doc_map\": eval_pipeline_results['doc_map'],\n",
" \"doc_recall_single_hit\": eval_pipeline_results['doc_recall_single_hit'],\n",
" \"doc_recall_multi_hit\": eval_pipeline_results['doc_recall_multi_hit']\n",
" }\n",
"\n",
" inputs = {'questions': questions, 'true_answers': ground_truth_answers, 'predicted_answers': predicted_answers}\n",
"\n",
" return results, inputs\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5843fc9a-1100-4363-9bef-08039d85ed58",
"metadata": {},
"outputs": [],
"source": [
"def parameter_tuning(queries, documents):\n",
" \"\"\"\n",
" Run the basic RAG model with different parameters, and evaluate the results.\n",
"\n",
" The parameters to be tuned are: embedding model, top_k, and chunk_size.\n",
" \"\"\"\n",
" embedding_models = {\n",
" \"sentence-transformers/all-MiniLM-L6-v2\",\n",
" \"sentence-transformers/msmarco-distilroberta-base-v2\",\n",
" \"sentence-transformers/all-mpnet-base-v2\"\n",
" }\n",
" top_k_values = [1, 2, 3]\n",
" chunk_sizes = [5, 10, 15]\n",
"\n",
" # create results directory if it does not exist using Pathlib\n",
" out_path = Path(\"squad_results\")\n",
" out_path.mkdir(exist_ok=True)\n",
"\n",
" questions = []\n",
" ground_truth_answers = []\n",
" ground_truth_docs = []\n",
" for sample in queries:\n",
" questions.append(sample[\"question\"])\n",
" ground_truth_answers.append(sample[\"answers\"][\"text\"][0])\n",
" ground_truth_docs.append([doc for doc in documents if doc.meta[\"name\"] == sample[\"document\"]])\n",
"\n",
" for embedding_model in embedding_models:\n",
" for top_k in top_k_values:\n",
" for chunk_size in chunk_sizes:\n",
" name_params = f\"{embedding_model.split('/')[-1]}__top_k:{top_k}__chunk_size:{chunk_size}\"\n",
" print(name_params)\n",
" print(\"Indexing documents\")\n",
" doc_store = indexing(documents, embedding_model, chunk_size)\n",
" print(\"Running RAG pipeline\")\n",
" retrieved_docs, predicted_answers, retrieved_contexts = run_basic_rag(\n",
" doc_store, questions, embedding_model, top_k\n",
" )\n",
" print(f\"Running evaluation\")\n",
" results, inputs = run_evaluation(\n",
" embedding_model, ground_truth_docs, retrieved_docs, questions, predicted_answers,\n",
" ground_truth_answers\n",
" )\n",
" eval_results = EvaluationRunResult(run_name=name_params, inputs=inputs, results=results)\n",
" eval_results.score_report().to_csv(f\"{out_path}/score_report_{name_params}.csv\")\n",
" eval_results.to_pandas().to_csv(f\"{out_path}/detailed_{name_params}.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "df58ac50-035c-4efb-8de3-7f32b3e1a27f",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"all_queries, documents = load_transformed_squad()\n",
"queries = random.sample(all_queries, 100) # take a sample of 100 questions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0593452-176b-4ba2-b923-3e5bef45d9c9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
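For every combination of embedding model, top_k, and chunk_size, the notebook writes an aggregate score_report_<params>.csv and a per-question detailed_<params>.csv into squad_results/. A small follow-up snippet for comparing runs after the grid search finishes; this is only a sketch, it assumes pandas is installed, and the exact columns depend on what EvaluationRunResult.score_report() emits:

import glob

import pandas as pd

# gather the aggregate score reports written by parameter_tuning()
frames = []
for path in sorted(glob.glob("squad_results/score_report_*.csv")):
    report = pd.read_csv(path)
    # tag each report with the parameter string encoded in its file name
    report["run"] = path.split("score_report_")[-1].removesuffix(".csv")
    frames.append(report)

summary = pd.concat(frames) if frames else pd.DataFrame()
print(summary)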
