WIP
davidsbatista committed May 13, 2024
1 parent a39e671 commit fe25ad7
Showing 7 changed files with 211 additions and 8 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -1,2 +1,18 @@
# haystack-evaluation
Using Haystack to benchmark different RAG architectures over different datasets


Use evaluation on the selected datasets to optimise parameters that are commonly tweaked in RAG pipelines:

- top_k
- chunk_size
- embedding model


Goal 1 is to give users practical guidance on which techniques to try out on their dataset/use case.

Goal 2 is to show that there is no “silver bullet” solution: the right choice depends on the dataset and use case, but Haystack can support them all.

Goal 3 is to showcase Haystack's advanced evaluation/experimentation API (the most advanced compared to competitors).

This is not a research paper, so it should not be too “academic” (i.e. not too restricted in the metrics or datasets used, and not meant to be peer-reviewed or submitted to an academic conference).
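
Below is a minimal sketch of what such a parameter sweep over the values above could look like. The `sweep` function, the `index_documents(chunk_size)` helper, and the parameter grid are illustrative assumptions and not part of this commit; the `basic_rag` builder and the way pipelines are run follow the evaluation scripts added here.

```python
from itertools import product

from haystack.components.evaluators import SASEvaluator

from architectures.basic_rag import basic_rag  # RAG pipeline builder used throughout this repo

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def sweep(questions, ground_truth_answers, index_documents):
    """Score a grid of (top_k, chunk_size) combinations with Semantic Answer Similarity.

    `index_documents(chunk_size)` is an assumed helper that (re-)indexes the corpus
    with the given chunk size and returns a document store.
    """
    scores = {}
    for top_k, chunk_size in product([1, 3, 5], [64, 128, 256]):  # illustrative grid
        doc_store = index_documents(chunk_size)
        rag = basic_rag(document_store=doc_store, embedding_model=EMBEDDING_MODEL, top_k=top_k)
        predicted = []
        for q in questions:
            out = rag.run(data={"query_embedder": {"text": q},
                                "prompt_builder": {"question": q},
                                "answer_builder": {"query": q}})
            predicted.append(out["answer_builder"]["answers"][0].data)
        sas = SASEvaluator(model=EMBEDDING_MODEL)
        sas.warm_up()
        result = sas.run(ground_truth_answers=ground_truth_answers, predicted_answers=predicted)
        scores[(top_k, chunk_size)] = result["score"]
    return scores
```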
6 changes: 2 additions & 4 deletions evaluations/arago_evaluation.py → arago_evaluation.py
@@ -17,7 +17,7 @@
from architectures.hyde_rag import rag_with_hyde

embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
files_path = "../datasets/ARAGOG/papers_for_questions"
files_path = "datasets/ARAGOG/papers_for_questions"


def indexing():
@@ -39,7 +39,7 @@ def indexing():


def read_question_answers():
with open("../datasets/ARAGOG/eval_questions.json", "r") as f:
with open("datasets/ARAGOG/eval_questions.json", "r") as f:
data = json.load(f)
questions = data["questions"]
answers = data["ground_truths"]
@@ -69,7 +69,6 @@ def run_basic_rag(doc_store, sample_questions, sample_answers):
"context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
"faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
"sas": sas.run(predicted_answers, sample_answers),
'predicted_answers': predicted_answers,
}
inputs = {'questions': sample_questions, "true_answers": sample_answers, "predicted_answers": predicted_answers}

@@ -93,7 +92,6 @@ def run_hyde_rag(doc_store, sample_questions, sample_answers):
    sas = SASEvaluator(model=embedding_model)
    sas.warm_up()
    results = {
        'predicted_answers': predicted_answers,
        "context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
        "faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
        "sas": sas.run(predicted_answers, sample_answers)
88 changes: 88 additions & 0 deletions architectures/custom_splitter.py
@@ -0,0 +1,88 @@
from copy import deepcopy
from typing import List, Tuple, Dict

from haystack import component, Document
from more_itertools import windowed


@component
class CustomDocumentSplitter:

    def __init__(
        self,
        split_length: int = 200,
        split_overlap: int = 0,
    ):

        self.split_by = "\n"
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
        if split_overlap < 0:
            raise ValueError("split_overlap must be greater than or equal to 0.")
        self.split_overlap = split_overlap

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):

        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            raise TypeError("DocumentSplitter expects a List of Documents as input.")

        split_docs = []
        for doc in documents:
            if doc.content is None:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content)
            text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
            split_docs += self._create_docs_from_splits(
                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str) -> List[str]:
        split_at = "\n"
        units = text.split(split_at)
        # Add the delimiter back to all units except the last one
        for i in range(len(units) - 1):
            units[i] += split_at
        return units

    def _concatenate_units(
        self, elements: List[str], split_length: int, split_overlap: int
    ) -> Tuple[List[str], List[int]]:

        text_splits = []
        splits_pages = []
        cur_page = 1
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
        for seg in segments:
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)
            if len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
            processed_units = current_units[: split_length - split_overlap]
            if self.split_by == "page":
                num_page_breaks = len(processed_units)
            else:
                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
            cur_page += num_page_breaks
        return text_splits, splits_pages

    @staticmethod
    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
        """
        Creates Document objects from text splits enriching them with page number and the metadata of the original document.
        """
        documents: List[Document] = []

        for i, txt in enumerate(text_splits):
            meta = deepcopy(meta)
            doc = Document(content=txt, meta=meta)
            doc.meta["page_number"] = splits_pages[i]
            documents.append(doc)
        return documents
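
A short usage sketch for the splitter above; the example text is made up and only the class defined in this file is assumed:

```python
from haystack import Document

# Chunks of 3 newline-delimited units, with 1 unit of overlap between consecutive chunks.
splitter = CustomDocumentSplitter(split_length=3, split_overlap=1)
docs = [Document(content="line 1\nline 2\nline 3\nline 4\nline 5")]

result = splitter.run(documents=docs)
for d in result["documents"]:
    print(repr(d.content), d.meta["page_number"], d.meta["source_id"])
```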
12 changes: 9 additions & 3 deletions datasets/datasets.md
@@ -1,10 +1,7 @@
# Datasets

ToDo:
- at least one should be financial or legal and raw data needs to be in structured pdfs
- at least one should be about support/help centre
- there should be one that has been used in other benchmarks (maybe based on wikipedia)


## SQuAD

@@ -23,4 +20,13 @@ ToDo:
- data type: PDF files
- source: https://github.com/predlico/ARAGOG
- paper: [ARAGOG: Advanced RAG Output Grading](https://arxiv.org/pdf/2404.01037)
- evaluation: [ContextRelevance](), [Faithfulness](), [Semantic Answer Similarity]()


## Mini ESG Bench
- domain: financial
- labels: answer, context
- data type: PDF files
- source:
- paper: [Mini ESG Bench Dataset](https://arxiv.org/abs/2404.01037)
- evaluation: [ContextRelevance](), [Faithfulness](), [Semantic Answer Similarity]()
Empty file removed evaluations/evaluation.md
Empty file.
95 changes: 95 additions & 0 deletions mini_esg_evaluation.py
@@ -0,0 +1,95 @@
import json
import os
import random

from haystack import Pipeline, component
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.evaluation import EvaluationRunResult
from tqdm import tqdm

from architectures.basic_rag import basic_rag

embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
files_path = "datasets/MiniESGBench/"


def indexing():
    document_store = InMemoryDocumentStore()
    pipeline = Pipeline()
    pipeline.add_component("converter", PyPDFToDocument())
    pipeline.add_component("cleaner", DocumentCleaner())
    pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=128))
    pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
    pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(embedding_model))
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    pdf_files = [files_path + "source_files/" + f_name for f_name in os.listdir(files_path + "source_files/")]
    pipeline.run({"converter": {"sources": pdf_files}})

    return document_store


def read_question_answers():
    with open(files_path + "rag_dataset.json", "r") as f:
        data = json.load(f)
        questions = []
        contexts = []
        answers = []
        for entry in data["examples"]:
            questions.append(entry["query"])
            contexts.append(entry["reference_contexts"])
            answers.append(entry["reference_answer"])

    return questions, contexts, answers


def run_basic_rag(doc_store, questions_sample, answers_sample, contexts_sample):
    """
    A function to run the basic RAG pipeline on a set of sample questions and answers.
    """

    rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=2)

    predicted_answers = []
    retrieved_contexts = []
    for q in tqdm(questions_sample):
        response = rag.run(
            data={"query_embedder": {"text": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}})
        predicted_answers.append(response["answer_builder"]["answers"][0].data)
        retrieved_contexts.append([d.content for d in response["answer_builder"]["answers"][0].documents])

    context_relevance = ContextRelevanceEvaluator()
    faithfulness = FaithfulnessEvaluator()
    sas = SASEvaluator(model=embedding_model)
    sas.warm_up()
    results = {
        "context_relevance": context_relevance.run(questions_sample, retrieved_contexts),
        "faithfulness": faithfulness.run(questions_sample, retrieved_contexts, predicted_answers),
        "sas": sas.run(predicted_answers, answers_sample),
    }
    inputs = {"questions": questions_sample, "true_answers": answers_sample, "predicted_answers": predicted_answers}

    return EvaluationRunResult(run_name="basic_rag", inputs=inputs, results=results)


def main():
    doc_store = indexing()
    questions, contexts, answers = read_question_answers()

    # Sample the same indices for questions, contexts and answers so the triples stay aligned.
    limit = 5
    sample_indices = random.sample(range(len(questions)), limit)
    questions_sample = [questions[i] for i in sample_indices]
    contexts_sample = [contexts[i] for i in sample_indices]
    answers_sample = [answers[i] for i in sample_indices]

    basic_rag_results = run_basic_rag(doc_store, questions_sample, answers_sample, contexts_sample)


if __name__ == "__main__":
    main()
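
A hedged sketch of how the EvaluationRunResult returned by run_basic_rag() could be inspected once the WIP main() is completed; score_report() and comparative_individual_scores_report() are assumed from Haystack's evaluation API and are not used in this commit:

```python
# Assumes `basic_rag_results` is the EvaluationRunResult built in main() above,
# and optionally a second run (e.g. a HyDE pipeline) evaluated the same way.
print(basic_rag_results.score_report())  # aggregate score per metric (context relevance, faithfulness, SAS)

# For comparing two architectures question by question:
# print(basic_rag_results.comparative_individual_scores_report(hyde_rag_results))
```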
2 changes: 1 addition & 1 deletion evaluations/squad_evaluation.py → squad_evaluation.py
@@ -23,7 +23,7 @@
from architectures.hyde_rag import rag_with_hyde

embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
base_path = "../datasets/SQuAD-2.0/transformed_squad/"
base_path = "datasets/SQuAD-2.0/transformed_squad/"


def load_transformed_squad():
