diff --git a/.gitignore b/.gitignore
index 68bc17f..5243b13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,7 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# MacOS
+.DS_Store
+*/.DS_Store
diff --git a/datasets/datasets.md b/datasets/datasets.md
index 01bd0c7..8468679 100644
--- a/datasets/datasets.md
+++ b/datasets/datasets.md
@@ -2,24 +2,25 @@
 ToDo:
 
 - at least one should be financial or legal and raw data needs to be in structured pdfs
-- at least one should be about support/help centre
+- at least one should be about support/help centre
 - there should be one that has been used in other benchmarks (maybe based on wikipedia)
-- they should all have a set of labels so that we can get performance metrics from them
 
 ## SQuAD
 
 - domain: wikipedia
 - labels: answer, documents
+- data type: text files
 - source: https://huggingface.co/datasets/squad
 - paper: [SQuAD: 100,000+ Questions for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250)
 - website: [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)
-- evaluation:
+- evaluation: [ContextRelevance](), [Faithfulness](), [Semantic Answer Similarity](), [DocumentMRR](), [DocumentMAP](), [DocumentRecall]()
 
 ## ARAGOG
 
 - domain: a collection of AI/LLM-ArXiv papers
 - labels: answer
+- data type: PDF files
 - source: https://github.com/predlico/ARAGOG
 - paper: [ARAGOG: Advanced RAG Output Grading](https://arxiv.org/pdf/2404.01037)
 - evaluation: [ContextRelevance](), [Faithfulness](), [Semantic Answer Similarity]()
\ No newline at end of file
diff --git a/arago_evaluation.py b/evaluations/arago_evaluation.py
similarity index 93%
rename from arago_evaluation.py
rename to evaluations/arago_evaluation.py
index 812d355..1a24fce 100644
--- a/arago_evaluation.py
+++ b/evaluations/arago_evaluation.py
@@ -17,7 +17,7 @@ from architectures.hyde_rag import rag_with_hyde
 
 embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
-files_path = "datasets/ARAGOG/papers_for_questions"
+files_path = "../datasets/ARAGOG/papers_for_questions"
 
 
 def indexing():
@@ -39,7 +39,7 @@ def indexing():
 
 
 def read_question_answers():
-    with open("datasets/ARAGOG/eval_questions.json", "r") as f:
+    with open("../datasets/ARAGOG/eval_questions.json", "r") as f:
         data = json.load(f)
     questions = data["questions"]
     answers = data["ground_truths"]
@@ -71,7 +71,7 @@ def run_basic_rag(doc_store, sample_questions, sample_answers):
         "sas": sas.run(predicted_answers, sample_answers),
         'predicted_answers': predicted_answers,
     }
-    inputs = {'questions': sample_questions}
+    inputs = {'questions': sample_questions, "true_answers": sample_answers, "predicted_answers": predicted_answers}
 
     return EvaluationRunResult(run_name="basic_rag", inputs=inputs, results=results)
@@ -98,7 +98,7 @@ def run_hyde_rag(doc_store, sample_questions, sample_answers):
         "faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
         "sas": sas.run(predicted_answers, sample_answers)
     }
-    inputs = {'questions': sample_questions}
+    inputs = {'questions': sample_questions, "true_answers": sample_answers, "predicted_answers": predicted_answers}
 
     return EvaluationRunResult(run_name="hyde_rag", inputs=inputs, results=results)
diff --git a/evaluations/evaluation.md b/evaluations/evaluation.md
new file mode 100644
index 0000000..e69de29
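Note on the `inputs` change in `arago_evaluation.py` above: `EvaluationRunResult` builds its reports from `inputs` together with `results`, so carrying `true_answers` and `predicted_answers` in `inputs` puts both answer columns next to each metric in the per-question output. A minimal sketch of the mechanics with toy values, assuming the Haystack 2.x `EvaluationRunResult` API (`score_report`, `to_pandas`):

# sketch with toy data, not part of the patch: EvaluationRunResult renders its
# reports from `inputs` plus `results`, so including the answers in `inputs`
# makes them visible per question
from haystack.evaluation import EvaluationRunResult

inputs = {
    "questions": ["What does HyDE generate before retrieval?"],
    "true_answers": ["A hypothetical document"],
    "predicted_answers": ["A hypothetical answer document"],
}
# each results entry pairs an aggregate score with one score per input row
results = {"sas": {"score": 0.87, "individual_scores": [0.87]}}

run = EvaluationRunResult(run_name="basic_rag", inputs=inputs, results=results)
print(run.score_report())  # aggregated metrics, one row per evaluator
print(run.to_pandas())     # per-question rows, now including both answer columns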
diff --git a/evaluations/squad_evaluation.py b/evaluations/squad_evaluation.py
new file mode 100644
index 0000000..e179fb4
--- /dev/null
+++ b/evaluations/squad_evaluation.py
@@ -0,0 +1,204 @@
+import json
+import os
+import random
+from typing import List
+
+from haystack import Pipeline, Document
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder
+from haystack.components.evaluators import (
+    DocumentMRREvaluator,
+    DocumentMAPEvaluator,
+    DocumentRecallEvaluator,
+    FaithfulnessEvaluator,
+    SASEvaluator
+)
+from haystack.components.evaluators.document_recall import RecallMode
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.document_stores.types import DuplicatePolicy
+from haystack.evaluation import EvaluationRunResult
+from tqdm import tqdm
+
+from architectures.basic_rag import basic_rag
+from architectures.hyde_rag import rag_with_hyde
+
+embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+base_path = "../datasets/SQuAD-2.0/transformed_squad/"
+
+
+def load_transformed_squad():
+    with open(base_path+"questions.jsonl", "r") as f:
+        questions = [json.loads(x) for x in f.readlines()]
+    for idx, question in enumerate(questions):
+        question["query_id"] = f"query_{idx}"
+
+    def create_document(text: str, name: str):
+        return Document(content=text, meta={"name": name})
+
+    # walk through the files in the directory and transform each line of each text file into a Document
+    documents = []
+    for root, dirs, files in os.walk(base_path):
+        for article in files:
+            with open(f"{root}/{article}", "r") as f:
+                raw_texts = f.read().split("\n")
+            for text in raw_texts:
+                documents.append(create_document(text, article.replace(".txt", "")))
+
+    return questions, documents
+
+
+def indexing(documents: List[Document]):
+    document_store = InMemoryDocumentStore()
+    doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
+    doc_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)
+    ingestion_pipe = Pipeline()
+    ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
+    ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
+    ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
+    ingestion_pipe.run({"doc_embedder": {"documents": documents}})
+
+    return document_store
+
+
+def run_basic_rag(doc_store, samples):
+
+    rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=3)
+
+    # ground truth data
+    questions = []
+    ground_truth_docs = []
+    ground_truth_answers = []
+
+    # predicted data
+    retrieved_docs = []
+    predicted_contexts = []
+    predicted_answers = []
+
+    for sample in tqdm(samples):
+        q = sample["question"]
+        answer = sample["answers"]["text"]
+        ground_truth_documents = [doc for doc in doc_store.storage.values() if doc.meta["name"] == sample["document"]]
+        response = rag.run(
+            data={"query_embedder": {"text": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}}
+        )
+
+        # gather ground truth data
+        ground_truth_docs.append(ground_truth_documents)
+        ground_truth_answers.append(answer[0])
+        questions.append(q)
+
+        # gather response data
+        retrieved_docs.append(response["answer_builder"]["answers"][0].documents)
+        predicted_contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents])
+        predicted_answers.append(response["answer_builder"]["answers"][0].data)
+
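+    # the evaluators below consume the lists gathered above: doc_mrr, doc_map
+    # and the two recall modes compare retrieved_docs against ground_truth_docs;
+    # faithfulness uses an LLM to judge predicted_answers against the retrieved
+    # contexts; sas embeds predicted and ground-truth answers with
+    # `embedding_model` and scores their semantic similarity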
+    eval_pipeline = Pipeline()
+    eval_pipeline.add_component("doc_mrr", DocumentMRREvaluator())
+    eval_pipeline.add_component("doc_map", DocumentMAPEvaluator())
+    eval_pipeline.add_component("doc_recall_single_hit", DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT))
+    eval_pipeline.add_component("doc_recall_multi_hit", DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT))
+    eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator())
+    eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))
+
+    eval_pipeline_results = eval_pipeline.run(
+        {
+            "doc_mrr": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs},
+            "faithfulness": {"questions": questions, "contexts": predicted_contexts, "predicted_answers": predicted_answers},
+            "sas": {"predicted_answers": predicted_answers, "ground_truth_answers": ground_truth_answers},
+            "doc_map": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs},
+            "doc_recall_single_hit": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs},
+            "doc_recall_multi_hit": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs}
+        }
+    )
+
+    results = {
+        "doc_mrr": eval_pipeline_results['doc_mrr'],
+        "faithfulness": eval_pipeline_results['faithfulness'],
+        "sas": eval_pipeline_results['sas'],
+        "doc_map": eval_pipeline_results['doc_map'],
+        "doc_recall_single_hit": eval_pipeline_results['doc_recall_single_hit'],
+        "doc_recall_multi_hit": eval_pipeline_results['doc_recall_multi_hit']
+    }
+
+    inputs = {'questions': questions, 'true_answers': ground_truth_answers, 'predicted_answers': predicted_answers}
+
+    return EvaluationRunResult(run_name="basic_rag", inputs=inputs, results=results)
+
+
+def run_hyde_rag(doc_store, samples):
+
+    hyde_rag = rag_with_hyde(document_store=doc_store, embedding_model=embedding_model, top_k=3)
+
+    # ground truth data
+    questions = []
+    ground_truth_docs = []
+    ground_truth_answers = []
+
+    # predicted data
+    retrieved_docs = []
+    predicted_contexts = []
+    predicted_answers = []
+
+    for sample in tqdm(samples):
+        q = sample["question"]
+        answer = sample["answers"]["text"]
+        ground_truth_documents = [doc for doc in doc_store.storage.values() if doc.meta["name"] == sample["document"]]
+        response = hyde_rag.run(
+            data={"hyde": {"query": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}}
+        )
+
+        # gather ground truth data
+        ground_truth_docs.append(ground_truth_documents)
+        ground_truth_answers.append(answer[0])
+        questions.append(q)
+
+        # gather response data
+        retrieved_docs.append(response["answer_builder"]["answers"][0].documents)
+        predicted_contexts.append([doc.content for doc in response["answer_builder"]["answers"][0].documents])
+        predicted_answers.append(response["answer_builder"]["answers"][0].data)
+
+    eval_pipeline = Pipeline()
+    eval_pipeline.add_component("doc_mrr", DocumentMRREvaluator())
+    eval_pipeline.add_component("doc_map", DocumentMAPEvaluator())
+    eval_pipeline.add_component("doc_recall_single_hit", DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT))
+    eval_pipeline.add_component("doc_recall_multi_hit", DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT))
+    eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator())
+    eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))
+
+    eval_pipeline_results = eval_pipeline.run(
+        {
+            "doc_mrr": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs},
+            "faithfulness": {"questions": questions, "contexts": predicted_contexts, "predicted_answers": predicted_answers},
+            "sas": {"predicted_answers": predicted_answers, "ground_truth_answers": ground_truth_answers},
predicted_answers, "ground_truth_answers": ground_truth_answers}, + "doc_map": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs}, + "doc_recall_single_hit": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs}, + "doc_recall_multi_hit": {"ground_truth_documents": ground_truth_docs, "retrieved_documents": retrieved_docs} + } + ) + + results = { + "doc_mrr": eval_pipeline_results['doc_mrr'], + "faithfulness": eval_pipeline_results['faithfulness'], + "sas": eval_pipeline_results['sas'], + "doc_map": eval_pipeline_results['doc_map'], + "doc_recall_single_hit": eval_pipeline_results['doc_recall_single_hit'], + "doc_recall_multi_hit": eval_pipeline_results['doc_recall_multi_hit'] + } + + inputs = {'questions': questions, 'true_answers': ground_truth_answers, 'predicted_answers': predicted_answers} + + return EvaluationRunResult(run_name="hyde_rag", inputs=inputs, results=results) + + +def main(): + + all_questions, documents = load_transformed_squad() + doc_store = indexing(documents) + + limit = 10 + samples = random.sample(all_questions, limit) + + basic_rag_results = run_basic_rag(doc_store, samples) + hyde_rag_results = run_hyde_rag(doc_store, samples) + + comparative_df = basic_rag_results.comparative_individual_scores_report(hyde_rag_results) diff --git a/squad_evaluation.py b/squad_evaluation.py deleted file mode 100644 index 1f30849..0000000 --- a/squad_evaluation.py +++ /dev/null @@ -1,123 +0,0 @@ -import json -import os -from typing import List - -from haystack import Pipeline, Document -from haystack.components.embedders import SentenceTransformersDocumentEmbedder -from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator -from haystack.components.writers import DocumentWriter -from haystack.document_stores.types import DuplicatePolicy -from haystack.evaluation import EvaluationRunResult -from tqdm import tqdm - -from architectures.basic_rag import basic_rag -from architectures.hyde_rag import rag_with_hyde - -embedding_model = "sentence-transformers/all-MiniLM-L6-v2" -base_path = "datasets/SQuAD-2.0/transformed_squad/" - - -def load_transformed_squad(): - with open(base_path+"questions.jsonl", "r") as f: - questions = [json.loads(x) for x in f.readlines()] - for idx, question in enumerate(questions): - question["query_id"] = f"query_{idx}" - - def create_document(text: str, name: str): - return Document(content=text, meta={"name": name}) - - # walk through the files in the directory and transform each line of each text file into a Document - documents = [] - for root, dirs, files in os.walk(base_path): - for article in files: - with open(f"{root}/{article}", "r") as f: - raw_texts = f.read().split("\n") - for text in raw_texts: - documents.append(create_document(text, article.replace(".txt", ""))) - - return questions, documents - - -def indexing(documents: List[Document]): - document_store = InMemoryDocumentStore() - doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP) - doc_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model) - ingestion_pipe = Pipeline() - ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") - ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") - ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents") - ingestion_pipe.run({"doc_embedder": {"documents": documents}}) - - return 
diff --git a/squad_evaluation.py b/squad_evaluation.py
deleted file mode 100644
index 1f30849..0000000
--- a/squad_evaluation.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import json
-import os
-from typing import List
-
-from haystack import Pipeline, Document
-from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator
-from haystack.components.writers import DocumentWriter
-from haystack.document_stores.types import DuplicatePolicy
-from haystack.evaluation import EvaluationRunResult
-from tqdm import tqdm
-
-from architectures.basic_rag import basic_rag
-from architectures.hyde_rag import rag_with_hyde
-
-embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
-base_path = "datasets/SQuAD-2.0/transformed_squad/"
-
-
-def load_transformed_squad():
-    with open(base_path+"questions.jsonl", "r") as f:
-        questions = [json.loads(x) for x in f.readlines()]
-    for idx, question in enumerate(questions):
-        question["query_id"] = f"query_{idx}"
-
-    def create_document(text: str, name: str):
-        return Document(content=text, meta={"name": name})
-
-    # walk through the files in the directory and transform each line of each text file into a Document
-    documents = []
-    for root, dirs, files in os.walk(base_path):
-        for article in files:
-            with open(f"{root}/{article}", "r") as f:
-                raw_texts = f.read().split("\n")
-            for text in raw_texts:
-                documents.append(create_document(text, article.replace(".txt", "")))
-
-    return questions, documents
-
-
-def indexing(documents: List[Document]):
-    document_store = InMemoryDocumentStore()
-    doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
-    doc_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)
-    ingestion_pipe = Pipeline()
-    ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
-    ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
-    ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
-    ingestion_pipe.run({"doc_embedder": {"documents": documents}})
-
-    return document_store
-
-
-def run_basic_rag(doc_store, sample_questions, sample_answers):
-    """
-    A function to run the basic rag model on a set of sample questions and answers
-    """
-
-    rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=3)
-
-    predicted_answers = []
-    retrieved_contexts = []
-    for q in tqdm(sample_questions):
-        response = rag.run(
-            data={"query_embedder": {"text": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}})
-        predicted_answers.append(response["answer_builder"]["answers"][0].data)
-        retrieved_contexts.append([d.content for d in response['answer_builder']['answers'][0].documents])
-
-    context_relevance = ContextRelevanceEvaluator()
-    faithfulness = FaithfulnessEvaluator()
-    sas = SASEvaluator(model=embedding_model)
-    sas.warm_up()
-    results = {
-        "context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
-        "faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
-        "sas": sas.run(predicted_answers, sample_answers)
-    }
-    inputs = {'questions': sample_questions, 'answers': predicted_answers}
-
-    return EvaluationRunResult(run_name="basic_rag", inputs=inputs, results=results)
-
-
-def run_hyde_rag(doc_store, sample_questions, sample_answers):
-
-    hyde_rag = rag_with_hyde(document_store=doc_store, embedding_model=embedding_model, top_k=3)
-
-    predicted_answers = []
-    retrieved_contexts = []
-    for q in tqdm(sample_questions):
-        response = hyde_rag.run(
-            data={"hyde": {"query": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}})
-        predicted_answers.append(response["answer_builder"]["answers"][0].data)
-        retrieved_contexts.append([d.content for d in response['answer_builder']['answers'][0].documents])
-
-    context_relevance = ContextRelevanceEvaluator()
-    faithfulness = FaithfulnessEvaluator()
-    sas = SASEvaluator(model=embedding_model)
-    sas.warm_up()
-    results = {
-        "context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
-        "faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
-        "sas": sas.run(predicted_answers, sample_answers)
-    }
-    inputs = {'questions': sample_questions, 'predicted_answers': predicted_answers}
-
-    return EvaluationRunResult(run_name="hyde_rag", inputs=inputs, results=results)
-
-
-def main():
-
-    questions, documents = load_transformed_squad()
-    doc_store = indexing(documents)
-
-    limit = 5
-    questions = questions[0:limit]
-    sample_ground_truth_answers = ground_truth_answers[0:limit]
-
-    basic_rag_results = run_basic_rag(doc_store, sample_questions, sample_ground_truth_answers)
-    hyde_rag_results = run_hyde_rag(doc_store, sample_questions, sample_ground_truth_answers)
-
-    comparative_df = basic_rag_results.comparative_individual_scores_report(hyde_rag_results)
-
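For the record, the deleted script could not have run as written: its `main()` referenced `sample_questions` and `ground_truth_answers` without ever defining them, which the rewrite fixes by sampling whole question records. One behaviour change worth flagging is that the new script's `random.sample` is unseeded, so every run scores a different subset of SQuAD. A hypothetical one-line tweak inside `main()` would make comparisons across runs reproducible:

# hypothetical tweak, not in the patch: seed the RNG in main() so repeated
# runs draw the same `limit` questions
random.seed(42)  # any fixed value works; 42 is arbitrary
samples = random.sample(all_questions, limit)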