diff --git a/README.md b/README.md
index 3e421f2..9e5633c 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,18 @@
 # haystack-evaluation
 Using Haystack to benchmark different RAG architectures over different datasets
+
+
+Use evaluation on the selected datasets to optimise parameters that are commonly tweaked in RAG pipelines:
+
+- top_k
+- chunk_size
+- embedding model
+
+
+Goal 1: give users practical guidance on which techniques to try out on their dataset/use case.
+
+Goal 2: show that there is no "silver bullet" solution; the best choice depends on the dataset and use case, but Haystack can support them all.
+
+Goal 3: showcase the advanced evaluation/experimentation API (the most advanced among comparable frameworks).
+
+This is not a research paper, so it should not be too "academic" (i.e. it is not restricted to a fixed set of metrics or datasets, and it is not meant to be peer-reviewed or submitted to an academic conference).
\ No newline at end of file
diff --git a/evaluations/arago_evaluation.py b/arago_evaluation.py
similarity index 95%
rename from evaluations/arago_evaluation.py
rename to arago_evaluation.py
index 1a24fce..61f66ae 100644
--- a/evaluations/arago_evaluation.py
+++ b/arago_evaluation.py
@@ -17,7 +17,7 @@ from architectures.hyde_rag import rag_with_hyde
 
 embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
 
-files_path = "../datasets/ARAGOG/papers_for_questions"
+files_path = "datasets/ARAGOG/papers_for_questions"
 
 
 def indexing():
@@ -39,7 +39,7 @@ def indexing():
 
 
 def read_question_answers():
-    with open("../datasets/ARAGOG/eval_questions.json", "r") as f:
+    with open("datasets/ARAGOG/eval_questions.json", "r") as f:
         data = json.load(f)
         questions = data["questions"]
         answers = data["ground_truths"]
@@ -69,7 +69,6 @@ def run_basic_rag(doc_store, sample_questions, sample_answers):
         "context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
         "faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
         "sas": sas.run(predicted_answers, sample_answers),
-        'predicted_answers': predicted_answers,
     }
     inputs = {'questions': sample_questions, "true_answers": sample_answers, "predicted_answers": predicted_answers}
 
@@ -93,7 +92,6 @@ def run_hyde_rag(doc_store, sample_questions, sample_answers):
     sas = SASEvaluator(model=embedding_model)
     sas.warm_up()
     results = {
-        'predicted_answers': predicted_answers,
         "context_relevance": context_relevance.run(sample_questions, retrieved_contexts),
         "faithfulness": faithfulness.run(sample_questions, retrieved_contexts, predicted_answers),
         "sas": sas.run(predicted_answers, sample_answers)
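The README above singles out top_k, chunk_size and the embedding model as the knobs to optimise, while arago_evaluation.py hard-codes them. Below is a minimal sketch of how a top_k sweep could reuse the same building blocks; `sweep_top_k` and its default values are illustrative, and the aggregate semantic-answer-similarity score is assumed to sit under the "score" key of SASEvaluator's output.

```python
from haystack.components.evaluators import SASEvaluator
from tqdm import tqdm

from architectures.basic_rag import basic_rag

embedding_model = "sentence-transformers/all-MiniLM-L6-v2"


def sweep_top_k(doc_store, questions, answers, top_k_values=(1, 2, 3)):
    """Illustrative sweep: evaluate the basic RAG pipeline with different top_k values."""
    sas = SASEvaluator(model=embedding_model)
    sas.warm_up()
    scores = {}
    for top_k in top_k_values:
        rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)
        predicted_answers = []
        for q in tqdm(questions):
            response = rag.run(
                data={"query_embedder": {"text": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}}
            )
            predicted_answers.append(response["answer_builder"]["answers"][0].data)
        # aggregate semantic answer similarity for this top_k (key assumed to be "score")
        scores[top_k] = sas.run(predicted_answers, answers)["score"]
    return scores
```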
diff --git a/architectures/custom_splitter.py b/architectures/custom_splitter.py
new file mode 100644
index 0000000..831b8b3
--- /dev/null
+++ b/architectures/custom_splitter.py
@@ -0,0 +1,88 @@
+from copy import deepcopy
+from typing import List, Tuple, Dict
+
+from haystack import component, Document
+from more_itertools import windowed
+
+
+@component
+class CustomDocumentSplitter:
+
+    def __init__(
+        self,
+        split_length: int = 200,
+        split_overlap: int = 0,
+    ):
+
+        self.split_by = "\n"
+        if split_length <= 0:
+            raise ValueError("split_length must be greater than 0.")
+        self.split_length = split_length
+        if split_overlap < 0:
+            raise ValueError("split_overlap must be greater than or equal to 0.")
+        self.split_overlap = split_overlap
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+
+        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
+            raise TypeError("DocumentSplitter expects a List of Documents as input.")
+
+        split_docs = []
+        for doc in documents:
+            if doc.content is None:
+                raise ValueError(
+                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
+                )
+            units = self._split_into_units(doc.content)
+            text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
+            metadata = deepcopy(doc.meta)
+            metadata["source_id"] = doc.id
+            split_docs += self._create_docs_from_splits(
+                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
+            )
+        return {"documents": split_docs}
+
+    def _split_into_units(self, text: str) -> List[str]:
+        split_at = "\n"
+        units = text.split(split_at)
+        # Add the delimiter back to all units except the last one
+        for i in range(len(units) - 1):
+            units[i] += split_at
+        return units
+
+    def _concatenate_units(
+        self, elements: List[str], split_length: int, split_overlap: int
+    ) -> Tuple[List[str], List[int]]:
+
+        text_splits = []
+        splits_pages = []
+        cur_page = 1
+        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
+        for seg in segments:
+            current_units = [unit for unit in seg if unit is not None]
+            txt = "".join(current_units)
+            if len(txt) > 0:
+                text_splits.append(txt)
+                splits_pages.append(cur_page)
+            processed_units = current_units[: split_length - split_overlap]
+            if self.split_by == "page":
+                num_page_breaks = len(processed_units)
+            else:
+                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
+            cur_page += num_page_breaks
+        return text_splits, splits_pages
+
+    @staticmethod
+    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
+        """
+        Creates Document objects from text splits enriching them with page number and the metadata of the original document.
+        """
+        documents: List[Document] = []
+
+        for i, txt in enumerate(text_splits):
+            meta = deepcopy(meta)
+            doc = Document(content=txt, meta=meta)
+            doc.meta["page_number"] = splits_pages[i]
+            documents.append(doc)
+        return documents
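CustomDocumentSplitter above always splits on newlines rather than the word/sentence/passage/page units of the stock DocumentSplitter, so chunk size is controlled in lines of text. A minimal usage sketch, mirroring the indexing() functions in the evaluation scripts; the split_length value and the PDF path are illustrative assumptions:

```python
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

from architectures.custom_splitter import CustomDocumentSplitter

document_store = InMemoryDocumentStore()
pipeline = Pipeline()
pipeline.add_component("converter", PyPDFToDocument())
pipeline.add_component("cleaner", DocumentCleaner())
# newline-based chunks: 150 lines per chunk, no overlap (illustrative values)
pipeline.add_component("splitter", CustomDocumentSplitter(split_length=150, split_overlap=0))
pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder("sentence-transformers/all-MiniLM-L6-v2"))
pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")

# illustrative path; any list of PDF files works
pipeline.run({"converter": {"sources": ["datasets/ARAGOG/papers_for_questions/example_paper.pdf"]}})
```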
diff --git a/datasets/datasets.md b/datasets/datasets.md
index 8468679..0218787 100644
--- a/datasets/datasets.md
+++ b/datasets/datasets.md
@@ -1,10 +1,7 @@
 # Datasets
 
 ToDo:
-- at least one should be financial or legal and raw data needs to be in structured pdfs
 - at least one should be about support/help centre
-- there should be one that has been used in other benchmarks (maybe based on wikipedia)
-
 
 ## SQuAD
 
@@ -23,4 +20,13 @@ ToDo:
 - data type: PDF files
 - source: https://github.com/predlico/ARAGOG
 - paper: [ARAGOG: Advanced RAG Output Grading](https://arxiv.org/pdf/2404.01037)
+- evaluation: [ContextRelevance](), [Faithfulness](), [Semantic Answer Similarity]()
+
+
+## Mini ESG Bench
+- domain: financial
+- labels: answer, context
+- data type: PDF files
+- source:
+- paper: [Mini ESG Bench Dataset](https://arxiv.org/abs/2404.01037)
 - evaluation: [ContextRelevance](), [Faithfulness](), [Semantic Answer Similarity]()
\ No newline at end of file
diff --git a/evaluations/evaluation.md b/evaluations/evaluation.md
deleted file mode 100644
index e69de29..0000000
diff --git a/mini_esg_evaluation.py b/mini_esg_evaluation.py
new file mode 100644
index 0000000..a0890d2
--- /dev/null
+++ b/mini_esg_evaluation.py
@@ -0,0 +1,95 @@
+import json
+import os
+import random
+
+from haystack import Pipeline, component
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.components.converters import PyPDFToDocument
+from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator
+from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.types import DuplicatePolicy
+from haystack.evaluation import EvaluationRunResult
+from tqdm import tqdm
+
+from architectures.basic_rag import basic_rag
+
+embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+files_path = "datasets/MiniESGBench/"
+
+
+def indexing():
+    document_store = InMemoryDocumentStore()
+    pipeline = Pipeline()
+    pipeline.add_component("converter", PyPDFToDocument())
+    pipeline.add_component("cleaner", DocumentCleaner())
+    pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=128))
+    pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
+    pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(embedding_model))
+    pipeline.connect("converter", "cleaner")
+    pipeline.connect("cleaner", "splitter")
+    pipeline.connect("splitter", "embedder")
+    pipeline.connect("embedder", "writer")
+    pdf_files = [files_path+"source_files/"+f_name for f_name in os.listdir(files_path+"source_files/")]
+    pipeline.run({"converter": {"sources": pdf_files}})
+
+    return document_store
+
+
+def read_question_answers():
+    with open(files_path+"/rag_dataset.json", "r") as f:
+        data = json.load(f)
+        questions = []
+        contexts = []
+        answers = []
+        for entry in data['examples']:
+            questions.append(entry['query'])
+            contexts.append(entry['reference_contexts'])
+            answers.append(entry['reference_answer'])
+
+    return questions, contexts, answers
+
+
+def run_basic_rag(doc_store, questions_sample, answers_sample, contexts_sample):
+    """
+    Runs the basic RAG pipeline on a set of sample questions and evaluates the answers.
+    """
+
+    rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=2)
+
+    predicted_answers = []
+    retrieved_contexts = []
+    for q in tqdm(questions_sample):
+        response = rag.run(
+            data={"query_embedder": {"text": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}})
+        predicted_answers.append(response["answer_builder"]["answers"][0].data)
+        retrieved_contexts.append([d.content for d in response['answer_builder']['answers'][0].documents])
+
+    context_relevance = ContextRelevanceEvaluator()
+    faithfulness = FaithfulnessEvaluator()
+    sas = SASEvaluator(model=embedding_model)
+    sas.warm_up()
+    results = {
+        "context_relevance": context_relevance.run(questions_sample, retrieved_contexts),
+        "faithfulness": faithfulness.run(questions_sample, retrieved_contexts, predicted_answers),
+        "sas": sas.run(predicted_answers, answers_sample),
+    }
+    inputs = {'questions': questions_sample, "true_answers": answers_sample, "predicted_answers": predicted_answers}
+
+    return EvaluationRunResult(run_name="basic_rag", inputs=inputs, results=results)
+
+
+def main():
+    doc_store = indexing()
+    questions, contexts, answers = read_question_answers()
+
+    limit = 5
+    # sample by index so that questions, contexts and answers stay aligned
+    sample_idx = random.sample(range(len(questions)), limit)
+    questions_sample = [questions[i] for i in sample_idx]
+    contexts_sample = [contexts[i] for i in sample_idx]
+    answers_sample = [answers[i] for i in sample_idx]
+
+    basic_rag_results = run_basic_rag(doc_store, questions_sample, answers_sample, contexts_sample)
+
+
+if __name__ == "__main__":
+    main()
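main() above stops right after building the EvaluationRunResult. A minimal sketch of how the result could be inspected, assuming the score_report() and to_pandas() helpers exposed by EvaluationRunResult in recent Haystack 2.x releases; the CSV filename is illustrative:

```python
# Hypothetical follow-up to main() in mini_esg_evaluation.py.
basic_rag_results = run_basic_rag(doc_store, questions_sample, answers_sample, contexts_sample)

print(basic_rag_results.score_report())   # aggregate score per metric (context relevance, faithfulness, SAS)
df = basic_rag_results.to_pandas()         # one row per question with the individual scores
df.to_csv("mini_esg_basic_rag_results.csv", index=False)
```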
diff --git a/evaluations/squad_evaluation.py b/squad_evaluation.py
similarity index 99%
rename from evaluations/squad_evaluation.py
rename to squad_evaluation.py
index e179fb4..ad22fad 100644
--- a/evaluations/squad_evaluation.py
+++ b/squad_evaluation.py
@@ -23,7 +23,7 @@ from architectures.hyde_rag import rag_with_hyde
 
 embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
 
-base_path = "../datasets/SQuAD-2.0/transformed_squad/"
+base_path = "datasets/SQuAD-2.0/transformed_squad/"
 
 
 def load_transformed_squad():
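Both arago_evaluation.py and squad_evaluation.py evaluate a basic RAG pipeline and a HyDE variant over the same questions. A sketch of how the two EvaluationRunResult objects could be compared side by side; comparative_individual_scores_report() is assumed from recent Haystack 2.x releases and the CSV filename is illustrative:

```python
# Hypothetical comparison of the two runs produced by run_basic_rag() and
# run_hyde_rag() in arago_evaluation.py / squad_evaluation.py.
basic_rag_results = run_basic_rag(doc_store, questions, answers)
hyde_rag_results = run_hyde_rag(doc_store, questions, answers)

# per-question scores of both runs in a single pandas DataFrame
comparison = basic_rag_results.comparative_individual_scores_report(hyde_rag_results)
print(comparison.head())
comparison.to_csv("basic_vs_hyde_comparison.csv", index=False)
```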