diff --git a/config.toml b/config.toml index a3ea96ab..6350bfb8 100644 --- a/config.toml +++ b/config.toml @@ -74,9 +74,9 @@ paginatePath = "/" [[menu.main]] - identifier = 'tutorials' - name = 'Tutorials' - url = '/tutorials' + identifier = 'learn' + name = 'Learn' + url = '/' weight = 3 params = { tag = "2.0" } @@ -117,6 +117,18 @@ paginatePath = "/" parent = 'overview' weight = 2 +[[menu.main]] + name = 'šŸ“š Tutorials' + url = '/tutorials' + parent = 'learn' + weight = 1 + +[[menu.main]] + name = 'šŸ§‘ā€šŸ³ Cookbooks' + url = 'https://github.com/deepset-ai/haystack-cookbook' + parent = 'learn' + weight = 2 + # Resources children [[menu.main]] name = 'Release Notes' diff --git a/content/_index.md b/content/_index.md index 98da04a8..07b1983f 100644 --- a/content/_index.md +++ b/content/_index.md @@ -27,7 +27,7 @@ hero: text: Haystack 2.0 is built from the ground-up with production in mind. Our pipelines are fully serializable and perfect for K8s native workflows. Logging and monitoring integrations give you the transparency you need. Our deployment guides walk you through full-scale deployments on all clouds and on-prem. CTA: - link: https://www.deepset.ai/deepset-cloud?utm_campaign=developer-relations&utm_source=haystack&utm_medium=website + link: https://www.deepset.ai/deepset-cloud text: Looking for a managed solution to accelerate your time to value? logo: /images/logos/deepset-cloud.svg diff --git a/content/authors/anshul-jindal/_index.md b/content/authors/anshul-jindal/_index.md new file mode 100644 index 00000000..77bba11e --- /dev/null +++ b/content/authors/anshul-jindal/_index.md @@ -0,0 +1,17 @@ +--- +layout: author +title: Anshul Jindal +name: Anshul Jindal +slug: anshul-jindal +position: Senior Solution Architect - Cloud Services at NVIDIA +image: /images/authors/anshul-jindal.jpeg +socials: + author_page: + - name: LinkedIn + url: https://www.linkedin.com/in/ansjin/ + icon: /images/icons/linkedin-white.svg + blog_posts: + - name: LinkedIn + url: https://www.linkedin.com/in/ansjin/ + icon: /images/icons/linkedin-dark.svg +--- \ No newline at end of file diff --git a/content/authors/david-batista/_index.md b/content/authors/david-batista/_index.md index ff0f5518..728a404e 100644 --- a/content/authors/david-batista/_index.md +++ b/content/authors/david-batista/_index.md @@ -1,5 +1,6 @@ --- layout: author +title: David Batista name: David Batista slug: david-batista position: Senior NLP Engineer diff --git a/content/authors/meriem-bendris/_index.md b/content/authors/meriem-bendris/_index.md new file mode 100644 index 00000000..078a5c12 --- /dev/null +++ b/content/authors/meriem-bendris/_index.md @@ -0,0 +1,17 @@ +--- +layout: author +title: Meriem Bendris +name: Meriem Bendris +slug: Meriem Bendris +position: Senior Solution Architect - AI at NVIDIA +image: /images/authors/meriem-bendris.jpeg +socials: + author_page: + - name: LinkedIn + url: https://www.linkedin.com/in/meriem-bendris-74064530/ + icon: /images/icons/linkedin-white.svg + blog_posts: + - name: LinkedIn + url: https://www.linkedin.com/in/meriem-bendris-74064530/ + icon: /images/icons/linkedin-dark.svg +--- \ No newline at end of file diff --git a/content/blog/astradb-haystack-integration/index.md b/content/blog/astradb-haystack-integration/index.md index 8c73c060..95e62790 100644 --- a/content/blog/astradb-haystack-integration/index.md +++ b/content/blog/astradb-haystack-integration/index.md @@ -164,4 +164,4 @@ The output should look like this: ## Wrapping it up -If you've gotten this far, now you know how to use 
Astra DB as a data source for your Haystack pipeline. To learn more about Haystack, [join us on Discord](https://discord.gg/QMP5jgMH) or [sign up for our monthly newsletter](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=astradb-haystack-notebook). +If you've gotten this far, now you know how to use Astra DB as a data source for your Haystack pipeline. To learn more about Haystack, [join us on Discord](https://discord.gg/QMP5jgMH) or [sign up for our monthly newsletter](https://landing.deepset.ai/haystack-community-updates). diff --git a/content/blog/benchmarking-haystack-pipelines/boxplot.png b/content/blog/benchmarking-haystack-pipelines/boxplot.png new file mode 100644 index 00000000..eaf441be Binary files /dev/null and b/content/blog/benchmarking-haystack-pipelines/boxplot.png differ diff --git a/content/blog/benchmarking-haystack-pipelines/index.md b/content/blog/benchmarking-haystack-pipelines/index.md new file mode 100644 index 00000000..a6bc5963 --- /dev/null +++ b/content/blog/benchmarking-haystack-pipelines/index.md @@ -0,0 +1,674 @@ +--- +layout: blog-post +title: "Benchmarking Haystack Pipelines for Optimal Performance" +description: Step-by-step instructions to evaluate and optimize your RAG pipeline's performance +featured_image: thumbnail.png +alt_image: "'Benchmarking Haystack Pipelines for Optimal Performance' text in front of an image with illustrations about evaluation and benchmarking" +images: ["blog/benchmarking-haystack-pipelines/thumbnail.png"] +toc: True +date: 2024-06-24 +last_updated: 2024-06-24 +authors: + - David Batista +tags: ["Evaluation", "RAG"] +--- + +In this article, we will show you how to use Haystack to evaluate the performance of a RAG pipeline. Note that the code in this article is meant to be illustrative and may not run as is; if you want to run the code, please refer to the [python script](https://github.com/deepset-ai/haystack-evaluation/blob/main/evaluations/evaluation_aragog.py). + +## Introduction + +This article will guide you through building a Retrieval-Augmented Generation (RAG) pipeline using Haystack, adjusting various parameters, and evaluating it with the ARAGOG dataset. The dataset consists of pairs of questions and answers, and our objective is to assess the RAG pipeline's efficiency in retrieving the correct context and generating accurate answers. To do this, we will use the following evaluation metrics: + +- [ContextRelevance](https://docs.haystack.deepset.ai/docs/contextrelevanceevaluator) +- [Faithfulness](https://docs.haystack.deepset.ai/docs/faithfulnessevaluator) +- [Semantic Answer Similarity](https://docs.haystack.deepset.ai/docs/sasevaluator) + +We did this experiment by relying on three different Haystack pipelines with different purposes: one pipeline for indexing, another for RAG, and one for evaluation. We describe each of these pipelines in detail and show how to combine them together to evaluate the RAG pipeline. + +The article is organized as follows: we first describe the origin and authorship of the ARAGOG dataset, then we build the pipelines. We then demonstrate how to integrate everything, performing multiple runs over the dataset and adjusting parameters. These parameters were chosen based on feedback from our community, reflecting how users optimize their pipelines: + +- `top_k`: the maximum number of documents returned by the retriever. For this experiment, we tested our pipeline with `top_k` value of `[1, 2, 3]`. 
+- `embedding_model`: the model used to encode the documents and the question. For this example, we used these sentence-transformers models: + * `all-MiniLM-L6-v2` + * `msmarco-distilroberta-base-v2` + * `all-mpnet-base-v2` +- `chunk_size`: the number of tokens in the input text that makes up segments of text to be embedded and indexed. For this experiment, we tested our pipeline with `chunk_size` of `[64, 128, 256]`. + +We end by discussing the results of the evaluation and sharing some lessons learned. + +### The "ARAGOG: Advanced RAG Output Grading" Dataset + +The knowledge data, as well as the questions and answers, all stem from the [ARAGOG: Advanced RAG Output Grading](https://arxiv.org/pdf/2404.01037) paper. The data is a subset of the [AI ArXiv Dataset](https://huggingface.co/datasets/jamescalam/ai-arxiv) and consists of 423 selected research papers centered around the themes of Transformers and Large Language Models (LLMs). + +The evaluation dataset comprises 107 question-answer pairs (QA) generated with the assistance of GPT-4. Each QA pair is validated and corrected by humans, ensuring that the evaluation is correct and accurately measures the RAG techniquesā€™ performance in real-world applications. + +Within the scope of this article, we only considered 16 papers, the ones from which the questions were drawn, instead of the 423 papers in the original dataset, to reduce the computational cost. + +## The Indexing Pipeline + +The indexing pipeline is responsible for preprocessing and storing the documents in a [`DocumentStore`](https://docs.haystack.deepset.ai/docs/document-store). We will define a function that wraps a pipeline, takes the embedding model and the chunk size as parameters, and returns a DocumentStore for later use. The pipeline in the function first converts the PDF files into Documents, cleans them, splits them into chunks, and then embeds them using a [`SentenceTransformers`](https://docs.haystack.deepset.ai/reference/embedders-api#sentencetransformersdocumentembedder) model. The embeddings are then stored in an [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore). Learn more about creating an indexing pipeline in šŸ“š [Tutorial: Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline). + +> For this example, we store the documents using the [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), but you can use any [other document store supported by Haystack](https://docs.haystack.deepset.ai/docs/choosing-a-document-store). We split the documents by word, but you can split them by sentence or paragraph by changing the value of `split_by` parameter in the [`DocumentSplitter`](https://docs.haystack.deepset.ai/docs/documentsplitter) component. +> + +We need to pass the parameters `embedding_model` and `chunk_size` to this indexing pipeline function since we want to experiment with different indexing approaches. 
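+
+As the note above mentions, only the splitter configuration changes if you prefer a different splitting unit. As a minimal illustration (not part of the experiment below), a sentence-based splitter could be configured like this:
+
+```python
+from haystack.components.preprocessors import DocumentSplitter
+
+# Illustrative sketch: split into chunks of 5 sentences instead of `chunk_size` words
+sentence_splitter = DocumentSplitter(split_by="sentence", split_length=5)
+```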
+ +The indexing pipeline function is defined as follows: + +```python +import os + +from haystack import Pipeline +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.components.converters import PyPDFToDocument +from haystack.components.embedders import SentenceTransformersDocumentEmbedder +from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter +from haystack.components.writers import DocumentWriter +from haystack.document_stores.types import DuplicatePolicy + +def indexing(embedding_model: str, chunk_size: int): + files_path = "datasets/ARAGOG/papers_for_questions" + document_store = InMemoryDocumentStore() + pipeline = Pipeline() + pipeline.add_component("converter", PyPDFToDocument()) + pipeline.add_component("cleaner", DocumentCleaner()) + pipeline.add_component("splitter", DocumentSplitter(split_length=chunk_size)) # default splitting by word + pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)) + pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(embedding_model)) + pipeline.connect("converter", "cleaner") + pipeline.connect("cleaner", "splitter") + pipeline.connect("splitter", "embedder") + pipeline.connect("embedder", "writer") + pdf_files = [files_path+"/"+f_name for f_name in os.listdir(files_path)] + pipeline.run({"converter": {"sources": pdf_files}}) + + return document_store +``` + +## The RAG Pipeline + +We use a simple RAG pipeline composed of a retriever, a prompt builder, a language model, and an answer builder. First, we use the [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder) to embed the query and an [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) to retrieve the `top-k` documents relevant to the query. We then rely on an LLM to generate an answer based on the context retrieved from the documents and the query question. + +We used the OpenAI API through the [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator) with the `gpt-3.5-turbo` model in our implementation. The [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder) is responsible for building the prompt to be fed to the LLM, using a template that includes the context and the question. Finally, the [`AnswerBuilder`](https://docs.haystack.deepset.ai/docs/answerbuilder) is responsible for extracting the answer from the LLM output and returning it. Learn more about creating a RAG pipeline in šŸ“š [Tutorial: Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). + +> Note that we instruct the LLM to explicitly answer `"None"` when the context is empty. We do this to avoid the LLM answering the prompt with its own internal knowledge. +> + +After creating the pipeline, we wrap it with a function to easily initialize it with different parameters. We expect a `document_store`, an `embedding_model`, and the `top_k` for this function. 
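+
+As a minimal usage sketch of the function defined below, building a RAG pipeline for a single parameter combination would look like this (assuming `doc_store` is the document store returned by the `indexing` function above):
+
+```python
+# Sketch: one RAG pipeline for a single parameter combination;
+# `doc_store` is assumed to come from indexing(embedding_model, chunk_size)
+rag = rag_pipeline(
+    document_store=doc_store,
+    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+    top_k=2,
+)
+```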
+ +The RAG pipeline is defined as follows: + +```python +from haystack import Pipeline +from haystack.components.builders import PromptBuilder, AnswerBuilder +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.generators import OpenAIGenerator +from haystack.components.retrievers import InMemoryEmbeddingRetriever + +def rag_pipeline(document_store, embedding_model, top_k=2): + template = """ + You have to answer the following question based on the given context information only. + If the context is empty or just a '\\n' answer with None, example: "None". + + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + + Question: {{question}} + Answer: + """ + + basic_rag = Pipeline() + basic_rag.add_component("query_embedder", SentenceTransformersTextEmbedder( + model=embedding_model, progress_bar=False + )) + basic_rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k)) + basic_rag.add_component("prompt_builder", PromptBuilder(template=template)) + basic_rag.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo")) + basic_rag.add_component("answer_builder", AnswerBuilder()) + + basic_rag.connect("query_embedder", "retriever.query_embedding") + basic_rag.connect("retriever", "prompt_builder.documents") + basic_rag.connect("prompt_builder", "llm") + basic_rag.connect("llm.replies", "answer_builder.replies") + basic_rag.connect("llm.meta", "answer_builder.meta") + basic_rag.connect("retriever", "answer_builder.documents") + + return basic_rag +``` + +## The Evaluation Pipeline + +We will also need an evaluation pipeline, which will be responsible for computing the scoring metrics to measure the performance of the RAG pipeline. You can learn how to build an evaluation pipeline in šŸ“š [Tutorial: Evaluating RAG Pipelines](https://haystack.deepset.ai/tutorials/35_evaluating_rag_pipelines). The evaluation pipeline will include three evaluators: + +- [ContextRelevanceEvaluator](https://docs.haystack.deepset.ai/docs/contextrelevanceevaluator) will assess the relevancy of the retrieved context to answer the query question +- [FaithfulnessEvaluator](https://docs.haystack.deepset.ai/docs/faithfulnessevaluator) evaluates whether the generated answer can be derived from the context +- [SASEvaluator](https://docs.haystack.deepset.ai/docs/sasevaluator) compares the embedding of a generated answer against a ground-truth answer based on a common embedding model. + +This new function returns the evaluation results and the inputs used to run the evaluation. This data is useful for later analysis and understanding the pipeline's performance in more detail and granularity. We need to pass the `questions` and `answers` from the dataset to the function, plus the data generated by the RAG pipeline, i.e., `retrieved_contexts`, `predicted_answers`, and the `embedding_model` used for these results. 
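+
+Each evaluator can also be tried on its own before wiring everything into a pipeline, which is handy for sanity-checking inputs. Here is a minimal sketch with the `SASEvaluator` (the example strings are made up, and `warm_up()` is called because the component runs outside a pipeline here):
+
+```python
+from haystack.components.evaluators import SASEvaluator
+
+sas = SASEvaluator(model="sentence-transformers/all-MiniLM-L6-v2")
+sas.warm_up()  # load the embedding model
+result = sas.run(
+    ground_truth_answers=["BERT achieved 80.5% on the GLUE benchmark."],
+    predicted_answers=["BERT scored 80.5% on GLUE."],
+)
+print(result["score"])  # aggregate similarity over all answer pairs
+```
+
+The full evaluation function is defined as follows: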
+ +```python +from haystack import Pipeline +from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator + +def evaluation(questions, answers, retrieved_contexts, predicted_answers, embedding_model): + eval_pipeline = Pipeline() + eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator(raise_on_failure=False)) + eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator(raise_on_failure=False)) + eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model)) + + eval_pipeline_results = eval_pipeline.run( + { + "context_relevance": {"questions": questions, "contexts": retrieved_contexts}, + "faithfulness": {"questions": questions, "contexts": retrieved_contexts, "predicted_answers": predicted_answers}, + "sas": {"predicted_answers": predicted_answers, "ground_truth_answers": answers}, + } + ) + + results = { + "context_relevance": eval_pipeline_results['context_relevance'], + "faithfulness": eval_pipeline_results['faithfulness'], + "sas": eval_pipeline_results['sas'] + } + + inputs = { + 'questions': sample_questions, + 'contexts': retrieved_contexts, + 'true_answers': sample_answers, + 'predicted_answers': predicted_answers + } + + return results, inputs + +``` + +## Putting it all together + +We now have the building blocks to evaluate the RAG pipeline: indexing the knowledge data, generating answers using a RAG architecture, and evaluating the results. However, we still need a method to run the questions over our RAG pipeline and collect all the needed results to perform an evaluation. +We will use a function that wraps up all the interactions with the RAG pipeline. It takes as parameters a `document_store`, the `questions`, an `embedding_model` and the `top_k` and returns the retrieved contexts and the predicted answers. + +```python +def run_rag(document_store, sample_questions, embedding_model, top_k): + """ + A function to run the basic rag model on a set of sample questions and answers + """ + + rag = rag_pipeline(document_store=document_store, embedding_model=embedding_model, top_k=top_k) + + predicted_answers = [] + retrieved_contexts = [] + for q in tqdm(sample_questions): + try: + response = rag.run( + data={"query_embedder": {"text": q}, "prompt_builder": {"question": q}, "answer_builder": {"query": q}}) + predicted_answers.append(response["answer_builder"]["answers"][0].data) + retrieved_contexts.append([d.content for d in response['answer_builder']['answers'][0].documents]) + except BadRequestError as e: + print(f"Error with question: {q}") + print(e) + predicted_answers.append("error") + retrieved_contexts.append(retrieved_contexts) + + return retrieved_contexts, predicted_answers +``` + +Notice that we wrap the call to the RAG pipeline in a try-except block to handle any errors that may occur during the pipeline's execution. This might happen, for instance, if the prompt is too bigā€”due to large contextsā€”for the model to generate an answer, if there are network errors, or simply if the model cannot generate an answer for any other reason. + +> You can decide if the LLM-based evaluators stop immediately if an error is found or if they ignore the evaluation for a particular sample and continue see, for instance in the [ContextRelevanceEvaluator](https://docs.haystack.deepset.ai/docs/contextrelevanceevaluator#overview), the `raise_on_failure` parameter. 
+> + +Finally, we need to run whole query questions through the pipeline over the dataset for each possible combination of the parameters `top_k`, `embedding_model`, and `chunk_size`. That's handled by the next function. + +> Note that for indexing, we only vary the `embedding_model` and `chunk_size`, as the `top_k` parameter does not affect the indexing. +> + +```python +def parameter_tuning(out_path: str): + + base_path = "../datasets/ARAGOG/" + + with open(base_path + "eval_questions.json", "r") as f: + data = json.load(f) + questions = data["questions"] + answers = data["ground_truths"] + + embedding_models = { + "sentence-transformers/all-MiniLM-L6-v2", + "sentence-transformers/msmarco-distilroberta-base-v2", + "sentence-transformers/all-mpnet-base-v2" + } + top_k_values = [1, 2, 3] + chunk_sizes = [64, 128, 256] + + # create results directory + out_path = Path(out_path) + out_path.mkdir(exist_ok=True) + + for embedding_model in embedding_models: + for chunk_size in chunk_sizes: + print(f"Indexing documents with {embedding_model} model with a chunk_size={chunk_size}") + doc_store = indexing(embedding_model, chunk_size) + for top_k in top_k_values: + name_params = f"{embedding_model.split('/')[-1]}__top_k:{top_k}__chunk_size:{chunk_size}" + print(name_params) + print("Running RAG pipeline") + retrieved_contexts, predicted_answers = run_rag(doc_store, questions, embedding_model, top_k) + print(f"Running evaluation") + results, inputs = evaluation(questions, answers, retrieved_contexts, predicted_answers, embedding_model) + eval_results = EvaluationRunResult(run_name=name_params, inputs=inputs, results=results) + eval_results.score_report().to_csv(f"{out_path}/score_report_{name_params}.csv", index=False) + eval_results.to_pandas().to_csv(f"{out_path}/detailed_{name_params}.csv", index=False) + +``` + +This function will store the results in a directory specified by the `out_path` parameter. The results will be stored in `.csv` files. For each parameter combination, there will be two files generated, one with the aggregated score report overall questions (e.g.: +`score_report_all-MiniLM-L6-v2__top_k:3__chunk_size:128.csv`) and another with the detailed results for each question (e.g.: `detailed_all-MiniLM-L6-v2__top_k:3__chunk_size:128.csv`). + +Note that we make use of the [EvaluationRunResult](https://docs.haystack.deepset.ai/reference/evaluation-api#evaluationrunresult) to store the results and generate the score report and the detailed results in the `.csv` files. + +In the next section, we will show the evaluation results and discuss the insights gained from the experiment. + +## Results Analysis + +> You can run [this notebook](https://github.com/deepset-ai/haystack-evaluation/blob/main/evaluations/analyze_aragog_parameter_search.ipynb) to visualize and analyze the results. All relevant `.csv` files can be found in the [aragog_parameter_search_2024_06_12 folder](https://github.com/deepset-ai/haystack-evaluation/tree/main/evaluations/results/aragog_parameter_search_2024_06_12). +> + +To make the analysis of the results easier, we will load all the aggregated score reports from the different parameter combinations from multiple `.csv` files into a single DataFrame. 
For that, we use the following code to parse the file content: + +```python +import os +import re +import pandas as pd + +def parse_results(f_name: str): + pattern = r"score_report_(.*?)__top_k:(\\d+)__chunk_size:(\\d+)\\.csv" + match = re.search(pattern, f_name) + if match: + embeddings_model = match.group(1) + top_k = int(match.group(2)) + chunk_size = int(match.group(3)) + return embeddings_model, top_k, chunk_size + else: + print("No match found") + +def read_scores(path: str): + all_scores = [] + for root, dirs, files in os.walk(path): + for f_name in files: + if not f_name.startswith("score_report"): + continue + + embeddings_model, top_k, chunk_size = parse_results(f_name) + + df = pd.read_csv(path+"/"+f_name) + + df.rename(columns={'Unnamed: 0': 'metric'}, inplace=True) + df_transposed = df.T + df_transposed.columns = df_transposed.iloc[0] + df_transposed = df_transposed[1:] + + # Add new columns + df_transposed['embeddings'] = embeddings_model + df_transposed['top_k'] = top_k + df_transposed['chunk_size'] = chunk_size + + all_scores.append(df_transposed) + + df = pd.concat(all_scores) + df.reset_index(drop=True, inplace=True) + df.rename_axis(None, axis=1, inplace=True) + + return df + +``` + +We can then read the scores from the CSV files and analyze the results. + +```python +df = read_scores('aragog_results/') +``` + +We can now analyze the results in a single table: + +| context_relevance | faithfulness | sas | embeddings | top_k | chunk_size | +| --- | --- | --- | --- | --- | --- | +| 0.834891 | 0.738318 | 0.524882 | all-MiniLM-L6-v2 | 1 | 64 | +| 0.869485 | 0.895639 | 0.633806 | all-MiniLM-L6-v2 | 2 | 64 | +| 0.933489 | 0.948598 | 0.65133 | all-MiniLM-L6-v2 | 3 | 64 | +| 0.843447 | 0.831776 | 0.555873 | all-MiniLM-L6-v2 | 1 | 128 | +| 0.912355 | NaN | 0.661135 | all-MiniLM-L6-v2 | 2 | 128 | +| 0.94463 | 0.928349 | 0.659311 | all-MiniLM-L6-v2 | 3 | 128 | +| 0.912991 | 0.827103 | 0.574832 | all-MiniLM-L6-v2 | 1 | 256 | +| 0.951702 | 0.925456 | 0.642837 | all-MiniLM-L6-v2 | 2 | 256 | +| 0.909638 | 0.932243 | 0.676347 | all-MiniLM-L6-v2 | 3 | 256 | +| 0.791589 | 0.67757 | 0.480863 | all-mpnet-base-v2 | 1 | 64 | +| 0.82648 | 0.866044 | 0.584507 | all-mpnet-base-v2 | 2 | 64 | +| 0.901218 | 0.890654 | 0.611468 | all-mpnet-base-v2 | 3 | 64 | +| 0.897715 | 0.845794 | 0.538579 | all-mpnet-base-v2 | 1 | 128 | +| 0.916422 | 0.892523 | 0.609728 | all-mpnet-base-v2 | 2 | 128 | +| 0.948038 | NaN | 0.643175 | all-mpnet-base-v2 | 3 | 128 | +| 0.867887 | 0.834112 | 0.560079 | all-mpnet-base-v2 | 1 | 256 | +| 0.946651 | 0.88785 | 0.639072 | all-mpnet-base-v2 | 2 | 256 | +| 0.941952 | 0.91472 | 0.645992 | all-mpnet-base-v2 | 3 | 256 | +| 0.909813 | 0.738318 | 0.530884 | msmarco-distilroberta-base-v2 | 1 | 64 | +| 0.88004 | 0.929907 | 0.600428 | msmarco-distilroberta-base-v2 | 2 | 64 | +| 0.918135 | 0.934579 | 0.67328 | msmarco-distilroberta-base-v2 | 3 | 64 | +| 0.885314 | 0.869159 | 0.587424 | msmarco-distilroberta-base-v2 | 1 | 128 | +| 0.953649 | 0.919003 | 0.664224 | msmarco-distilroberta-base-v2 | 2 | 128 | +| 0.945016 | 0.936916 | 0.68591 | msmarco-distilroberta-base-v2 | 3 | 128 | +| 0.949844 | 0.866822 | 0.613355 | msmarco-distilroberta-base-v2 | 1 | 256 | +| 0.952544 | 0.893769 | 0.662694 | msmarco-distilroberta-base-v2 | 2 | 256 | +| 0.964182 | 0.943925 | 0.62854 | msmarco-distilroberta-base-v2 | 3 | 256 | + +> We can see some NaN values for the faithfullness scores which is based on an LLM-based evaluator. This was due to network errors when calling the OpenAI API. 
+> + +Let's now see which parameter configuration yielded the **best Semantic Similarity Answer** score + +```python +df.sort_values(by=['sas'], ascending=[False]) +``` + +| context_relevance | faithfulness | sas | embeddings | top_k | chunk_size | +| --- | --- | --- | --- | --- | --- | +| 0.945016 | 0.936916 | 0.68591 | msmarco-distilroberta-base-v2 | 3 | 128 | +| 0.909638 | 0.932243 | 0.676347 | all-MiniLM-L6-v2 | 3 | 256 | +| 0.918135 | 0.934579 | 0.67328 | msmarco-distilroberta-base-v2 | 3 | 64 | +| 0.953649 | 0.919003 | 0.664224 | msmarco-distilroberta-base-v2 | 2 | 128 | +| 0.952544 | 0.893769 | 0.662694 | msmarco-distilroberta-base-v2 | 2 | 256 | +| 0.912355 | NaN | 0.661135 | all-MiniLM-L6-v2 | 2 | 128 | +| 0.94463 | 0.928349 | 0.659311 | all-MiniLM-L6-v2 | 3 | 128 | +| 0.933489 | 0.948598 | 0.65133 | all-MiniLM-L6-v2 | 3 | 64 | +| 0.941952 | 0.91472 | 0.645992 | all-mpnet-base-v2 | 3 | 256 | +| 0.948038 | NaN | 0.643175 | all-mpnet-base-v2 | 3 | 128 | +| 0.951702 | 0.925456 | 0.642837 | all-MiniLM-L6-v2 | 2 | 256 | +| 0.946651 | 0.88785 | 0.639072 | all-mpnet-base-v2 | 2 | 256 | +| 0.869485 | 0.895639 | 0.633806 | all-MiniLM-L6-v2 | 2 | 64 | +| 0.964182 | 0.943925 | 0.62854 | msmarco-distilroberta-base-v2 | 3 | 256 | +| 0.949844 | 0.866822 | 0.613355 | msmarco-distilroberta-base-v2 | 1 | 256 | +| 0.901218 | 0.890654 | 0.611468 | all-mpnet-base-v2 | 3 | 64 | +| 0.916422 | 0.892523 | 0.609728 | all-mpnet-base-v2 | 2 | 128 | +| 0.88004 | 0.929907 | 0.600428 | msmarco-distilroberta-base-v2 | 2 | 64 | +| 0.885314 | 0.869159 | 0.587424 | msmarco-distilroberta-base-v2 | 1 | 128 | +| 0.82648 | 0.866044 | 0.584507 | all-mpnet-base-v2 | 2 | 64 | +| 0.912991 | 0.827103 | 0.574832 | all-MiniLM-L6-v2 | 1 | 256 | +| 0.867887 | 0.834112 | 0.560079 | all-mpnet-base-v2 | 1 | 256 | +| 0.843447 | 0.831776 | 0.555873 | all-MiniLM-L6-v2 | 1 | 128 | +| 0.897715 | 0.845794 | 0.538579 | all-mpnet-base-v2 | 1 | 128 | +| 0.909813 | 0.738318 | 0.530884 | msmarco-distilroberta-base-v2 | 1 | 64 | +| 0.834891 | 0.738318 | 0.524882 | all-MiniLM-L6-v2 | 1 | 64 | +| 0.791589 | 0.67757 | 0.480863 | all-mpnet-base-v2 | 1 | 64 | + +Focusing on theĀ **Semantic Answer Similarity**: + +- TheĀ `msmarco-distilroberta-base-v2`Ā embeddings model with a `top_k=3` and a `chunk_size=128` yields the best results. +- In this evaluation, retrieving documents with `top_k=3` will most usually yield a higher semantic similarity score than with `top_k=1` or `top_k=2` +- It also seems that regardless of the `top_k` and `chunk_size` the best semantic similarity scores come from using the embedding modelĀ `all-MiniLM-L6-v2`Ā and theĀ `msmarco-distilroberta-base-v2` + +Let's inspect how the scores of each embedding model compare with each other in terms of **Semantic Answer Similarity**. 
For that, we will group the results by the embeddings column and plot the scores using box plots + +```python +from matplotlib import pyplot as plt + +fig, ax = plt.subplots(figsize=(10, 6)) +df.boxplot(column='sas', by='embeddings', ax=ax) + +plt.xlabel("Embeddings Model") +plt.ylabel("Semantic Answer Similarity Values") +plt.title("Boxplots of Semantic Answer Similarity Values Aggregated by Embeddings") + +plt.show() +``` + +![Box-plot displaying the Semantic Answer Similarity Values Aggregated by Embeddings](boxplot.png#medium) + +The box-plots above show that: + +- The `all-MiniLM-L6-v2` and the `msmarco-distilroberta-base-v2` embedding models outperform the `all-mpnet-base-v2` +- The `msmarco-distilroberta-base-v2` scores have less variance, indicating that this model is more stable to `top_k` and `chunk_size` parameter variations than the other models +- All three embedding models have an outlier corresponding to the highest-scoring and lowest-scoring parameter combination +- Not surprisingly, all the lowest scores outliers correspond to `top_k=1` and `chunk_size=64` +- The highest scores outliers correspond to `top_k=3` and a `chunk_size` of `128` or `256` + +Since we have the ground truth answers, we focuses on the **Semantic Similarity Answer**, but letā€™s also look at the **Faithfulness** and **Context Relevance** scores for a few examples. For that, we will need to load the detailed scores: + +```python +detailed_best_sas_df = pd.read_csv("results/aragog_results/detailed_all-MiniLM-L6-v2__top_k:3__chunk_size:128.csv") + +def inspect(idx): + print("Question: ") + print(detailed_best_sas_df.loc[idx]['questions']) + print("\nTrue Answer:") + print(detailed_best_sas_df.loc[idx]['true_answers']) + print() + print("Generated Answer:") + print(detailed_best_sas_df.loc[idx]['predicted_answers']) + print() + print(f"Context Relevance : {detailed_best_sas_df.loc[idx]['context_relevance']}") + print(f"Faithfulness : {detailed_best_sas_df.loc[idx]['faithfulness']}") + print(f"Semantic Similarity: {detailed_best_sas_df.loc[idx]['sas']}") +``` + +Letā€™s look at the query question 6: + +```python +inspect(6) +``` + +``` +Question: +How does BERT's performance on the GLUE benchmark compare to previous state-of-the-art models? + +True Answer: +BERT achieved new state-of-the-art on the GLUE benchmark (80.5%), surpassing the previous best models. + +Generated Answer: +BERT's performance on the GLUE benchmark significantly outperforms previous state-of-the-art models, achieving 4.5% and 7.0% respective average accuracy improvement over the prior state of the art. + +Context Relevance : 1.0 +Faithfulness : 1.0 +Semantic Similarity: 0.9051246047019958 + +Contexts: +recent work in this area. +Since its release, GLUE has been used as a testbed and showcase by the developers of several +inļ¬‚uential models, including GPT (Radford et al., 2018) and BERT (Devlin et al., 2019). As shown +in Figure 1, progress on GLUE since its release has been striking. On GLUE, GPT and BERT +achieved scores of 72.8 and 80.2 respectively, relative to 66.5 for an ELMo-based model (Peters +et al., 2018) and 63.7 for the strongest baseline with no multitask learning or pretraining above the +word level. Recent models (Liu et al., 2019d; Yang et al., 2019) have clearly surpassed estimates of +non-expert human performance on GLUE (Nangia and Bowman, 2019). 
The success of these models +on GLUE has been driven by ever-increasing model capacity, compute power, and data quantity, as +well as innovations in +--------- +56.0 75.1 +BERT BASE 84.6/83.4 71.2 90.5 93.5 52.1 85.8 88.9 66.4 79.6 +BERT LARGE 86.7/85.9 72.1 92.7 94.9 60.5 86.5 89.3 70.1 82.1 +Table 1: GLUE Test results, scored by the evaluation server ( https://gluebenchmark.com/leaderboard ). +The number below each task denotes the number of training examples. The ā€œAverageā€ column is slightly different +than the ofļ¬cial GLUE score, since we exclude the problematic WNLI set.8BERT and OpenAI GPT are single- +model, single task. F1 scores are reported for QQP and MRPC, Spearman correlations are reported for STS-B, and +accuracy scores are reported for the other tasks. We exclude entries that use BERT as one of their components. +We use a batch size of 32 and ļ¬ne-tune for 3 +epochs over the data for all GLUE tasks. For each +task, we selected the best ļ¬ne-tuning learning rate +(among 5e-5, +--------- +4e-5, 3e-5, and 2e-5) on the Dev set. +Additionally, for BERT LARGE we found that ļ¬ne- +tuning was sometimes unstable on small datasets, +so we ran several random restarts and selected the +best model on the Dev set. With random restarts, +we use the same pre-trained checkpoint but per- +form different ļ¬ne-tuning data shufļ¬‚ing and clas- +siļ¬er layer initialization.9 +Results are presented in Table 1. Both +BERT BASE and BERT LARGE outperform all sys- +tems on all tasks by a substantial margin, obtaining +4.5% and 7.0% respective average accuracy im- +provement over the prior state of the art. Note that +BERT BASE and OpenAI GPT are nearly identical +in terms of model architecture apart from the at- +tention masking. For the largest and most widely +reported GLUE task, MNLI, BERT obtains a 4.6% +absolute accuracy improvement. On the ofļ¬cial +GLUE leaderboard10, BERT LARGE obtains a score +of +--------- +``` + +In this example, the context relevancy and faithfulness scores are both 1.0. This indicates that the context is relevant to the question and our RAG LLM used this context to generate a semantically similar answer to the correct (ground-truth) answer. + +Letā€™s take a look at another example: + +```python +inspect(44) +``` + +``` +Question: +How should future language model benchmarks be structured to ensure a holistic assessment of models' capabilities and knowledge breadth? + +True Answer: +Future benchmarks should integrate a broader spectrum of subjects and cognitive skills, emphasizing the inclusion of tasks that test models' ethical reasoning, understanding of human values, and ability to perform complex problem-solving, beyond the mere scale of data and parameters. + +Generated Answer: +Future language model benchmarks should be structured to measure knowledge acquired during pretraining by evaluating models exclusively in zero-shot and few-shot settings across a diverse set of subjects that humans learn. The benchmark should cover a wide range of subjects across STEM, humanities, social sciences, and more, ranging in difficulty from elementary to advanced professional levels. It should test both world knowledge and problem-solving ability, ensuring a holistic assessment of models' capabilities and knowledge breadth. + +Context Relevance : 0.6 +Faithfulness : 1.0 +Semantic Similarity: 0.6483339071273804 + +Contexts: +learning model +usage should be developed for guiding users to learn ā€˜Dosā€™ +and Dontā€™ in AI. 
Detailed policies could also be proposed +to list all userā€™s responsibilities before the model access. +C. Language Models Beyond ChatGPT +The examination of ethical implications associated with +language models necessitates a comprehensive examina- +tion of the broader challenges that arise within the domainof language models, in light of recent advancements in +the field of artificial intelligence. The last decade has seen +a rapid evolution of AI techniques, characterized by an +exponential increase in the size and complexity of AI +models, and a concomitant scale-up of model parameters. +The scaling laws that govern the development of language +models,asdocumentedinrecentliterature[84,85],suggest +thatwecanexpecttoencounterevenmoreexpansivemod- +els that incorporate multiple modalities in the near future. +Efforts to integrate multiple modalities into a single model +are driven by the ultimate goal of realizing the concept of +foundation models [86]. +--------- +language models are +at learning and applying knowledge from many domains. +To bridge the gap between the wide-ranging knowledge that models see during pretraining and the +existing measures of success, we introduce a new benchmark for assessing models across a diverse +set of subjects that humans learn. We design the benchmark to measure knowledge acquired during +pretraining by evaluating models exclusively in zero-shot and few-shot settings. This makes the +benchmark more challenging and more similar to how we evaluate humans. The benchmark covers +57subjects across STEM, the humanities, the social sciences, and more. It ranges in difļ¬culty from +an elementary level to an advanced professional level, and it tests both world knowledge and problem +solving ability. Subjects range from traditional areas, such as mathematics and history, to more +1arXiv:2009.03300v3 [cs.CY] 12 Jan 2021Published as a conference paper at +--------- +a +lack of access to the benefits of these models for people +who speak different languages and can lead to biased or +unfairpredictionsaboutthosegroups[14,15].Toovercome +this, it is crucial to ensure that the training data contains +a substantial proportion of diverse, high-quality corpora +from various languages and cultures. +b) Robustness: Another major ethical consideration +in the design and implementation of language models is +their robustness. Robustness refers to a modelā€™s ability +to maintain its performance when given input that is +semantically or syntactically different from the input it +was trained on. +Semantic Perturbation: Semantic perturbation is a type +of input that can cause a language model to fail [40, 41]. +This input has different syntax but is semantically similar +to the input used for training the model. To address this, +it is crucial to ensure that the training data is diverse and +representative of the population it will +--------- +``` + +It seems that for this question, the content is not completely relevant (Context Relevance = 0.6) and only the second context was used to generate the answer. + +## Running your own experiments + +If you want to run this experiment yourself, follow the Python code [`evaluation_aragog.py`](https://github.com/deepset-ai/haystack-evaluation/blob/main/evaluations/evaluation_aragog.py) in the [haystack-evaluation](https://github.com/deepset-ai/haystack-evaluation) repository. 
+ +Start by cloning the repository + +```bash +git clone https://github.com/deepset-ai/haystack-evaluation +cd haystack-evaluation +cd evaluations +``` + +Next, run the Python script: + +```bash +usage: evaluation_aragog.py [-h] --output_dir OUTPUT_DIR [--sample SAMPLE] +``` + +You can specify the output directory to hold the results and the sample size, i.e.: how many questions to use for the evaluation. + +> Donā€™t forget to define your Open AI API key in the environmental variable `OPENAI_API_KEY` +> + +```bash + OPENAI_API_KEY= python evaluation_aragog.py --output-dir experiment_a --sample 10 +``` + +## Execution Time and Costs + +> NOTE: all the numbers reported were run on an Mac Book Pro Apple M3 Pro with 36GB of RAM with Haystack 2.2.1 and Python 3.9 +> + +### Indexing + +The Indexing pipeline needs to consider the parameter combinations defined below: + +- 3 different values for `embedding_model` +- 3 different `chunk_size` values + +Therefore, the index **runs 9 times in total.** + +### RAG Pipeline + +The RAG pipeline needs to run 27 times, since the following parameters affect the retrieval process: + +- 3 different values for `embedding_model` +- 3 different `top_k` values +- 3 different `chunk_size` values + +This needs to run for each of the 107 questions, so in total, the **RAG pipeline will run 2.889 times** (3 x 3 x 3 x 107) and produce **2889 calls to OpenAI API**. + +### Evaluation Pipeline + +The Evaluation pipeline also runs 27 times since all parameter combinations need to be evaluated for each of the 107 questions. Note, however, that the Evaluation pipeline contains two Evaluators that rely on an LLM through OpenAI API, so this pipeline **runs 2.889 times**. However, due to the Faithfulness and ContextRelevance evaluators, it will produce **5.778 (2 x 2.889) calls to OpenAI API**. + +You can see the detailed running times for each parameter combination in the [Benchmark Times Spreadsheet](https://docs.google.com/spreadsheets/d/1LTogSuZuzCVNDGBl7Jk5XjmaPYnBSWumaiOwn0WCOfc/edit?usp=sharing). + +### Pricing + +For detailed pricing information, visit [OpenAI Pricing](https://openai.com/api/pricing/) šŸ’ø + +## Lessons Learned + +In this article, we have shown how to use the Haystack [Evaluators](https://docs.haystack.deepset.ai/docs/evaluators) to find the best combination of parameters that yield the best performance of our RAG pipeline, as opposed to using only the default parameters. + +For this ARAGOG dataset, in particular, the best performance is achieved using the `msmarco-distilroberta-base-v2` embeddings model instead of the default model (`sentence-transformers/all-mpnet-base-v2`), together with a `top_k=3` and a `chunk_size=128`. + +**A few learnings are important to take:** + +- When using an LLM through an external API, it is important to **account for potential network errors or other issues**. Ensure that during your experiments, running the questions through the RAG pipeline or evaluating the results doesnā€™t crash due to an error, for instance, by wrapping the call within a `try/except` code block. +- Before starting your experiments, **estimate the costs and time involved**. If you plan to use an external LLM through an API, calculate approximately how many API calls you will need to run queries through your RAG pipeline and evaluate the results if you use LLM-based evaluators. This will help you understand the total costs and time required for your experiments. 
+- Depending on your dataset size and running time, **Python notebooks might not be the best approach to run your experiments**; a Python script is probably a more reliable solution. +- **Beware of which parameters affect which components**. For instance, for indexing, only the `embedding_model` and the `chunk_size` are important - this can reduce the number of experiments you need to carry out. + +Explore a variety of evaluation examples tailored to different use cases and datasets by visiting the [haystack-evaluation](https://github.com/deepset-ai/haystack-evaluation) repository on GitHub. diff --git a/content/blog/benchmarking-haystack-pipelines/thumbnail.png b/content/blog/benchmarking-haystack-pipelines/thumbnail.png new file mode 100644 index 00000000..43c2bf8d Binary files /dev/null and b/content/blog/benchmarking-haystack-pipelines/thumbnail.png differ diff --git a/content/blog/business-intelligence-sql-queries-llm/index.md b/content/blog/business-intelligence-sql-queries-llm/index.md index bbe0dcde..fa6a6007 100644 --- a/content/blog/business-intelligence-sql-queries-llm/index.md +++ b/content/blog/business-intelligence-sql-queries-llm/index.md @@ -203,7 +203,7 @@ Another factor that comes into play here is that databases themselves can be amb ## Demo -In addition to our dataset and benchmarks, we also published a [demo](https://cloud.deepset.ai/shared_prototypes?share_token=prototype_eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MDY5NjQ3NzMuMjI3LCJhdWQiOiJleHRlcm5hbCB1c2VyIiwiaXNzIjoiZEMiLCJ3b3Jrc3BhY2VfaWQiOiIwNTdjOWI5MC1jNzQzLTRlOTEtYjI3OS02ZWEwNTcwMThlYjIiLCJ3b3Jrc3BhY2VfbmFtZSI6InN0YWNrb3ZlcmZsb3ctc3VydmV5Iiwib3JnYW5pemF0aW9uX2lkIjoiNGM2MTkwMGYtMTBiNi00MDljLTkzNjQtMGE2NzlhY2NjMWM5Iiwic2hhcmVfaWQiOiI3NzQ0OGMxZS00Mzc3LTRiNDAtYjU3Ni1iNDRhN2VlM2E5MjUifQ.MdTRh2Wu0ziHmwRu6kEvKONS0a5w5Qcr8ChQlmMlu54) of our best-performing text-to-SQL approach to share with colleagues, friends, and now you, our readers. It serves to validate whether our accuracy results are representative of a real-world scenario. We found that user feedback was quite close to our results. The demo is hosted on deepset's [enterprise platform, deepset Cloud](https://www.deepset.ai/deepset-cloud), which creates an out-of-the-box user interface. Here's what it looks like: +In addition to our dataset and benchmarks, we also published a [demo](https://cloud.deepset.ai/shared_prototypes?share_token=prototype_eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NDIwMjgyMjUuOTkzLCJhdWQiOiJleHRlcm5hbCB1c2VyIiwiaXNzIjoiZEMiLCJ3b3Jrc3BhY2VfaWQiOiIwNTdjOWI5MC1jNzQzLTRlOTEtYjI3OS02ZWEwNTcwMThlYjIiLCJ3b3Jrc3BhY2VfbmFtZSI6InN0YWNrb3ZlcmZsb3ctc3VydmV5Iiwib3JnYW5pemF0aW9uX2lkIjoiNGM2MTkwMGYtMTBiNi00MDljLTkzNjQtMGE2NzlhY2NjMWM5Iiwic2hhcmVfaWQiOiJhOTEyYzllYS0yNTIxLTRkMzctYTM3ZS02ZDVkYWI2ZDQ1MjUifQ.pULl5Oud-piPgHx3ff5lovH0xZ3KjrEjObfjEiZje-0) of our best-performing text-to-SQL approach to share with colleagues, friends, and now you, our readers. It serves to validate whether our accuracy results are representative of a real-world scenario. We found that user feedback was quite close to our results. The demo is hosted on deepset's [enterprise platform, deepset Cloud](https://www.deepset.ai/deepset-cloud), which creates an out-of-the-box user interface. 
Here's what it looks like: !["Screenshot of deepset Cloud search interface."](dc.png) diff --git a/content/blog/extracting-metadata-filter/index.md b/content/blog/extracting-metadata-filter/index.md new file mode 100644 index 00000000..31d257ed --- /dev/null +++ b/content/blog/extracting-metadata-filter/index.md @@ -0,0 +1,266 @@ +--- +layout: blog-post +title: "Advanced Retrieval: Extract Metadata from Queries to Improve Retrieval" +description: Use LLMs to extract metadata from queries to use as filters that improve retrieval in RAG applications. +featured_image: thumbnail.png +alt_image: A colorful cartoon-style digital illustration of a V60 coffee filter displayed on a computer screen standing in front of an orange background. There are papers going into the filter. +images: ["blog/extracting-metadata-filter/thumbnail.png"] +toc: True +date: 2024-05-13 +last_updated: 2024-05-13 +authors: + - David Batista + - Bilge Yucel +tags: ["Retrieval", "RAG", "Advanced Use Cases"] +cookbook: extracting_metadata_filters_from_a_user_query.ipynb +--- + +> This is part one of the **Advanced Use Cases** series: +> +> 1ļøāƒ£ Extract Metadata from Queries to Improve Retrieval +> +> 2ļøāƒ£ Automatic Metadata Enrichment šŸ”œ +> +> 3ļøāƒ£ Query Decomposition šŸ”œ +> +> 4ļøāƒ£ Query Expansion šŸ”œ + + +In Retrieval-Augmented Generation (RAG) applications, the retrieval step, which provides relevant context to your large language model (LLM), is vital for generating high-quality responses. There are possible ways of improving retrieval and **metadata filtering** is one of the easiest ways. [Metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering), the approach of limiting the search space based on some concrete metadata, can really enhance the quality of the retrieved documents. Here are some advantages of using metadata filtering: + +1. **Relevance**: Metadata filtering narrows down the information being retrieved. This ensures that the generated responses align with the specific query or topic. +2. **Accuracy**: Filtering based on metadata such as domain, source, date, or topic guarantees that the information used for generation is accurate and trustworthy. This is particularly important for applications where accuracy is paramount. For instance, if you need information about a specific year, using the year as a metadata filter will retrieve only pertinent data. +3. **Efficiency**: Eliminating irrelevant or low-quality information boosts the efficiency of your RAG application, reduces the amount of processing needed, and speeds up retrieval response times. + +You have two options for applying the metadata filter: you can either specify it directly when running the pipeline or, you can extract it from the query itself. In this article, we'll focus on extracting filters from a query to improve the quality of generated responses in RAG applications. Let's get started. + +## Introduction to Metadata Filters + +First things first, what is metadata? Metadata (or meta tag) is actually data about your data, used to categorize, sort, and filter information based on various attributes such as date, topic, source, or any other information that you find relevant. After incorporating meta information into your data, you can apply filters to queries used with [Retrievers](https://docs.haystack.deepset.ai/docs/retrievers) to limit the scope of your search based on this metadata and ensure that your answers come from a specific slice of your data. 
+ +Imagine that you have following Documents in your document store: + +```python +documents = [ + Document( + content="Some text about revenue increase", + meta={"year": 2022, "company": "Nvidia", "name":"A"}), + Document( + content="Some text about revenue increase", + meta={"year": 2023, "company": "Nvidia", "name":"B"}), + Document( + content="Some text about revenue increase", + meta={"year": 2022, "company": "BMW", "name":"C"}), + Document( + content="Some text about revenue increase", + meta={"year": 2023, "company": "BMW", "name":"D"}), + Document( + content="Some text about revenue increase", + meta={"year": 2022, "company": "Mercedes", "name":"E"}), + Document( + content="Some text about revenue increase", + meta={"year": 2023, "company": "Mercedes", "name":"F"}), +] +``` + +When the query is ā€œ_Causes of the revenue increase_ā€, the retriever returns all documents as they all contain some information about revenue. However, the metadata filter below ensures that any returned document by the retriever has a value ofĀ `2022`Ā in theĀ `year`Ā metadata field and eitherĀ `BMW`Ā orĀ `Mercedes`Ā in theĀ `company`Ā metadata field. So, only documents with name ā€œ**C**ā€ and ā€œ**E**ā€ are retrieved. + +```python +pipeline.run( + data={ + "retriever":{ + "query": "Causes of the revenue increase", + "filters": { + "operators": "AND", + "conditions": [ + {"field": "meta.year", "operator": "==", "value": "2022"}, + {"field": "meta.company", "operator": "in", "value": ["BMW", "Mercedes"]} + ] + } + } + } +) +``` + +In this example, we pass the filter explicitly, but sometimes, the query itself might contain information that can be used as a metadata filter during the querying process. In this case, we need to *preprocess* the query to extract filters before we use it with a retriever. + +## Extracting Metadata Filters from a Query + +In LLM-based applications, queries are written in natural language. From time to time, they include valuable hints that can be used as metadata filters to improve the retrieval. We can extract these hints, formulate them as metadata filters and use them with the retriever alongside the query. For instance, when the query is ā€œ*What was the revenue of Nvidia in 2022?*ā€, we can extract `2022` as `years` and `Nvidia` as `companies`. Based on this information, formulated metadata filter to use with a retriever should look like: + +```python +"filters": { + "operators": "AND", + "conditions": [ + {"field": "meta.years", "operator": "==", "value": "2022"}, + {"field": "meta.companies", "operator": "==", "value": "Nvidia"} + ] +} +``` + +Thankfully, LLMs are highly capable of extracting structured information from unstructured text. Letā€™s see step-by-step how we can implement a custom component that uses an LLM to extract keywords, phrases, or entities from the query and formulate the metadata filter. + +## Implementing `QueryMetadataExtractor` + +> šŸ§‘ā€šŸ³ You can find and run all the code in our cookbook [Extrating Metadata Filter from a Query](https://github.com/deepset-ai/haystack-cookbook/blob/main/notebooks/extracting_metadata_filters_from_a_user_query.ipynb) + +We start by creating a [custom component](https://docs.haystack.deepset.ai/docs/custom-components), `QueryMetadataExtractor`, which takes `query` and `metadata_fields` as inputs and outputs `filters`. 
This component encapsulates a generative pipeline, made up of [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder) and [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator). The pipeline instructs the LLM to extract keywords, phrases, or entities from a given query which can then be used as metadata filters. In the prompt, we include instructions to ensure the output format is in JSON and provide `metadata_fields` along with the `query` to ensure the correct entities are extracted from the query. + +Once the pipeline is initialized in the `init` method of the component, we post-process the LLM output in the `run` method. This step ensures the extracted metadata is correctly formatted to be used as a metadata filter. + +```python +import json +from typing import Dict, List + +from haystack import Pipeline, component +from haystack.components.builders import PromptBuilder +from haystack.components.generators import OpenAIGenerator + +@component() +class QueryMetadataExtractor: + + def __init__(self): + prompt = """ + You are part of an information system that processes users queries. + Given a user query you extract information from it that matches a given list of metadata fields. + The information to be extracted from the query must match the semantics associated with the given metadata fields. + The information that you extracted from the query will then be used as filters to narrow down the search space + when querying an index. + Just include the value of the extracted metadata without including the name of the metadata field. + The extracted information in 'Extracted metadata' must be returned as a valid JSON structure. + ### + Example 1: + Query: "What was the revenue of Nvidia in 2022?" + Metadata fields: {"company", "year"} + Extracted metadata fields: {"company": "nvidia", "year": 2022} + ### + Example 2: + Query: "What were the most influential publications in 2023 regarding Alzheimer's disease?" + Metadata fields: {"disease", "year"} + Extracted metadata fields: {"disease": "Alzheimer", "year": 2023} + ### + Example 3: + Query: "{{query}}" + Metadata fields: "{{metadata_fields}}" + Extracted metadata fields: + """ + self.pipeline = Pipeline() + self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt)) + self.pipeline.add_component(name="llm", instance=OpenAIGenerator(model="gpt-3.5-turbo")) + self.pipeline.connect("builder", "llm") + + @component.output_types(filters=Dict[str, str]) + def run(self, query: str, metadata_fields: List[str]): + result = self.pipeline.run({'builder': {'query': query, 'metadata_fields': metadata_fields}}) + metadata = json.loads(result['llm']['replies'][0]) + + # this can be done with specific data structures and in a more sophisticated way + filters = [] + for key, value in metadata.items(): + field = f"meta.{key}" + filters.append({f"field": field, "operator": "==", "value": value}) + + return {"filters": {"operator": "AND", "conditions": filters}} +``` + +First, let's test the `QueryMetadataExtractor` in isolation, passing a query and a list of metadata fields. + +```python +extractor = QueryMetadataExtractor() + +query = "What were the most influential publications in 2022 regarding Parkinson's disease?" 
+metadata_fields = {"disease", "year"} + +result = extractor.run(query, metadata_fields) +print(result) +``` + +The result should look like this: + +```bash +{'filters': {'operator': 'AND', + 'conditions': [ + {'field': 'meta.disease', 'operator': '==', 'value': 'Alzheimers'}, + {'field': 'meta.year', 'operator': '==', 'value': 2023} + ]} +} +``` + +Notice that the `QueryMetadataExtractor` has extracted the metadata fields from the query and returned them in a format that can be used as filters passed directly to a `Retriever`. By default, the `QueryMetadataExtractor` will use all metadata fields as conditions together with an `AND` operator. + +## Using `QueryMetadataExtractor` in a Pipeline + +Now, let's plug the `QueryMetadataExtractor` into a `Pipeline` with a `Retriever` connected to a `DocumentStore` to see how it works in practice. + +We start by creating a [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore) and adding some documents to it. We include info about ā€œyearā€ and ā€œdiseaseā€ in the ā€œmetaā€ field of each document. + +```python +from haystack import Document +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.document_stores.types import DuplicatePolicy + +documents = [ + Document( + content="some publication about Alzheimer prevention research done over 2023 patients study", + meta={"year": 2022, "disease": "Alzheimer", "author": "Michael Butter"}), + Document( + content="some text about investigation and treatment of Alzheimer disease", + meta={"year": 2023, "disease": "Alzheimer", "author": "John Bread"}), + Document( + content="A study on the effectiveness of new therapies for Parkinson's disease", + meta={"year": 2022, "disease": "Parkinson", "author": "Alice Smith"} + ), + Document( + content="An overview of the latest research on the genetics of Parkinson's disease and its implications for treatment", + meta={"year": 2023, "disease": "Parkinson", "author": "David Jones"} + ) +] + +document_store = InMemoryDocumentStore(bm25_algorithm="BM25Plus") +document_store.write_documents(documents=documents, policy=DuplicatePolicy.OVERWRITE) +``` + +We then create a pipeline consisting of the `QueryMetadataExtractor` and a [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) connected to the `InMemoryDocumentStore` created above. + +> Learn about connecting components and creating pipelines in [Docs: Creating Pipelines](https://docs.haystack.deepset.ai/docs/creating-pipelines). 
+> + +```python +from haystack import Pipeline, Document +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever + +retrieval_pipeline = Pipeline() +metadata_extractor = QueryMetadataExtractor() +retriever = InMemoryBM25Retriever(document_store=document_store) + +retrieval_pipeline.add_component(instance=metadata_extractor, name="metadata_extractor") +retrieval_pipeline.add_component(instance=retriever, name="retriever") +retrieval_pipeline.connect("metadata_extractor.filters", "retriever.filters") +``` + +Now define a query and metadata fields and pass them to the pipeline: + +```python +query = "publications 2023 Alzheimer's disease" +metadata_fields = {"year", "author", "disease"} + +retrieval_pipeline.run(data={"metadata_extractor": {"query": query, "metadata_fields": metadata_fields}, "retriever":{"query": query}}) +``` + +This returns only documents whose metadata field `year = 2023` and `disease = Alzheimer` + +```python +{'documents': + [Document( + id=e3b0bfd497a9f83397945583e77b293429eb5bdead5680cc8f58dd4337372aa3, + content: 'some text about investigation and treatment of Alzheimer disease', + meta: {'year': 2023, 'disease': 'Alzheimer', 'author': 'John Bread'}, + score: 2.772588722239781)] + } +``` + +## Conclusion + +Metadata filtering stands out as a powerful technique for improving the relevance and accuracy of retrieved documents, thus enabling the generation of high-quality responses in RAG applications. Using the custom component `QueryMetadataExtractor` we implemented, we can extract filters from user queries and directly use them with Retrievers. + +This article was part one of the **Advanced Use Cases** series. If you want to stay on top of the latest Haystack developments, you canĀ [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates)Ā orĀ [join our Discord community](https://discord.gg/DzJEUKkuHp) šŸ’™ diff --git a/content/blog/extracting-metadata-filter/thumbnail.png b/content/blog/extracting-metadata-filter/thumbnail.png new file mode 100644 index 00000000..6c82d752 Binary files /dev/null and b/content/blog/extracting-metadata-filter/thumbnail.png differ diff --git a/content/blog/haystack-2-release/index.md b/content/blog/haystack-2-release/index.md index 03e95339..25d7768c 100644 --- a/content/blog/haystack-2-release/index.md +++ b/content/blog/haystack-2-release/index.md @@ -151,6 +151,6 @@ And, as always, keep an eye out on our [blog](https://haystack.deepset.ai/blog) Stay up-to-date with Haystack: - [Discord](https://discord.com/invite/VBpFzsgRVF) -- [Subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates?utm-campaign=developer-relations&utm-source=blog&utm-medium=release) +- [Subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) - [Twitter](https://twitter.com/Haystack_AI) - [GitHub](https://github.com/deepset-ai/haystack) \ No newline at end of file diff --git a/content/blog/haystack-nvidia-nim-rag-guide/index.md b/content/blog/haystack-nvidia-nim-rag-guide/index.md new file mode 100644 index 00000000..1ea008ad --- /dev/null +++ b/content/blog/haystack-nvidia-nim-rag-guide/index.md @@ -0,0 +1,629 @@ +--- +layout: single +title: Building RAG Applications with NVIDIA NIM and Haystack on K8s +description: How to self-host and orchestrate NVIDIA NIM for Haystack RAG pipelines in Kubernetes. 
+toc: True
+date: 2024-06-02
+last_updated: 2024-06-02
+authors:
+  - Anshul Jindal
+  - Meriem Bendris
+  - Tuana Celik
+  - Tilde Thurium
+tags: ["Integrations", "Haystack 2.0"]
+---
+
+Retrieval-augmented generation (RAG) systems combine generative AI with information retrieval for contextualized answer generation. Building reliable and performant RAG applications at scale is challenging. In this blog, we show how to use Haystack and NVIDIA NIM to create a RAG solution that is easy to deploy and maintain, standardized, and enterprise-ready. The recipe works in cloud-native environments, on-premise, and even in air-gapped environments.
+
+## About Haystack
+
+[Haystack](https://haystack.deepset.ai/), by [deepset](https://www.deepset.ai/), is an open source framework for building production-ready LLM applications, RAG pipelines and state-of-the-art search systems that work intelligently over large document collections.
+
+![Figure 1 - Haystack Retrieval-augmented generation (RAG) pipeline. ](nvidia-image-1.png#small "_Figure 1 - Haystack Retrieval-augmented generation (RAG) pipeline_")
+
+
+Haystackā€™s [growing ecosystem of community integrations](https://haystack.deepset.ai/integrations) provides tooling for evaluation, monitoring, transcription, data ingestion and more. The [NVIDIA Haystack integration](https://haystack.deepset.ai/integrations/nvidia) allows using NVIDIA models and NIMs in Haystack pipelines, [giving the flexibility to pivot from prototyping in the cloud to deploying on-prem](https://haystack.deepset.ai/blog/haystack-nvidia-integration).
+
+## About NVIDIA NIM
+
+NVIDIA NIM is a collection of containerized microservices designed for optimized inference of state-of-the-art AI models. The container uses a variety of components to serve AI models and exposes them via standard APIs. Models are optimized using [TensorRT](https://developer.nvidia.com/tensorrt) or [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (depending on the type of the model), applying techniques such as quantization, model distribution, optimized kernels/runtimes, and in-flight or continuous batching, with room for further optimization if needed. Learn more about NIM [here](https://developer.nvidia.com/blog/nvidia-nim-offers-optimized-inference-microservices-for-deploying-ai-models-at-scale/).
+
+
+This tutorial shows how to build a Haystack RAG pipeline leveraging NVIDIA NIMs hosted on the [NVIDIA API catalog](https://build.nvidia.com/). Then, we provide instructions on deploying NIMs on your infrastructure in a Kubernetes environment for self-hosting [AI foundation models](https://www.nvidia.com/en-us/ai-data-science/foundation-models/?_gl=1*3m0pk5*_gcl_au*ODg0NTI0MDQ3LjE3MTczMTI1MDE.). Note that hosting NIMs requires an [NVIDIA AI Enterprise license](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/?_gl=1*1crq8g6*_gcl_au*NzMwODYxMzc1LjE3MTczMTIxMzg.).
+
+
+## Build a Haystack RAG Pipeline with NVIDIA NIMs hosted on the NVIDIA API Catalog
+
+For RAG pipelines, Haystack provides 3 components that can be connected with NVIDIA NIM:
+- [NvidiaGenerator](https://docs.haystack.deepset.ai/docs/nvidiagenerator): Text generation with LLM NIM.
+- [NvidiaDocumentEmbedder](https://docs.haystack.deepset.ai/docs/nvidiadocumentembedder): Document embedding with [NVIDIA NeMo Retriever Embedding NIM](https://build.nvidia.com/nvidia/embed-qa-4).
+- [NvidiaTextEmbedder](https://docs.haystack.deepset.ai/docs/nvidiatextembedder): Query embedding with NVIDIA NeMo Retriever Embedding NIM.
+
+![Figure 2 - Haystack Indexing and RAG pipeline with NVIDIA NIMs](nvidia-image-2.png#small "_Figure 2 - Haystack Indexing and RAG pipelines with NVIDIA NIMs_")
+
+For this section, we have provided scripts and instructions for building a RAG pipeline leveraging NIMs hosted on the [NVIDIA API catalog](https://build.nvidia.com/) as part of the [GitHub repository](https://github.com/deepset-ai/nvidia-haystack). We also provide a [Jupyter Notebook](https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/rag-with-nims.ipynb) for building the same RAG pipeline using NIMs deployed on your infrastructure in a Kubernetes environment.
+
+## Vectorize Documents with Haystack Indexing Pipelines
+
+Our indexing pipeline implementation is available in the [indexing tutorial](https://github.com/deepset-ai/nvidia-haystack/blob/main/indexing.py). Haystack provides several [preprocessing](https://docs.haystack.deepset.ai/docs/preprocessors) components for document cleaning and splitting, [embedders](https://docs.haystack.deepset.ai/docs/embedders), as well as [converters](https://docs.haystack.deepset.ai/docs/converters) for extracting data from files in different formats. In this tutorial, we will store PDF files in a `QdrantDocumentStore`. `NvidiaDocumentEmbedder` is used to connect with NIMs hosted on the [NVIDIA API catalog](https://build.nvidia.com/). Below is an example of how to initialize the embedder component with the [`snowflake/arctic-embed-l`](https://build.nvidia.com/snowflake/arctic-embed-l) NIM hosted on the NVIDIA API catalog.
+
+```python
+from haystack.utils.auth import Secret
+from haystack_integrations.components.embedders.nvidia import NvidiaDocumentEmbedder
+
+
+embedder = NvidiaDocumentEmbedder(model="snowflake/arctic-embed-l",
+                                  api_url="https://ai.api.nvidia.com/v1/retrieval/snowflake/arctic-embed-l",
+                                  batch_size=1)
+```
+
+## Creating the Haystack RAG Pipeline
+
+In our example, we will create a simple question/answering RAG pipeline using both the NVIDIA NeMo Retriever Embedding NIM and the LLM NIM. For this pipeline, we use the `NvidiaTextEmbedder` to embed the query for retrieval, and the `NvidiaGenerator` to generate a response. The example below shows how to instantiate the generator using the [`meta/llama3-70b-instruct`](https://build.nvidia.com/meta/llama3-70b) LLM NIM hosted on the NVIDIA API catalog.
+
+```python
+generator = NvidiaGenerator(
+    model="meta/llama3-70b-instruct",
+    api_url="https://integrate.api.nvidia.com/v1",
+    model_arguments={
+        "max_tokens": 1024
+    }
+)
+```
+
+
+We use Haystack pipelines to connect various components of this RAG pipeline, including query embedders and LLM generators.
Below is an example of a RAG pipeline: + +```python +from haystack import Pipeline +from haystack.utils.auth import Secret +from haystack.components.builders import PromptBuilder +from haystack_integrations.components.embedders.nvidia import NvidiaTextEmbedder +from haystack_integrations.components.generators.nvidia import NvidiaGenerator +from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever +from haystack_integrations.document_stores.qdrant import QdrantDocumentStore + +document_store = QdrantDocumentStore(embedding_dim=1024, host="qdrant") + +embedder = NvidiaTextEmbedder(model="snowflake/arctic-embed-l", + api_key=Secret.from_env_var("NVIDIA_EMBEDDINGS_KEY"), + api_url="https://ai.api.nvidia.com/v1/retrieval/snowflake/arctic-embed-l") + +retriever = QdrantEmbeddingRetriever(document_store=document_store) + +prompt = """Answer the question given the context. +Question: {{ query }} +Context: +{% for document in documents %} + {{ document.content }} +{% endfor %} +Answer:""" +prompt_builder = PromptBuilder(template=prompt) + +generator = NvidiaGenerator( + model="meta/llama3-70b-instruct", + api_url="https://integrate.api.nvidia.com/v1", + model_arguments={ + "max_tokens": 1024 + } +) + +rag = Pipeline() +rag.add_component("embedder", embedder) +rag.add_component("retriever", retriever) +rag.add_component("prompt", prompt_builder) +rag.add_component("generator", generator) + +rag.connect("embedder.embedding", "retriever.query_embedding") +rag.connect("retriever.documents", "prompt.documents") +rag.connect("prompt", "generator") +``` + +## Indexing Files and Deploying the Haystack RAG Pipeline + +[Hayhooks](https://docs.haystack.deepset.ai/docs/hayhooks) allows the deployment of RAG pipelines in a containerized environment. In our example, we have provided a [docker-compose file](https://github.com/deepset-ai/nvidia-haystack/blob/main/docker-compose.yml) to setup both the Qdrant database, and the RAG pipeline. As we are leveraging NIMs hosted on the [NVIDIA API catalog](https://build.nvidia.com/), we need to set the API keys for the NIMs in the `.env` file. The instructions below expect `NVIDIA_API_KEY` (for `NvidiaGenerator`) and `NVIDIA_EMBEDDINGS_KEY` (for `NvidiaDocumentEmbedder` and `NvidiaTextEmbedder`). + +Executing `docker-compose up` will launch `3` containers: **qdrant**, **hayhooks** and **qdrant-setup** (which will run our indexing pipeline and stop). The Qdrant database will be deployed on the localhost and exposed at port `6333`. The Qdrant dashboard allows users to inspect the vectorized documents at [localhost:6333/dashboard](localhost:6333/dashboard). + +### Serializing Pipelines + +Haystack pipelines defined in Python can be serialized to YAML by calling `dump()` on the pipeline object, as shown in our [RAG pipeline tutorial](https://github.com/deepset-ai/nvidia-haystack/blob/77cc316193e718de51b8a56e756749604b8032e9/rag.py#L44C1-L45C16). The [YAML](https://github.com/deepset-ai/nvidia-haystack/blob/main/rag.yaml) definition is as follows: + +```yaml +components: + embedder: + ... + type: haystack_integrations.components.embedders.nvidia.text_embedder.NvidiaTextEmbedder + generator: + init_parameters: + api_key: + ... 
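+      # Note: Secret-type values such as api_key are serialized as environment-variable
+      # references (for example, NVIDIA_API_KEY), not as literal key material.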
+ type: haystack_integrations.components.generators.nvidia.generator.NvidiaGenerator + prompt: + init_parameters: + template: "Answer the question given the context.\nQuestion: {{ query }}\nContext:\n\ + {% for document in documents %}\n {{ document.content }}\n{% endfor %}\n\ + Answer:" + type: haystack.components.builders.prompt_builder.PromptBuilder + retriever: + init_parameters: + document_store: + init_parameters: + ... + type: haystack_integrations.document_stores.qdrant.document_store.QdrantDocumentStore + ... + type: haystack_integrations.components.retrievers.qdrant.retriever.QdrantEmbeddingRetriever + +connections: +- receiver: retriever.query_embedding + sender: embedder.embedding +- receiver: prompt.documents + sender: retriever.documents +- receiver: generator.prompt + sender: prompt.prompt +max_loops_allowed: 100 +metadata: {} +``` + +### Deploy the RAG Pipeline + +To deploy the RAG pipeline, execute `hayhooks deploy rag.yaml` which will expose the pipeline on [http://localhost:1416/rag](http://localhost:1416/rag) by default. You can then visit [http://localhost:1416/docs](http://localhost:1416/docs) for the API docs and try out the pipeline. + +![](nvidia-image-3.png) + +![Figure 3 - API Doc UI interface for trying out the RAG Pipeline ](nvidia-image-4.png#small "_Figure 3 - API Doc UI interface for trying out the RAG Pipeline_") + +For production, Haystack provides Helm charts and [instructions](https://docs.haystack.deepset.ai/docs/kubernetes) to create services running Hayhooks with a container orchestrator like Kubernetes. + +In the next sections, we will show how to deploy, monitor and autoscale NIMs on your infrastructure in a Kubernetes environment for self-hosting [AI foundation models](https://www.nvidia.com/en-us/ai-data-science/foundation-models/?_gl=1*3m0pk5*_gcl_au*ODg0NTI0MDQ3LjE3MTczMTI1MDE.). Finally, we will provide instructions on how to use them in the Haystack RAG pipeline. + +## Self-hosting NVIDIA NIMs on a Kubernetes cluster + +### Kubernetes Cluster Environment + +In this tutorial, the setup environment consists of a DGX H100 with 8 H100 GPUs each having 80GB of memory as host and with Ubuntu as the operating system. Docker is used as the container runtime. Kubernetes is deployed on it using [Minikube](https://minikube.sigs.k8s.io/). To enable GPU utilization in Kubernetes, we install essential NVIDIA software components using the [GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html). + +### NVIDIA NIMs Deployment + +As part of this setup, we deploy following NIMs into the Kubernetes cluster using Helm charts: +- The LLM NIM, which uses the model [`llama3-8b-instruct`](https://build.nvidia.com/meta/llama3-8b) +- The NVIDIA NeMo Retriever Embedding NIM, which uses the model [`NV-Embed-QA`](https://build.nvidia.com/nvidia/embed-qa-4) + +The LLM NIM Helm chart is on [GitHub](https://github.com/NVIDIA/nim-deploy), while the NVIDIA NeMo Retriever Embedding NIM Helm chart is in the NGC private registry, requiring Early Access ([apply for Early Access](https://developer.nvidia.com/nemo-microservices)). Figure 4 illustrates the deployment of NIMs on a Kubernetes cluster running on a DGX H100. The GPU Operator components are deployed via its Helm chart and are part of the GPU Operator stack. Prometheus and Grafana are deployed via Helm charts for monitoring the Kubernetes cluster and the NIM. 
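+For reference, the Prometheus and Grafana stack can be installed with the community `kube-prometheus-stack` Helm chart. The snippet below is a minimal, illustrative sketch (the release name and the `prometheus` namespace are placeholders); the Monitoring section later in this post links to the GPU-telemetry instructions we actually follow.
+
+```bash
+# Illustrative only: install kube-prometheus-stack (Prometheus + Grafana)
+# into a dedicated "prometheus" namespace; adjust names to your environment.
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
+  --namespace prometheus --create-namespace
+```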
+
+![Figure 4 - NVIDIA NIMs and other components deployment on a Kubernetes cluster ](nvidia-image-5.png#small "_Figure 4 - NVIDIA NIMs and other components deployment on a Kubernetes cluster_")
+
+The LLM NIM Helm chart contains the LLM NIM container, which runs within a pod and references the model via [Persistent Volume](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) (PV) and [Persistent Volume Claim](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims) (PVC). The LLM NIM pods are autoscaled using the [Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) (HPA) based on custom metrics and are exposed via a Kubernetes [ClusterIP](https://kubernetes.io/docs/concepts/services-networking/service/#type-clusterip) service. To access the LLM NIM, we deploy an [ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) and expose it at the `/llm` endpoint.
+
+Similarly, the NeMo Retriever Embedding NIM Helm chart includes the Retriever Embedding NIM container, which runs within a pod and references the model on the host via PV and PVC. The NeMo Retriever Embedding NIM pods are also autoscaled via HPA and are exposed via a Kubernetes ClusterIP service. To access the NeMo Retriever Embedding NIM, we deploy an ingress and expose it at the `/embedding` endpoint.
+
+Users and other applications can access the exposed NIMs via the ingress. The vector database Qdrant is deployed using this [helm chart](https://qdrant.tech/documentation/guides/installation/#kubernetes).
+
+Now, let's take a closer look at the deployment process for each NIM:
+
+### LLM NIM deployment
+
+1. Create the namespace if it does not exist yet:
+
+```kubectl create namespace nim-llm```
+
+2. Add a Docker registry secret that will be used for pulling NIM containers from NGC, replacing `<NGC_API_KEY>` with your API key from NGC. Follow this [link](https://docs.nvidia.com/ngc/gpu-cloud/ngc-user-guide/index.html#generating-api-key) for generating an API key in NGC.
+
+```
+kubectl create secret -n nim-llm docker-registry nvcrimagepullsecret \
+    --docker-server=nvcr.io \
+    --docker-username='$oauthtoken' --docker-password=<NGC_API_KEY>
+```
+
+3. Create a generic secret `ngc-api`, which is used to pull the model within the NIM container.
+
+```
+kubectl create secret -n nim-llm generic ngc-api \
+    --from-literal=NGC_CLI_API_KEY=<NGC_API_KEY>
+```
+
+4. Create a `nim-llm-values.yaml` file with the below content. Adjust the `repository` and `tag` values depending on your environment.
+
+```yaml
+image:
+  repository: "nvcr.io/nvidia/nim/nim-llm/meta-llama3-8b-instruct" # container image location
+  tag: 24.05 # LLM NIM version you want to deploy
+
+model:
+  ngcAPISecret: ngc-api # name of a secret in the cluster that holds the NGC API key under the key NGC_CLI_API_KEY
+resources:
+  limits:
+    nvidia.com/gpu: 1
+  requests:
+    nvidia.com/gpu: 1
+persistence:
+  enabled: true
+  size: 30Gi
+imagePullSecrets:
+  - name: nvcrimagepullsecret # secret created to pull the nvcr.io image
+```
+
+5. We assume that the helm chart for the LLM NIM is located here: `./nims/helm/nim-llm/`. You can change the command accordingly depending on where the helm chart is located. Deploy the LLM NIM by running the following command:
+
+```helm -n nim-llm install nim-llm ./nims/helm/nim-llm/ -f nim-llm-values.yaml```
+
+6. The deployment takes a few minutes to start the containers, download models, and become ready.
You can monitor the pods with the below command: + +``` +kubectl get pods -n nim-llm +``` + +#### Example Output + +``` +NAME READY STATUS RESTARTS AGE +nim-llm-0 1/1 Running 0 8m21s +``` + +7. Install an [ingress controller](https://kubernetes.io/docs/concepts/services-networking/ingress-controllers/), if it has not been installed already. Then, create a file `ingress-nim-llm.yaml` with the below content to create the ingress for the LLM NIM. Make sure to change the host (here nims.example.com) with your fully qualified domain name. + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: nim-llm-ingress + namespace: nim-llm + annotations: + nginx.ingress.kubernetes.io/use-regex: "true" + nginx.ingress.kubernetes.io/rewrite-target: /$2 +spec: + rules: + - host: nims.example.com + http: + paths: + - path: /llm(/|$)(.*) + pathType: ImplementationSpecific + backend: + service: + name: nim-llm + port: + number: 8000 +``` + +Deploy the ingress with the below command: + +``` +kubectl apply -f ingress-nim-llm.yaml +``` +8. Access the exposed service by making a curl request for testing (replace `nims.example.com` with you own fully qualified domain name) + +```bash +curl -X 'POST' 'http://nims.example.com/llm/v1/chat/completions' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "messages": [ + { + "content": "You are a polite and respectful chatbot helping people plan a vacation.", + "role": "system" + }, + { + "content": "What shall i do in France in one line?", + "role": "user" + } + ], + "model": "meta-llama3-8b-instruct", + "temperature": 0.5, + "max_tokens": 1024, + "top_p": 1, + "stream": false +}' +``` + +#### Example output: + +```json +{"id":"cmpl-0027fdbe808747e987c444d1f86b0543","object":"chat.completion","created":1716325880,"model":"meta-llama3-8b-instruct","choices":[{"index":0,"message":{"role":"assistant","content":"In France, you can stroll along the Seine River in Paris, visit the iconic Eiffel Tower, indulge in croissants and cheese, and explore the charming streets of Montmartre, or head to the French Riviera for a luxurious getaway."},"logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":{"prompt_tokens":39,"total_tokens":92,"completion_tokens":53}} +``` + +Now, we have the LLM NIM up and running. + +### NeMo Retriever Embedding NIM deployment + +The deployment of the NeMo Retriever Embedding NIM is similar to the LLM NIM. + +1. Follow steps 1 - 3 as LLM NIM deployment but replace namespace with `nim-embedding` in the commands. + +2. Create `nim-embedding-values.yaml` file with the below content. Adjust following: + - `ngcModel.org` : The ID of the organization where the model is located in NGC. + - `ngcModel.path` : Replace `` with the ID of the organization and `` with the team name under the organization where the model is located. + - `image.repository` and `image.tag` values depending on your environment. + +```yaml +ngcModel: + directoryName: nv-embed-qa_v4 + org: + path: //nv-embed-qa:4 + template: NV-Embed-QA_template.yaml + name: NV-Embed-QA-4.nemo + +replicaCount: 1 + +image: + repository: nvcr.io/nvidia/nim/nemo-retriever-embedding-microservice + tag: "24.04" + +imagePullSecrets: + - name: nvcrimagepullsecret + +envVars: + - name: TRANSFORMERS_CACHE + value: /scratch/.cache + +modelStorage: + class: "" + size: 10Gi + +service: + type: ClusterIP + port: 8080 +``` + +3. We assume that the helm chart for the NeMo Retriever Embedding NIM is located here: `./nims/helm/nim-embedding/`. 
You can change the command accordingly depending on where the helm chart is located. Deploy the NeMo Retriever Embedding NIM by running the following commands:
+
+```
+helm dependency build ./nims/helm/nim-embedding/
+
+helm -n nim-embedding install nim-embedding ./nims/helm/nim-embedding/ -f nim-embedding-values.yaml
+```
+
+4. The deployment takes a few minutes to start the containers, download models, and become ready. You can monitor the pods with the below command:
+
+```
+kubectl get pods -n nim-embedding
+```
+
+#### Example Output
+
+```
+NAME                                     READY   STATUS    RESTARTS   AGE
+nim-embedding-nemo-embedding-ms-d58c..   1/1     Running   0          87m
+```
+
+5. Create a file `ingress-nim-embedding.yaml` similar to the LLM NIM ingress with service name `nim-embedding-nemo-embedding-ms`, port `8080`, and path `/embedding(/|$)(.*)`. Afterwards, deploy the ingress.
+
+6. Access the exposed service by making a curl request for testing (replace `nims.example.com` below with your fully qualified domain name).
+
+```bash
+curl -X 'GET' \
+  'http://nims.example.com/embedding/v1/models' \
+  -H 'accept: application/json'
+```
+
+#### Example output:
+
+```json
+{"object":"list","data":[{"id":"NV-Embed-QA","created":0,"object":"model","owned_by":"organization-owner"}]}
+```
+
+Now, we have the NeMo Retriever Embedding NIM up and running.
+
+Once the above procedure is completed, you will have the API endpoints of the LLM NIM and the NeMo Retriever Embedding NIM.
+
+## Operational Considerations
+
+Monitoring and autoscaling are essential for deployed NIMs to ensure efficient, effective, and reliable operation. Monitoring tracks performance metrics, detects errors, and optimizes resource utilization, while autoscaling dynamically adjusts resources to match changing workloads, ensuring the NIMs can handle sudden spikes or dips in demand. This enables the NIMs to provide accurate and timely responses, even under heavy load, while optimizing costs and maintaining high availability. In this section, we will delve into the details of deploying monitoring and enabling autoscaling for NIMs.
+
+### Monitoring
+
+NVIDIA NIM metrics are collected with the open-source tool [Prometheus](https://prometheus.io/) and visualized with [Grafana](https://grafana.com/) dashboards. NVIDIA [dcgm-exporter](https://docs.nvidia.com/datacenter/cloud-native/kubernetes/dcgme2e.html#gpu-telemetry) is the preferred tool to collect GPU telemetry. We follow the instructions from [here](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/latest/kube-prometheus.html) for the deployment of Prometheus and Grafana.
+
+#### Visualizing NVIDIA NIM Metrics
+
+By default, NVIDIA NIM metrics are exposed at [http://localhost:8000/metrics](http://localhost:8000/metrics) by the NIM container. All the exposed metrics are listed [here](https://docs.nvidia.com/nim/large-language-models/latest/observability.html). Using a Prometheus ServiceMonitor, they can be published to Prometheus and viewed in the Grafana dashboard. The [Prometheus ServiceMonitor](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/user-guides/running-exporters.md#the-goal-of-servicemonitors) is used to define the applications to scrape metrics from within the Kubernetes cluster.
+
+
+1. Create a file `service-monitor-nim-llm.yaml` with the below content. We currently only configure it to scrape metrics from the LLM NIM, but it can be extended to other NIMs as well.
+ +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: nim-llm-sm + namespace: nim-llm +spec: + endpoints: + - interval: 30s + targetPort: 8000 + path: /metrics + namespaceSelector: + matchNames: + - nim-llm + selector: + matchLabels: + app.kubernetes.io/name: nim-llm +``` + +2. Create a Prometheus ServiceMonitor using the below command: + +``` +kubectl apply -f service-monitor-nim-llm.yaml +``` + +In the Prometheus UI under `Status -> Targets`, you will see the below ServiceMonitor once itā€™s deployed. + +![Figure 5 - Prometheus UI showing the deployed ServiceMonitor ](nvidia-image-6.png#small "_Figure 5 - Prometheus UI showing the deployed ServiceMonitor_") + + +3. Letā€™s check some inference metrics on the Prometheus UI. Figure 6 shows the stacked graph for `request_success_total` NIM metric. + +![Figure 6 - Prometheus UI showing the plot of request_success_total metric indicating number of finished requests. ](nvidia-image-7.png#small "_Figure 6 - Prometheus UI showing the plot of request_success_total metric indicating number of finished requests_") + + +### Autoscaling NVIDIA NIM + +In this tutorial, we use the [Kubernetes Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) - HPA to adjust the scaling of the NIM pods. We have defined custom metrics to monitor the average GPU usage of each NVIDIA NIM and used by the Horizontal Pod Autoscaler (HPA) to dynamically adjust the number of NIM pods. See the metrics definition below: + +- **nim_llm_gpu_avg** : `avg by (kubernetes_node, pod, namespace, gpu) (DCGM_FI_DEV_GPU_UTIL{pod=~"nim-llm-.*"})` +- **nim_embedding_gpu_avg** : `avg by (kubernetes_node, pod, namespace, gpu) (DCGM_FI_DEV_GPU_UTIL{pod=~"nim-emedding-.*"})` + +The average GPU usage metric is used as an example and must be adjusted to the specific application environment. + +Letā€™s deploy the HPA. + +1. Create a file with the name `prometheus_rule_nims.yaml` with the below content to create the Prometheus rules for the above custom metric. Adjust the labels (app, other Prometheus labels) according to the current deployed Prometheus instance. + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app: kube-prometheus-stack + app.kubernetes.io/instance: kube-prometheus-stack-1710254997 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kube-prometheus-stack + app.kubernetes.io/version: 56.8.2 + chart: kube-prometheus-stack-56.8.2 + heritage: Helm + release: kube-prometheus-stack-1710254997 + name: kube-prometheus-stack-1709-gpu.rules + namespace: prometheus +spec: + groups: + - name: gpu.rules + rules: + - expr: avg by (kubernetes_node, pod, namespace, gpu) (DCGM_FI_DEV_GPU_UTIL{pod=~"nim-llm-.*"}) + record: nim_llm_gpu_avg + - expr: avg by (kubernetes_node, pod, namespace, gpu) (DCGM_FI_DEV_GPU_UTIL{pod=~"nim-embedding-.*"}) + record: nim_embedding_gpu_avg +``` + +2. Create custom Prometheus recording rules by running the below command: + +``` +kubectl apply -f prometheus_rule_nims.yaml +``` + +3. In Prometheus UI, under `Status -> Rules`, you can see the above two created rules as shown in Figure 7. + +![Figure 7 - Prometheus rules tab showing the created custom rules to record GPU usage by NVIDIA NIM. ](nvidia-image-8.png#small "_Figure 7 - Prometheus rules tab showing the created custom rules to record GPU usage by NVIDIA NIM_") + +4. 
Install [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) to query the custom metrics based on the custom recording rules created above and register them to the custom metrics API for HPA to fetch. Replace in below command `` with the name of the Prometheus service in Kubernetes. + +``` +helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter --set prometheus.url="http://.prometheus.svc.cluster.local" +``` + +5. Query the custom metrics API to see if the metrics have been registered using the below command: + +``` +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq -r . | grep llms +``` + +#### Example Output: + +``` +"name": "namespaces/nim_embedding_gpu_avg", +"name": "pods/nim_embedding_gpu_avg", +"name": "pods/nim_llm_gpu_avg", +"name": "namespaces/nim_llm_gpu_avg", +``` + +6. A separate HPA definition is created for the two NVIDIA NIM. Within this definition, we specify the minimum and maximum number of replicas, the metric to monitor, and the target value for that metric. Below is the definition for the LLM NIM HPA and you can create the similar for the NeMo Retriever Embedding NIM using `nim_embedding_gpu_avg` metric. + +LLM NIM HPA file: + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: nim-llm-hpa + namespace: nim-llm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: StatefulSet + name: nim-llm + minReplicas: 1 + maxReplicas: 4 + metrics: + - type: Pods + pods: + metric: + name: nim_llm_gpu_avg + target: + type: AverageValue + averageValue: 30 + +``` + +7. Create the two HPAs using the below commands: + +``` +kubectl apply -f hpa_nim_llm.yaml +kubectl apply -f hpa_nim_embedding.yaml +``` + +8. Check the status of HPAs: + +```kubectl get hpa -A``` + +#### Example Output: + +``` +NAMESPACE NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE +nim-embedding nim-embedding-hpa Deployment/nim-embedding-nemo-embedding-ms 0/30 1 4 1 94s +nim-llm nim-llm-hpa StatefulSet/nim-llm 0/30 1 4 1 94s +``` + +9. Send some requests to LLM NIM and see the LLM NIM pod getting scaled as shown below: + +``` +NAME READY STATUS RESTARTS AGE +nim-llm-0 1/1 Running 0 3h47m +nim-llm-1 1/1 Running 0 3m30s +``` + +Also, Figure 8 shows the Prometheus graph showing the scaling of LLM NIM. + +![Figure 8 - Prometheus graph showing the scaling of LLM NIM. ](nvidia-image-9.png#small "_Figure 8 - Prometheus graph showing the scaling of LLM NIM._") + +We have now deployed NIMs on your infrastructure in a scalable fashion. We can now use them in the RAG pipeline. The next section provides the details for the same. + +## Use Self-hosted NVIDIA NIMs in the RAG Pipeline + +This section provides instructions to use previously deployed NIMs on your infrastructure in a Kubernetes cluster for `NvidiaTextEmbedder`, `NvidiaDocumentEmbedder` and `NvidiaGenerator` in the Haystack RAG pipeline, replacing `` with the endpoint of the NeMo Retriever Embedding NIM and `` with the LLM NIM. The provided [notebook](https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/rag-with-nims.ipynb) in the repository has examples of how to use the self-hosted NIMs. 
+ +*NvidiaDocumentEmbedder*: +```python +embedder = NvidiaDocumentEmbedder( + model=embedding_nim_model, + api_url="http:///v1" +) +``` + +*NvidiaTextEmbedder*: +```python +# initialize NvidiaTextEmbedder with the self-hosted NeMo Retriever Embedding NIM URL +embedder = NvidiaTextEmbedder( + model=embedding_nim_model, + api_url="http:///v1" +) +``` + + +*NvidiaGenerator*: +```python +# initialize NvidiaGenerator with the self-hosted LLM NIM URL +generator = NvidiaGenerator( + model=llm_nim_model_name, + api_url="http:///v1", + model_arguments={ + "temperature": 0.5, + "top_p": 0.7, + "max_tokens": 2048, + }, +) +``` + +## Summary + +In this blog, we provide a comprehensive walkthrough for building robust and scalable RAG applications using Haystack and NVIDIA NIMs. We cover building the RAG pipeline by leveraging NIMs hosted on the [NVIDIA API catalog](https://build.nvidia.com/) and also using self-hosted NIMs deployed on your infrastructure in a Kubernetes environment. Our step-by-step instructions detail how to deploy NIMs in a Kubernetes cluster, monitor their performance, and scale them as needed. + +By leveraging proven deployment patterns, our architecture ensures a responsive user experience and predictable query times, even in the face of high or bursty user queries and document indexing workloads. Moreover, our deployment recipe is flexible, allowing for easy implementation in cloud, on-premise, or air-gapped environments. With this guide, we aim to provide a resource for anyone looking to build reliable and performant RAG applications at scale. \ No newline at end of file diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-1.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-1.png new file mode 100644 index 00000000..afa790b5 Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-1.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-2.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-2.png new file mode 100644 index 00000000..dbdfb992 Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-2.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-3.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-3.png new file mode 100644 index 00000000..d0dc821a Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-3.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-4.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-4.png new file mode 100644 index 00000000..043ef6fc Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-4.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-5.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-5.png new file mode 100644 index 00000000..4f5e547b Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-5.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-6.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-6.png new file mode 100644 index 00000000..0c5a9eee Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-6.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-7.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-7.png new file mode 100644 index 00000000..462cace3 Binary files /dev/null and 
b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-7.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-8.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-8.png new file mode 100644 index 00000000..96075bbe Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-8.png differ diff --git a/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-9.png b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-9.png new file mode 100644 index 00000000..fe514536 Binary files /dev/null and b/content/blog/haystack-nvidia-nim-rag-guide/nvidia-image-9.png differ diff --git a/content/blog/highlights-of-2023/index.md b/content/blog/highlights-of-2023/index.md index 440cf77e..1468d6ce 100644 --- a/content/blog/highlights-of-2023/index.md +++ b/content/blog/highlights-of-2023/index.md @@ -14,7 +14,7 @@ authors: tags: ["Open Source", "Haystack 2.0", "Community"] --- -> *This post was originally shared through the Haystack newsletter. [Subscribe now](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=highlights_of_2023) to stay in the loop on all things Haystack, including the latest updates, new features, captivating content, and upcoming exciting events!* šŸ—žļø +> *This post was originally shared through the Haystack newsletter. [Subscribe now](https://landing.deepset.ai/haystack-community-updates) to stay in the loop on all things Haystack, including the latest updates, new features, captivating content, and upcoming exciting events!* šŸ—žļø What a fantastic year it has been! In 2023, the world of AI saw tremendous progress, making it an exciting time for learning and growth. As we wind down and approach the new year, it's a good moment to take a breather, look back on the past year, and appreciate the highlights before gearing up for what's ahead. We've taken some time to review the noteworthy moments that made 2023 special for Haystack and its community. @@ -92,4 +92,4 @@ deepset, the company behind Haystack, announced a **$30 million funding round** - Dive into the world of Haystack 2.0-beta with our latest discussion entry, designed to be your go-to resource until the stable release of Haystack 2.0. Find the roadmap, release notes, and continually updated feature table in [the ultimate guide for Haystack 2.0-beta](https://github.com/deepset-ai/haystack/discussions/6684). - We have just introduced a new repository: [Haystack Cookbook](https://github.com/deepset-ai/haystack-cookbook). Stay tuned as we expand this collection with even more example notebooks throughout 2024. -To stay informed about everything related to Haystack, such as the latest updates, new features, captivating content, and upcoming exciting events, be sure to [subscribe to the monthly Haystack newsletter](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=highlights_of_2023)! šŸ—žļø \ No newline at end of file +To stay informed about everything related to Haystack, such as the latest updates, new features, captivating content, and upcoming exciting events, be sure to [subscribe to the monthly Haystack newsletter](https://landing.deepset.ai/haystack-community-updates)! 
šŸ—žļø \ No newline at end of file diff --git a/content/blog/langfuse-integration/index.md b/content/blog/langfuse-integration/index.md new file mode 100644 index 00000000..bd4211eb --- /dev/null +++ b/content/blog/langfuse-integration/index.md @@ -0,0 +1,230 @@ +--- +layout: blog-post +title: 'Monitor and trace your Haystack pipelines with Langfuse' +description: With the new Langfuse integration, it's easier than ever to have visibility into how your pipelines are performing. +featured_image: thumbnail.png +images: ["blog/langfuse-integration/thumbnail.png"] +alt_image: An image of a robot tracing a self-portrait, with the logos for Haystack and Langfuse overlaid on top. +toc: True +date: 2024-05-17 +last_updated: 2024-05-17 +authors: + - Tilde Thurium +tags: ["Monitoring", "Evaluation", "Tracing"] +--- + +Getting your LLM application into production is a huge milestone, but that's only the beginning. It's critical to monitor how your pipeline is performing in the real world so you can keep improving performance and cost, and proactively address any issues that might arise. + +With the new [Haystack Langfuse integration](https://haystack.deepset.ai/integrations/langfuse), it's now easier than ever to have visibility into your pipelines. In this post, we'll explain more about Langfuse, and demonstrate how to trace an end to end request to a Haystack pipeline. + +### What is Langfuse? + +Langfuse is an open source LLM engineering platform. It offers a ton of features to help you understand how your LLM application is performing under the hood. + +### Langfuse features and benefits +- Track model usage and cost +- Collect user feedback +- Identify low-quality outputs +- Build fine-tuning and testing datasets +- Open source šŸ’™ +- Self-hosted version available +- Frequent releases with new features and improvements +- as of the time of this writing, free to try out šŸ¤‘ + +## Getting started + +In order to use this integration, you'll need to [sign up for a Langfuse account](https://langfuse.com/). See [the Langfuse docs for the most up-to-date information](https://langfuse.com/docs) about features and pricing. + +### Prerequisites + +First, [sign up for an account on the Langfuse website](https://langfuse.com/). + +On the Langfuse dashboard, make a note of your `LANGFUSE_SECRET_KEY` and `LANGFUSE_PUBLIC_KEY`. Set them as environment variables. While you're at it, set the `HAYSTACK_CONTENT_TRACING_ENABLED` environment variable to `true` in order to enable Haystack tracing in your pipeline. + +The following code examples also require an `OPENAI_API_KEY` environment variable to be set. Haystack is model-agnostic and you can use [any model provider we support](https://docs.haystack.deepset.ai/docs/generators), by changing the generator in the code samples below. + +### Installation + +To install the integration run the following command in your terminal: +```bash +pip install langfuse-haystack +``` +To use Langfuse in a pipeline you'll need a few additional dependencies: + +```bash +pip install sentence-transformers datasets +``` + +## Use Langfuse in a RAG pipeline + +First, import all the modules you'll need. 
+```python +from datasets import load_dataset +from haystack import Document, Pipeline +from haystack.components.builders import PromptBuilder +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.generators import OpenAIGenerator +from haystack.components.retrievers import InMemoryEmbeddingRetriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack_integrations.components.connectors.langfuse import LangfuseConnector +``` + +Next, write a function that takes a `DocumentStore` and returns a Haystack RAG pipeline. Add the [`LangfuseConnector`](https://docs.haystack.deepset.ai/docs/langfuseconnector) to your pipeline, but don't connect it to any other component in the pipeline. +```python +def get_pipeline(document_store: InMemoryDocumentStore): + retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=2) + + template = """ + Given the following information, answer the question. + Context: + {% for document in documents %} + {{ document.content }} + {% endfor %} + Question: {{question}} + Answer: + """ + + prompt_builder = PromptBuilder(template=template) + + basic_rag_pipeline = Pipeline() + # Add components to your pipeline + basic_rag_pipeline.add_component("tracer", LangfuseConnector("Basic RAG Pipeline")) + basic_rag_pipeline.add_component( + "text_embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") + ) + basic_rag_pipeline.add_component("retriever", retriever) + basic_rag_pipeline.add_component("prompt_builder", prompt_builder) + basic_rag_pipeline.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo", generation_kwargs={"n": 2})) + + # Now, connect the components to each other + # NOTE: the tracer component doesn't need to be connected to anything in order to work + basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + basic_rag_pipeline.connect("retriever", "prompt_builder.documents") + basic_rag_pipeline.connect("prompt_builder", "llm") + + return basic_rag_pipeline +``` + +Now, instantiate the pipeline using an `InMemoryDocumentStore` to keep things simple. Generate some embeddings based on the [7 wonders of the world dataset](https://huggingface.co/datasets/bilgeyucel/seven-wonders), and populate them into our document store. If you were running this code in production, you'd probably want to use an indexing pipeline to load the data into the store, but for demo purposes this approach reduces complexity. + +```python +document_store = InMemoryDocumentStore() +dataset = load_dataset("bilgeyucel/seven-wonders", split="train") +embedder = SentenceTransformersDocumentEmbedder("sentence-transformers/all-MiniLM-L6-v2") +embedder.warm_up() +docs_with_embeddings = embedder.run([Document(**ds) for ds in dataset]).get("documents") or [] # type: ignore +document_store.write_documents(docs_with_embeddings) +``` +Run the pipeline and ask it a question. +```python +pipeline = get_pipeline(document_store) +question = "What does Rhodes Statue look like?" +response = pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}}) +``` +Setting the `HAYSTACK_CONTENT_TRACING_ENABLED` environment variable automatically traces every request that the pipeline runs. 
If all goes well you should receive something like the following output: +```python +# {'tracer': {'name': 'Basic RAG Pipeline', 'trace_url': 'https://cloud.langfuse.com/trace/3d52b8cc-87b6-4977-8927-5e9f3ff5b1cb'}, 'llm': {'replies': ['The Rhodes Statue was described as being about 105 feet tall, with iron tie bars and brass plates forming the skin. It was built on a white marble pedestal near the Rhodes harbour entrance. The statue was filled with stone blocks as construction progressed.', 'The Rhodes Statue was described as being about 32 meters (105 feet) tall, built with iron tie bars, brass plates for skin, and filled with stone blocks. It stood on a 15-meter-high white marble pedestal near the Rhodes harbor entrance.'], 'meta': [{'model': 'gpt-3.5-turbo-0125', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 100, 'prompt_tokens': 453, 'total_tokens': 553}}, {'model': 'gpt-3.5-turbo-0125', 'index': 1, 'finish_reason': 'stop', 'usage': {'completion_tokens': 100, 'prompt_tokens': 453, 'total_tokens': 553}}]}} +``` +Dumping tracing output in the terminal, is pretty cool, but the integration also sends the info to Langfuse. The Langfuse dashboard has a much more comprehensive and beautiful UI so you can make sense of your pipeline. Let's hop over there and take a look. + +## Use Langfuse in a RAG pipeline with chat +Agent and chat use cases are rising in popularity. If you wanted to use the integration to trace a pipeline that includes a chat generator component, here's an example of how to do so. +```python +from haystack import Pipeline +from haystack.components.builders import DynamicChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack_integrations.components.connectors.langfuse import LangfuseConnector + +pipe = Pipeline() +pipe.add_component("tracer", LangfuseConnector("Chat example")) +pipe.add_component("prompt_builder", DynamicChatPromptBuilder()) +pipe.add_component("llm", OpenAIChatGenerator(model="gpt-3.5-turbo")) + +pipe.connect("prompt_builder.prompt", "llm.messages") +messages = [ + ChatMessage.from_system("Always respond in German even if some input data is in other languages."), + ChatMessage.from_user("Tell me about {{location}}"), +] + +response = pipe.run( + data={"prompt_builder": {"template_variables": {"location": "Berlin"}, "prompt_source": messages}} +) +print(response["llm"]["replies"][0]) +print(response["tracer"]["trace_url"]) +# ChatMessage(content='Berlin ist die Hauptstadt von Deutschland und zugleich eines der bekanntesten kulturellen Zentren Europas. Die Stadt hat eine faszinierende Geschichte, die bis in die Zeiten des Zweiten Weltkriegs und des Kalten Krieges zurĆ¼ckreicht. Heute ist Berlin fĆ¼r seine vielfƤltige Kunst- und Musikszene, seine historischen StƤtten wie das Brandenburger Tor und die Berliner Mauer sowie seine lebendige Street-Food-Kultur bekannt. Berlin ist auch fĆ¼r seine grĆ¼nen Parks und Seen beliebt, die den Bewohnern und Besuchern Raum fĆ¼r Erholung bieten.', role=, name=None, meta={'model': 'gpt-3.5-turbo-0125', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 137, 'prompt_tokens': 29, 'total_tokens': 166}}) +# https://cloud.langfuse.com/trace/YOUR_UNIQUE_IDENTIFYING_STRING +``` + +## Explore the Langfuse dashboard +Once youā€™ve run these code samples, [head over to the Langfuse dashboard](https://langfuse.com/docs/demo) to see and interact with traces. 
As of the time of this writing, the demo is free to try. + +![Screenshot of the Langfuse dashboard showing Traces, Scores, Model Cost, Model Usage.](langfuse-tracing-dashboard.png) + +### Trace Detail + +Trace details show cost and latency for a specific end-to-end request. This data is helpful for estinating usage and cost of a RAG application in production. For example, here is the trace detail for the text embedder step of the pipeline we just ran. For [a comprehensive explanation of LLM tracing, see the Langfuse docs](https://langfuse.com/docs/tracing). + +![Screenshot of the Langfuse dashboard showing a trace detail including inputs, outputs, and metadata for the text_embedder component of a Haystack RAG pipeline.](langfuse-embedder-span.png) + +The right sidebar shows latency for every step of the pipeline, which helps to pinpoint performance bottlenecks. + +Trace details that are tagged "generation" also show the monetary cost of the request. + +![Screenshot of the Langfuse dashboard showing a trace detail including inputs, outputs, cost, and metadata for the generator component of a Haystack RAG pipeline.](langfuse-generation-span.png) + +Traces can also be segmented by user or by session, so you can more granularly understand the user's journey. + +### Evaluation +Evaluation helps us understand the quality of the results the LLM application is returning to the end user. There are currently 4 ways to add scores into Langfuse: +- Manual evaluation +- User feedback +- Model-based evaluation +- Custom via SDKs/API + +For the sake of time, this post will only cover manual evaluation, but [see the Langfuse docs for comprehensive info on all the evaluation methods](https://langfuse.com/docs/scores/overview). + +Clicking on a trace, you can manually add a score to note the quality of that specific request. + +For this trace, the `input` shows us our prompt, interpolated with the actual context that was passed to the LLM. Cool! + +``` +Input: + + Given the following information, answer the question. + Context: + + Within it, too, are to be seen large masses of rock, by the weight of which the artist steadied it while erecting it.[22][23] +Destruction of the remains[edit] +The ultimate fate of the remains of the statue is uncertain. Rhodes has two serious earthquakes per century, owing to its location on the seismically unstable Hellenic Arc. Pausanias tells us, writing ca. 174, how the city was so devastated by an earthquake that the Sibyl oracle foretelling its destruction was considered fulfilled.[24] This means the statue could not have survived for long if it was ever repaired. By the 4th century Rhodes was Christianized, meaning any further maintenance or rebuilding, if there ever was any before, on an ancient pagan statue is unlikely. The metal would have likely been used for coins and maybe also tools by the time of the Arab wars, especially during earlier conflicts such as the Sassanian wars.[9] +The onset of Islamic naval incursions against the Byzantine empire gave rise to a dramatic account of what became of the Colossus. + + Construction[edit] +Timeline and map of the Seven Wonders of the Ancient World, including the Colossus of Rhodes +Construction began in 292Ā BC. Ancient accounts, which differ to some degree, describe the structure as being built with iron tie bars to which brass plates were fixed to form the skin. 
The interior of the structure, which stood on a 15-metre-high (49-foot) white marble pedestal near the Rhodes harbour entrance, was then filled with stone blocks as construction progressed.[14] Other sources place the Colossus on a breakwater in the harbour. According to most contemporary descriptions, the statue itself was about 70 cubits, or 32 metres (105 feet) tall.[15] Much of the iron and bronze was reforged from the various weapons Demetrius's army left behind, and the abandoned second siege tower may have been used for scaffolding around the lower levels during construction. + + + Question: What does Rhodes Statue look like? + Answer: + +``` +``` +Output: +The Rhodes Statue was described as being about 105 feet tall, with iron tie bars and brass plates forming the skin. It was built on a white marble pedestal near the Rhodes harbour entrance. The statue was filled with stone blocks as construction progressed." +1: "The Rhodes Statue was described as being about 32 meters (105 feet) tall, built with iron tie bars, brass plates for skin, and filled with stone blocks. It stood on a 15-meter-high white marble pedestal near the Rhodes harbor entrance." +] +``` +This seems like a decent quality response, based on the inputs and outputs. Click on the "Add score" button and give it a score of 1. The score is even editable, in case you make a mistake. + +Now clicking on the "Scores" section, the score we added is visible. Over time, this data helps build a comprehensive picture of the quality of our LLM application. + +![Screenshot of the Langfuse dashboard showing a manually added score for the Haystack demo RAG pipeline.](langfuse-score.png) + +## Wrapping it up + +If you've been following along, today you've learned: +- How Langfuse can help give you better visibility into your Haystack pipelines, giving you confidence to ship into production +- How to integrate Langfuse into Haystack RAG and chat pipelines +- The basics of LLM tracing and evaluation with the Langfuse dashboard + +For a small team, Langfuse ships new features with incredible velocity. We can't wait to see what they build next. To stay in the loop for future updates, be sure to follow [Langfuse](https://x.com/langfuse) and [Haystack](https://twitter.com/haystack_ai) on Twitter. Thanks for reading! 
\ No newline at end of file diff --git a/content/blog/langfuse-integration/langfuse-embedder-span.png b/content/blog/langfuse-integration/langfuse-embedder-span.png new file mode 100644 index 00000000..428cb0f8 Binary files /dev/null and b/content/blog/langfuse-integration/langfuse-embedder-span.png differ diff --git a/content/blog/langfuse-integration/langfuse-generation-span.png b/content/blog/langfuse-integration/langfuse-generation-span.png new file mode 100644 index 00000000..80f26d89 Binary files /dev/null and b/content/blog/langfuse-integration/langfuse-generation-span.png differ diff --git a/content/blog/langfuse-integration/langfuse-score.png b/content/blog/langfuse-integration/langfuse-score.png new file mode 100644 index 00000000..2465c97e Binary files /dev/null and b/content/blog/langfuse-integration/langfuse-score.png differ diff --git a/content/blog/langfuse-integration/langfuse-tracing-dashboard.png b/content/blog/langfuse-integration/langfuse-tracing-dashboard.png new file mode 100644 index 00000000..d2b2c054 Binary files /dev/null and b/content/blog/langfuse-integration/langfuse-tracing-dashboard.png differ diff --git a/content/blog/langfuse-integration/thumbnail.png b/content/blog/langfuse-integration/thumbnail.png new file mode 100644 index 00000000..9835aa32 Binary files /dev/null and b/content/blog/langfuse-integration/thumbnail.png differ diff --git a/content/blog/level-up-rag-with-speaker-diarization/index.md b/content/blog/level-up-rag-with-speaker-diarization/index.md index 989bed53..8e44d252 100644 --- a/content/blog/level-up-rag-with-speaker-diarization/index.md +++ b/content/blog/level-up-rag-with-speaker-diarization/index.md @@ -182,11 +182,11 @@ Next, it is time to set up the retrieval augmentation (RAG) pipeline for speaker * [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder): To create an embedding for the user query using sentence-transformers models * [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever): to retrieve `top_k` relevant documents to the user query * [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder): to provide a RAG prompt template with instructions to be filled with retrieved documents and the user query -* [`HuggingFaceTGIGenerator`](https://docs.haystack.deepset.ai/docs/huggingfacetgigenerator): to infer models served through Hugging Face free Inference API or Hugging Face TGI +* [`HuggingFaceAPIGenerator`](https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator): to infer models served through Hugging Face free Serverless Inference API or Hugging Face TGI ```python from haystack.components.builders.prompt_builder import PromptBuilder -from haystack.components.generators import HuggingFaceTGIGenerator +from haystack.components.generators import HuggingFaceAPIGenerator from haystack.components.embedders import SentenceTransformersTextEmbedder from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever from haystack.utils import ComponentDevice @@ -206,7 +206,10 @@ GPT4 Correct Assistant: retriever = InMemoryEmbeddingRetriever(speaker_document_store) text_embedder = SentenceTransformersTextEmbedder(device=ComponentDevice.from_str("cuda:0")) -answer_generator = HuggingFaceTGIGenerator("openchat/openchat-3.5-0106", generation_kwargs={"max_new_tokens":500}) +answer_generator = HuggingFaceAPIGenerator( + api_type="serverless_inference_api", + api_params={"model": "openchat/openchat-3.5-0106"}, + 
generation_kwargs={"max_new_tokens":500}) prompt_builder = PromptBuilder(template=open_chat_prompt) ``` @@ -244,6 +247,6 @@ result["llm"]["replies"][0] Thanks for reading! By combining the transcription capabilities of AssemblyAI with the power of Haystack, you can enhance your RAG systems with speaker labels, ensuring a more comprehensive and accurate understanding of the content. -If you want to stay on top of the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=assembly-ai&utm_medium=article) or [join our Discord community](https://discord.com/invite/haystack). Donā€™t forget to [get your free API key](https://www.assemblyai.com/) from AssemblyAI and [subscribe to AssemblyAIā€™s YouTube channel](https://www.youtube.com/@AssemblyAI) for weekly videos and tutorials on the latest developments in the AI world. +If you want to stay on top of the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) or [join our Discord community](https://discord.com/invite/haystack). Donā€™t forget to [get your free API key](https://www.assemblyai.com/) from AssemblyAI and [subscribe to AssemblyAIā€™s YouTube channel](https://www.youtube.com/@AssemblyAI) for weekly videos and tutorials on the latest developments in the AI world. diff --git a/content/blog/optimizing-retrieval-with-hyde/index.md b/content/blog/optimizing-retrieval-with-hyde/index.md index 28035ca8..e85bc098 100644 --- a/content/blog/optimizing-retrieval-with-hyde/index.md +++ b/content/blog/optimizing-retrieval-with-hyde/index.md @@ -263,4 +263,4 @@ retrieval_pipeline.run(data={"query_embedder": {"query": query}, "retriever": {" ## Wrapping it up -If youā€™ve gotten this far, you know how to use the HyDE technique and how easy it is to incorporate it into Haystack. To learn more about Haystack, [join us on Discord](https://discord.gg/QMP5jgMH) or [sign up for our monthly newsletter](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=hyde-notebook). +If youā€™ve gotten this far, you know how to use the HyDE technique and how easy it is to incorporate it into Haystack. To learn more about Haystack, [join us on Discord](https://discord.gg/QMP5jgMH) or [sign up for our monthly newsletter](https://landing.deepset.ai/haystack-community-updates). diff --git a/content/blog/pdf-qa-application-with-bedrock/index.md b/content/blog/pdf-qa-application-with-bedrock/index.md index 471b9c41..ac25039a 100644 --- a/content/blog/pdf-qa-application-with-bedrock/index.md +++ b/content/blog/pdf-qa-application-with-bedrock/index.md @@ -197,4 +197,4 @@ As the next step, feel free to enhance your document store by indexing additiona For this demonstration, we applied the BM25 method to generate text vectors. If youā€™d like to improve the application further, select an [Embedder](https://docs.haystack.deepset.ai/v2.0/docs/embedders) and generate dense embeddings for each file using a chosen embedding model before incorporating them into the document store. -Thank you for your interest! Stay informed about the latest developments in Haystack by [subscribing to our newsletter](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=using-jina-embeddings-haystack) or [joining our Discord community](https://discord.com/invite/haystack). +Thank you for your interest! 
Stay informed about the latest developments in Haystack by [subscribing to our newsletter](https://landing.deepset.ai/haystack-community-updates) or [joining our Discord community](https://discord.com/invite/haystack). diff --git a/content/blog/rag-evaluation-with-prometheus-2/index.md b/content/blog/rag-evaluation-with-prometheus-2/index.md new file mode 100644 index 00000000..45a0135d --- /dev/null +++ b/content/blog/rag-evaluation-with-prometheus-2/index.md @@ -0,0 +1,327 @@ +--- +layout: blog-post +title: RAG Evaluation with Prometheus 2 +description: "Prometheus 2 is a SOTA open-source model trained specifically for evaluation. Learn about how it works, and how to evaluate RAG pipelines with it." +featured_image: thumbnail.png +images: ["blog/rag-evaluation-with-prometheus-2/thumbnail.png"] +toc: True +date: 2024-06-17 +last_updated: 2024-06-17 +authors: + - Stefano Fiorucci +tags: ["Evaluation"] +cookbook: prometheus2_evaluation.ipynb +--- + +When building real-world applications based on Language Models (such as RAG), evaluation plays an important role. Recently, evaluating generated answers using powerful proprietary Language Models (such as GPT-4) has become popular and correlates well with human judgment, but it comes with its own limitations and challenges. + +Prometheus 2 is a newly released family of open-source models specifically trained to evaluate the output of other Language Models. In this article (and in the related notebook), we will see how to use Prometheus and we will experiment with it to evaluate the generated responses of a RAG Pipeline using Haystack. + +## Language Models as Evaluators + +With the rise of Language Models (LMs) demonstrating strong general capabilities across diverse tasks, evaluating answers generated by these models using other generative LMs has become a common and effective approach. Compared to statistical-based evaluation, this technique is convenient as it usually does not require ground truth labels. + +**Proprietary models** such as GPT-4 or Claude 3 Opus are frequently chosen for evaluation and have shown a good correlation with human judgment. However, relying on closed models has several limitations: + +- data privacy: your data exits your machine and is transmitted to the model provider +- transparency: the training data of these models is unknown +- controllability: as these models are accessed via APIs, their behavior can change over time +- price: despite a constant drop in prices, these large models remain expensive. Additionally, the evaluation process typically involves several cycles of testing and refinement, which can significantly increase the overall expense. + +On the other hand, using **open models** for evaluation is an active research area, but their practical use is often limited. They typically do not correlate well with human judgments and lack flexibility (for more details, see the [Prometheus 2 paper](https://arxiv.org/abs/2405.01535)). + +## šŸ”„ Prometheus 2: a strong open-source model for evaluation + +![](prometheus.png) + +[Prometheus 2](https://arxiv.org/abs/2405.01535) is a new family of open-source models designed to bridge the gap between proprietary models and open LMs for evaluation. + +The authors unified two different evaluation paradigms: direct assessment (evaluating the quality of an answer generated by a single model based on a specific criterion) and pairwise ranking (choosing the best answer between two, usually produced by different models). 
+ +In particular, for each variant, they started from a MistralAI base model, they trained 2 different models (one for each of the mentioned tasks) on open-source datasets and then merged their weights to create a robust evaluator Language Model. + +The results are impressive: + +- two variants: [7B](https://huggingface.co/prometheus-eval/prometheus-7b-v2.0) and [8x7B](https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0), fine-tuned from Mistral-7B and Mixtral8x7B, respectively +- high correlation with human evaluations and proprietary models +- the models are highly flexible: capable of performing direct assessments and pairwise rankings, and allowing custom evaluation criteria definition + +## Prompting Prometheus 2 + +### The prompt template + +Prometheus 2 models are generative language models trained to perform evaluation. To achieve the best results with these models, we need to follow a precise yet customizable prompt structure. You can find the prompt templates in the paper and [on GitHub](https://github.com/prometheus-eval/prometheus-eval/blob/3042bfc6a263a98cc31a7aa4e704e3784e0c1b8b/libs/prometheus-eval/prometheus_eval/prompts.py). + +Since we want to experiment with Prometheus 2 to evaluate a single RAG system, we are primarily interested in the [Direct Assessment prompt template](https://github.com/prometheus-eval/prometheus-eval/blob/3042bfc6a263a98cc31a7aa4e704e3784e0c1b8b/libs/prometheus-eval/prometheus_eval/prompts.py), which allows evaluating the quality of an answer based on specific criteria. The following template includes a reference answer; the provided link also contains a version without it. + +Letā€™s take a look at it. + +```markdown +You are a fair judge assistant tasked with providing clear, objective feedback +based on specific criteria, ensuring each assessment reflects the absolute +standards set for performance. + +###Task Description: +An instruction (might include an Input inside it), a response to evaluate, a +reference answer that gets a score of 5, and a score rubric representing a +evaluation criteria are given. +1. Write a detailed feedback that assess the quality of the response strictly +based on the given score rubric, not evaluating in general. +2. After writing a feedback, write a score that is an integer between 1 and 5. +You should refer to the score rubric. +3. The output format should look as follows: \\"Feedback: (write a feedback for +criteria) [RESULT] (an integer number between 1 and 5)\\" +4. Please do not generate any other opening, closing, and explanations. + +###The instruction to evaluate: +{instruction} + +###Response to evaluate: +{response} + +###Reference Answer (Score 5): +{reference_answer} + +###Score Rubrics: +{score_rubric} + +###Feedback: + +``` + +In this prompt template, the only parts to be customized are those enclosed in curly brackets. + +We should provide: + +- the instruction to use for evaluation, which may include an input (e.g., a user question, if evaluating a RAG pipeline) +- the LLM response to evaluate +- the reference answer: a perfect answer, scoring 5 according to the score rubric +- a score rubric with scores from 1 to 5, accurately describing when the response qualifies for each score. + +When provided with such a prompt, the model will generate two outputs: detailed `feedback` and a score from 1 to 5. + +### An example + +Letā€™s say we want to evaluate the correctness of a generated answer. In this case we will have a ground truth answer, but it is not mandatory. 
+ +Question: ā€œWho won the 2022 World Cup?ā€ +Generated Answer: ā€œArgentina won the 2022 FIFA World Cup. France won the 2018 FIFA World Cup.ā€ +Reference Answer: ā€œArgentina won the 2022 FIFA World Cup.ā€ + +In this case, our prompt might look like the following: + +```python +[...] + +###The instruction to evaluate: +Your task is to evaluate the generated answer against the reference answer for +the question: Who won the 2022 World Cup? + +###Response to evaluate: +Argentina won the 2022 FIFA World Cup. France won the 2018 FIFA World Cup. + +###Reference Answer (Score 5): +Argentina won the 2022 FIFA World Cup. + +###Score Rubrics: +Score 1: The answer is not relevant to the question and does not align with the reference answer. +Score 2: The answer is relevant to the question but deviates significantly from the reference answer. +Score 3: The answer is relevant to the question and generally aligns with the reference answer but has errors or omissions. +Score 4: The answer is relevant to the question and closely matches the reference answer but is less concise or clear. +Score 5: The answer is highly relevant, fully accurate, and matches the reference answer in both content and clarity. + +###Feedback: + +``` + +Calling the model with this prompt should return a result similar to this: + +> Both the generated response and the reference answer identify Argentina as the winner of the 2022 FIFA World Cup. This shows that the generated answer is accurate and relevant to the user's query about who won the 2022 World Cup. However, the inclusion of an unrelated fact about France winning the 2018 World Cup does not enhance the relevance or accuracy of the response, making it somewhat less concise than the reference answer. While the information about France is not incorrect, it does not pertain to the original query about the 2022 World Cup. Thus, despite being generally correct, the additional detail in the generated response deviates from the concise nature of the reference answer, and it could be perceived as slightly less effective. Therefore, according to the score rubric, which places emphasis on relevance, conciseness, and accuracy, the generated response is not as well aligned with the reference answer as it could be. So the score is 4. [RESULT] 4 + +## Experimenting with Prometheus 2 in Haystack + +Haystack is an orchestration framework for building and evaluating applications based on LLMs. It comes with [its own set of evaluators and integrates with different evaluation libraries](https://docs.haystack.deepset.ai/docs/evaluation). It is easy to extend Haystack functionalities, by creating custom components, so we can try to incorporate Prometheus 2. + +This is the plan: + +- build and run an Indexing pipeline +- build and run a RAG pipeline to evaluate +- implement a PrometheusLLMEvaluator component +- create different PrometheusLLMEvaluators +- build and run an evaluation pipeline with different PrometheusLLMEvaluators + +In this blog post, we summarize these steps, but you can find the full implementation in the accompanying notebook. + +### Indexing pipeline + +Before running a RAG pipeline, we need to index some data. + +We will be using [a labeled PubMed dataset](https://huggingface.co/datasets/vblagoje/PubMedQA_instruction/viewer/default/train?row=0) with questions, contexts and answers. This allows us to use the contexts as Documents and provides the necessary labeled data for some of the evaluation metrics we will define. 
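As a rough sketch, loading this dataset and turning its contexts into Haystack Documents might look like the snippet below. The field names (`context`, `instruction`, `response`) are an assumption borrowed from Haystack's RAG evaluation tutorial, so double-check them against the dataset card.

```python
from datasets import load_dataset
from haystack import Document

# Assumed column names: "context", "instruction", "response" -- verify on the dataset card
dataset = load_dataset("vblagoje/PubMedQA_instruction", split="train[:10]")

documents = [Document(content=row["context"]) for row in dataset]
questions = [row["instruction"] for row in dataset]
ground_truth_answers = [row["response"] for row in dataset]
```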
+ +For simplicity, we will use the `InMemoryDocumentStore`. Our indexing pipeline will include a `DocumentEmbedder` (embedding model: `sentence-transformers/all-MiniLM-L6-v2`) and a `DocumentWriter`. + +See the accompanying notebook for the full code on building and running an indexing pipeline. + +### RAG pipeline + +Now that we have our data ready, we can create a simple RAG pipeline. + +Our RAG pipeline will include: + +- `InMemoryEmbeddingRetriever` to retrieve the relevant documents for the query (based on the same embedding model as before) +- `PromptBuilder` to dynamically create prompts +- `HuggingFaceLocalGenerator` with `google/gemma-1.1-2b-it` to generate answers to queries. It is a small model, and later we will evaluate the quality of the generated responses based on custom criteria. +- `AnswerBuilder` + +Let's run our RAG pipeline with a set of questions and save the data we need for evaluation: questions, ground truth answers, and generated answers. + +### Implement a PrometheusLLMEvaluator component + +To perform evaluation, we create [a custom Haystack Evaluator component](https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/prometheus2_evaluation.ipynb#scrollTo=XVWnGSIJekey), based on Prometheus 2. + +This component allows you to develop a diverse range of evaluators. + +You can find the implementation in the accompanying notebook (a simplified sketch also follows the generator setup below). Let's take a high-level overview of this component: + +- init parameters + - `template`: A Jinja2 prompt template that adheres to the Prometheus 2 prompt structure, with placeholders for input data that we want to pass at runtime (e.g., `question`, `generated_answer`, `ground_truth_answer`) + - `inputs`: A list of tuples in the format (`input_name`, `input_type`). These are the inputs that the evaluator expects and uses for evaluation. They should match those defined in the template. + - `generator`: (hacky) Allows passing different types of Haystack generators to use the Prometheus 2 model. Examples: `HuggingFaceLocalGenerator`, `LlamaCPPGenerator`, etc. +- `run` method: for each example to evaluate, the inputs are validated, integrated into the prompt and passed to the model. The model output is parsed to extract score and feedback. This method returns a dictionary containing an aggregate `score`, `individual_scores` and `feedbacks`. + +### Create different Evaluators + +Let's see how we can use the `PrometheusLLMEvaluator`. + +We start by creating a Correctness Evaluator, similar to the above example. + +First, we initialize a generator to load the Prometheus 2 model; in particular, we are using the small variant (7B). + +```python +from haystack.components.generators import HuggingFaceLocalGenerator + +generator = HuggingFaceLocalGenerator( + model="prometheus-eval/prometheus-7b-v2.0", + task="text2text-generation", + ... +) +generator.warm_up() + +``` + +In this example, we are using the [`HuggingFaceLocalGenerator`](https://docs.haystack.deepset.ai/docs/huggingfacelocalgenerator), which can run on the free GPU provided by Colab, but there are several other options, depending on your environment: [`LlamaCPPGenerator`](https://docs.haystack.deepset.ai/docs/llamacppgenerator) for resource-constrained environments (even without a GPU); TGI (via [HuggingFaceAPIGenerator](https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)) and [vLLM](https://haystack.deepset.ai/integrations/vllm) for production environments with available GPU resources.
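Here is the simplified sketch of the `PrometheusLLMEvaluator` mentioned above. It is not the notebook's exact implementation: input validation is omitted and the score parsing is reduced to a single regular expression on the `[RESULT]` marker, but it shows the overall shape of the component. Any generator whose `run(prompt)` returns a `replies` list should work.

```python
import re
from typing import Any, Dict, List, Tuple

from haystack import component
from jinja2 import Template


@component
class PrometheusLLMEvaluator:
    """Render the prompt per example, call the Prometheus 2 generator, parse '[RESULT] <n>'."""

    def __init__(self, template: str, generator, inputs: List[Tuple[str, Any]]):
        self._template = Template(template)
        self._generator = generator
        self._inputs = inputs
        # Register the expected inputs (e.g. query: List[str]) and the outputs dynamically
        component.set_input_types(self, **dict(inputs))
        component.set_output_types(self, score=float, individual_scores=List[float], feedbacks=List[str])

    def run(self, **inputs) -> Dict[str, Any]:
        feedbacks: List[str] = []
        individual_scores: List[float] = []
        # All inputs are parallel lists; evaluate one example at a time
        for values in zip(*(inputs[name] for name, _ in self._inputs)):
            prompt = self._template.render({name: value for (name, _), value in zip(self._inputs, values)})
            reply = self._generator.run(prompt)["replies"][0]
            match = re.search(r"\[RESULT\]\s*(\d)", reply)
            feedbacks.append(reply)
            individual_scores.append(float(match.group(1)) if match else float("nan"))
        score = sum(individual_scores) / len(individual_scores) if individual_scores else float("nan")
        return {"score": score, "individual_scores": individual_scores, "feedbacks": feedbacks}
```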
+ +Next, letā€™s prepare the prompt template for the Correctness evaluator. Note that we are inserting placeholders for `query`, `generated_answer` and `reference_answer`. These fields will be populated dynamically based on RAG results and ground truth answers. + +```python +correctness_prompt_template = """ +... +###The instruction to evaluate: +Your task is to evaluate the generated answer against the reference answer for the question: {{query}} + +###Response to evaluate: +generated answer: {{generated_answer}} + +###Reference Answer (Score 5): {{reference_answer}} + +###Score Rubrics: +Score 1: The answer is not relevant to the question and does not align with the reference answer. +Score 2: The answer is relevant to the question but deviates significantly from the reference answer. +Score 3: The answer is relevant to the question and generally aligns with the reference answer but has errors or omissions. +Score 4: The answer is relevant to the question and closely matches the reference answer but is less concise or clear. +Score 5: The answer is highly relevant, fully accurate, and matches the reference answer in both content and clarity. + +###Feedback:""" + +``` + +Finally, letā€™s initialize our evaluator, specifying which inputs it should expect at runtime (they should match the placeholders of the above prompt template). + +```python +correctness_evaluator = PrometheusLLMEvaluator( + template=correctness_prompt_template, + generator=generator, + inputs=[ + ("query", List[str]), + ("generated_answer", List[str]), + ("reference_answer", List[str]), + ], +) + +``` + +Similarly, we can create other evaluators: + +- Response Relevance: Evaluates the generated answer in terms of its relevance to the user's question. +- Logical Robustness: Evaluates the logical organization and progression of the response. + +These evaluators do not require ground truth labels. For details about prompt templates and required inputs, refer to the accompanying notebook. + +### Evaluation pipeline + +We can now put our evaluators in a pipeline, and run it to see how our small model performs. + +```python +from haystack import Pipeline + +eval_pipeline = Pipeline() +eval_pipeline.add_component("correctness_evaluator", correctness_evaluator) +eval_pipeline.add_component("response_relevance_evaluator", response_relevance_evaluator) +eval_pipeline.add_component("logical_robustness_evaluator", logical_robustness_evaluator) + +eval_results = eval_pipeline.run( + { + "correctness_evaluator": { + "query": questions, + "generated_answer": rag_answers, + "reference_answer": ground_truth_answers, + }, + "response_relevance_evaluator": { + "query": questions, + "generated_answer": rag_answers, + }, + "logical_robustness_evaluator": { + "query": questions, + "generated_answer": rag_answers, + }, + } +) + +``` + +Once we've run our evaluation pipeline, we can also create a full evaluation report. Haystack provides an `EvaluationRunResult` which we can use to display a `score_report`. 
+ +```python +from haystack.evaluation.eval_run_result import EvaluationRunResult + +inputs = { + "question": questions, + "answer": ground_truth_answers, + "predicted_answer": rag_answers, +} + +evaluation_result = EvaluationRunResult(run_name="pubmed_rag_pipeline", inputs=inputs, results=eval_results) +evaluation_result.score_report() + +``` + +In our experiment (involving a small sample of 10 examples), we get the following results: + +| Evaluation | Score | +| -------- | ------- | +| correctness_evaluator | 3.9 | +| response_relevance_evaluator | 4.3 | +| logical_robustness_evaluator |3.5 | + + +Gemma-1.1-2b-it seems to generate relevant answers, but the responses differ from ground truth answers and the logical organization is not optimal. + +To inspect these results in more detail, we can convert the `evaluation_result` to a Pandas dataframe and also look at the individual feedbacks of each evaluator for each example. + +## Wrapping it up + +In this post, you have learned about Prometheus 2: a new family of SOTA open-source models for evaluation. + +After introducing the models and their specific usage, we have put them in action in Haystack and created different Evaluators to assess the quality of the responses produced by a RAG pipeline, along several axes. + +The results of our experiments are interesting and promising. However, before using these models for real-world applications, you should assess them for your specific use case. Moreover, in this rapidly changing world, perhaps the day is not far off when general-purpose open models can be used effectively for evaluation. \ No newline at end of file diff --git a/content/blog/rag-evaluation-with-prometheus-2/prometheus.png b/content/blog/rag-evaluation-with-prometheus-2/prometheus.png new file mode 100644 index 00000000..9a84ca16 Binary files /dev/null and b/content/blog/rag-evaluation-with-prometheus-2/prometheus.png differ diff --git a/content/blog/rag-evaluation-with-prometheus-2/thumbnail.png b/content/blog/rag-evaluation-with-prometheus-2/thumbnail.png new file mode 100644 index 00000000..9132eff7 Binary files /dev/null and b/content/blog/rag-evaluation-with-prometheus-2/thumbnail.png differ diff --git a/content/blog/using-jina-embeddings-haystack/index.md b/content/blog/using-jina-embeddings-haystack/index.md index ade5a9a5..3a4fc662 100644 --- a/content/blog/using-jina-embeddings-haystack/index.md +++ b/content/blog/using-jina-embeddings-haystack/index.md @@ -18,7 +18,7 @@ cookbook: jina-embeddings-v2-legal-analysis-rag.ipynb With the [Jina Haystack extension](https://haystack.deepset.ai/integrations/jina), you can now take advantage of these new text embedders in your Haystack pipelines! In this post, we'll show what's cool about Jina Embeddings v2 and how to use them. -> You can follow along in the accompanying [Colab notebook of a RAG pipeline that uses the Jina Haystack extension](https://colab.research.google.com/drive/1l8GbQhqxnWXkdktgJfs9Rz4EAtNbHK_L#scrollTo=_coq_qCuItbN). +> You can follow along in the accompanying [Colab notebook of a RAG pipeline that uses the Jina Haystack extension](https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/notebooks/jina-embeddings-v2-legal-analysis-rag.ipynb). ## Advantages of Jina Embeddings v2 @@ -126,13 +126,15 @@ indexing_pipeline.run(data={"fetcher": {"urls": urls}}) ## Building the query pipeline -Now the real fun begins. Let's create a query pipeline so we can actually start asking questions. 
We write a prompt allowing us to pass our documents to the Mixtral-8x7B LLM. Then we initiatialize the LLM via the `HuggingFaceTGIGenerator`. +Now the real fun begins. Let's create a query pipeline so we can actually start asking questions. We write a prompt allowing us to pass our documents to the Mixtral-8x7B LLM. Then we initialize the LLM via the `HuggingFaceAPIGenerator`. + +To use this model, you need to accept the conditions here: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 In Haystack 2.0 `retriever`s are tightly coupled to `DocumentStores`. If we pass the document store in the `retriever` we initialized earlier, this pipeline can access those embeddings we generated, and pass them to the LLM. ```python -from haystack.components.generators import HuggingFaceTGIGenerator +from haystack.components.generators import HuggingFaceAPIGenerator from haystack.components.builders.prompt_builder import PromptBuilder from jina_haystack.text_embedder import JinaTextEmbedder @@ -148,8 +150,9 @@ question: {{question}} """ text_embedder = JinaTextEmbedder(model="jina-embeddings-v2-base-en") -generator = HuggingFaceTGIGenerator("mistralai/Mixtral-8x7B-Instruct-v0.1") -generator.warm_up() +generator = HuggingFaceAPIGenerator( + api_type="serverless_inference_api", + api_params={"model": "mistralai/Mixtral-8x7B-Instruct-v0.1"}) prompt_builder = PromptBuilder(template=prompt) query_pipeline = Pipeline() @@ -194,7 +197,7 @@ Note: if you want to change the prompt template, you'll also need to re-run the ## Wrapping it up -Thanks for reading! If you want to stay on top of the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates?utm_campaign=developer-relations&utm_source=using-jina-embeddings-haystack) or [join our Discord community](https://discord.com/invite/haystack). +Thanks for reading! If you want to stay on top of the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) or [join our Discord community](https://discord.com/invite/haystack). To learn more about the technologies used here, check out these blog posts: - [Embeddings in Depth](https://jina.ai/news/embeddings-in-depth/) diff --git a/content/overview/intro.md b/content/overview/intro.md index ac696a1e..def637ce 100644 --- a/content/overview/intro.md +++ b/content/overview/intro.md @@ -3,85 +3,97 @@ layout: overview header: dark footer: dark title: What is Haystack? -description: Haystack is an open source Python framework for building production-ready LLM applications, offering tooling for every stage of the NLP project life cycle. +description: Haystack is an open source framework for building production-ready LLM applications, retrieval-augmented generative pipelines and state-of-the-art search systems that work intelligently over large document collections. weight: 1 toc: true --- -Haystack is the open source Python framework by deepset for building custom apps with large language models (LLMs). It lets you quickly try out the latest models in natural language processing (NLP) while being flexible and easy to use. Our inspiring community of users and builders has helped shape Haystack into what it is today: a complete framework for building production-ready NLP apps.
+Haystack is anĀ open source frameworkĀ for building production-readyĀ *LLM applications*,Ā *retrieval-augmented generative pipelines*Ā andĀ *state-of-the-art search systems*Ā that work intelligently over large document collections. It lets you quickly try out the latest AI models while being flexible and easy to use. Our inspiring community of users and builders has helped shape Haystack into the modular, intuitive, complete framework it is today. ## Building with Haystack -Haystack offers comprehensive tooling for developing state-of-the-art NLP systems that use LLMs (such as GPT-4, Falcon and similar) and Transformer models . With Haystack, you can effortlessly experiment with various models hosted on platforms like Hugging Face, OpenAI, Cohere, or even models deployed on SageMaker and your local models to find the perfect fit for your use case. +Haystack offers comprehensive tooling for developing state-of-the-art AI systems that use LLMs. + +- Use models hosted on platforms like [Hugging Face](https://haystack.deepset.ai/integrations/huggingface), [OpenAI](https://haystack.deepset.ai/integrations/openai), [Cohere](https://haystack.deepset.ai/integrations/cohere), [Mistral](https://haystack.deepset.ai/integrations/mistral), [and more](https://haystack.deepset.ai/integrations?type=Model+Provider). +- Use models deployed on [SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/whatis.html), [Bedrock](https://haystack.deepset.ai/integrations/amazon-bedrock), [Azure](https://haystack.deepset.ai/integrations/azure)ā€¦ +- Take advantage of our document stores: [OpenSearch](https://haystack.deepset.ai/integrations/opensearch-document-store/), [Pinecone](https://haystack.deepset.ai/integrations/pinecone-document-store), [Weaviate](https://haystack.deepset.ai/integrations/weaviate-document-store), [QDrant](https://haystack.deepset.ai/integrations/qdrant-document-store) [and more](https://haystack.deepset.ai/integrations?type=Document+Store). +- Our growing [ecosystem of community integrations](https://haystack.deepset.ai/integrations) provide tooling for [evaluation](https://haystack.deepset.ai/integrations?type=Evaluation+Framework), [monitoring](https://haystack.deepset.ai/integrations?type=Monitoring+Tool), [data ingestion](https://haystack.deepset.ai/integrations?type=Data+Ingestion) and every layer of your LLM application. {{< img src="/images/model_providers.png" alt="Model Providers" styling="centered" width="800">}} Some examples of what you can build include: -- **Semantic search** on a large collection of documents in any language -- **Generative question answering** on a knowledge base containing mixed types of information: images, text, and tables. -- **Natural language chatbots** powered by cutting-edge generative models like GPT-4 -- An LLM-based Haystack **Agent** capable of resolving complex queries -- **Information extraction** from documents to populate your database or build a knowledge graph +- **Advanced RAG** on your own data source, powered by the latest retrieval and generation techniques +- **Chatbots and agents**Ā powered by cutting-edge generative models like GPT-4, that can even call external functions and services +- **Generative multi-modal question answering**Ā on a knowledge base containing mixed types of information: images, text, audio, and tables +- **Information extraction**Ā from documents to populate your database or build a knowledge graph This is just a small subset of the kinds of systems that can be created in Haystack. 
-## Functionality for all stages of an NLP project +## End to end functionality for your LLM project -A successful NLP project requires more than just the language models. As an end-to-end framework, Haystack assists you in building your system every step of the way, offering tooling for each stage of the NLP project life cycle: +A successful LLM project requires more than just the language models. As an end-to-end framework, Haystack assists you in building your system every step of the way: -- Effortless deployment of models from Hugging Face or other providers into your NLP pipeline -- Create dynamic templates for LLM prompting -- [Cleaning and preprocessing functions](https://docs.haystack.deepset.ai/docs/data_handling) for various formats and sources -- [Seamless integrations with your preferred document store](https://docs.haystack.deepset.ai/docs/document_store) (including many popular vector databases like Faiss, Pinecone, Qdrant, or Weaviate): keep your NLP-driven apps up-to-date with Haystackā€™s indexing pipelines that help you prepare and maintain your data -- The [free annotation tool](https://docs.haystack.deepset.ai/docs/annotation) for a faster and more structured annotation process -- Tooling for [fine-tuning a pre-trained language model](https://docs.haystack.deepset.ai/docs/domain_adaptation) -- Specialized [evaluation pipelines](https://docs.haystack.deepset.ai/docs/evaluation) that use different metrics to evaluate the entire system or its individual components -- [Haystackā€™s REST API](https://docs.haystack.deepset.ai/docs/rest_api) to deploy your final system so that you can query it with a user-facing interface +- Seamless inclusion of models from Hugging Face or other providers into your pipeline +- Integrate data sources for retrieval augmentation, [from anywhere on the web](https://docs.haystack.deepset.ai/v2.0/docs/linkcontentfetcher) +- Advanced dynamic templates for LLM prompting via the Jinja2 templating language +- [Cleaning and preprocessing functions](https://docs.haystack.deepset.ai/v2.0/docs/documentcleaner)Ā for various data formats and sources +- [Integrates with your preferred document store](https://docs.haystack.deepset.ai/docs/document_store): keep your GenAI apps up-to-date with Haystackā€™s indexing pipelines that help you prepare and maintain your data +- [SpecializedĀ evaluation tools](https://docs.haystack.deepset.ai/v2.0/docs/model-based-evaluation) that use different metrics to evaluate the entire system or its individual components +- [Hayhooks module](https://docs.haystack.deepset.ai/v2.0/docs/hayhooks)Ā to serve Haystack Pipelines through HTTP endpoints +- A customizableĀ [logging system](https://docs.haystack.deepset.ai/v2.0/docs/logging)Ā that supports structured logging and tracing correlation out of the box. +- [Code instrumentation collecting spans and traces](https://docs.haystack.deepset.ai/v2.0/docs/tracing)Ā in strategic points of the execution path, with support for Open Telemetry and Datadog already in place -But thatā€™s not all: [metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering), [model distillation](https://docs.haystack.deepset.ai/docs/model_distillation), or the prompt hub, whatever your NLP heart desires, youā€™re likely to find it in Haystack. And if not? Weā€™ll build it together. 
+But that's not all: [metadata filtering](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering), [device management for locally running models](https://docs.haystack.deepset.ai/v2.0/docs/device-management), even advanced RAG techniques like [Hypothetical Document Embedding (HyDE)](https://docs.haystack.deepset.ai/v2.0/docs/hypothetical-document-embeddings-hyde). Whatever your AI heart desires, you're likely to find it in Haystack. And if not? We'll build it together. {{< img src="/images/rest.png" alt="Rest API" styling="centered" width="800" quality="100">}} ## Building blocks -Haystack uses a few simple but effective concepts to help you build fully functional and customized end-to-end NLP systems. +Haystack uses two primary concepts to help you build fully functional and customized end-to-end GenAI systems. ### Components -At the core of Haystack are its components—fundamental building blocks that can perform tasks like document retrieval, text generation, or summarization. A single component is already quite powerful. It can manage local language models or communicate with a hosted model through an API. +At the core of Haystack are its [components](https://docs.haystack.deepset.ai/docs/components_overview)—fundamental building blocks that can perform tasks like [document retrieval](https://docs.haystack.deepset.ai/docs/retrievers), [text generation](https://docs.haystack.deepset.ai/docs/generators), or [creating embeddings](https://docs.haystack.deepset.ai/docs/embedders). A single component is already quite powerful. It can manage local language models or communicate with a hosted model through an API. -While Haystack offers a bunch of components you can use out of the box, it also lets you create your own custom components. Explore the [collection of integrations](https://haystack.deepset.ai/integrations) that includes custom components developed by our community, which you can freely use. +While Haystack offers a bunch of components you can use out of the box, it also lets you create your own [custom components](https://docs.haystack.deepset.ai/docs/custom-components) — as easy as writing a Python class. Explore the [collection of integrations](https://haystack.deepset.ai/integrations) that includes custom components developed by our partners and community, which you can freely use. -You can chain components together to build pipelines, which are the foundation of the NLP app architecture in Haystack. +You can connect components together to build *pipelines*, which are the foundation of LLM application architecture in Haystack. ### Pipelines -Pipelines are powerful structures made up of components, such as a Retriever and Reader, connected to infrastructure building blocks, such as a DocumentStore (for example, Elasticsearch or Weaviate) to form complex systems. - -Haystack offers ready-made pipelines for most common tasks, such as question answering, document retrieval, or summarization. But it's just as easy to design and create a custom pipeline for NLP scenarios that are way more complex than question answering. +[Pipelines](https://docs.haystack.deepset.ai/docs/pipelines) are powerful abstractions that allow you to define the flow of data through your LLM application. They consist of *components*. -### Agents + As a developer, you have complete control over how you arrange the components in a pipeline. Pipelines can branch out, join, and also cycle back to another component.
You can compose Haystack pipelines that can retry, loop back, and potentially even runĀ continuouslyĀ as a service. -The Haystack Agent makes use of a large language model to resolve complex tasks. When initializing the Agent, you give it a set of tools, which can be pipeline components or whole pipelines. The Agent can use to those tools iteratively to arrive at an answer. When given a query, the Agent determines which tools are useful to answer this query and calls them in a loop until it gets the answer. This way, it can achieve much more than extractive or generative question answering pipelines. +Pipelines are essentially graphs, or even multigraphs. A single component with multiple outputs can connect to another single component with multiple inputs or to multiple components, thanks to the flexibility of pipelines. -{{< img src="/images/agent.png" alt="Agent Tools" styling="centered" width="800">}} +To get you started, Haystack offers many [example pipelines](https://github.com/deepset-ai/haystack-cookbook) for different use cases: indexing, agentic chat, RAG, extractive QA, function calling, web search and more. ## Whoā€™s it for? -Haystack is for everyone looking to build natural language appsā€”NLP enthusiasts and newbies alike. You donā€™t need to understand how the models work under the hood. With Haystackā€™s modular and flexible components, pipelines, and agents, all you need is some basic knowledge of Python to dive right in. +Haystack is for everyone looking to build AI apps ā€” LLM enthusiasts and newbies alike. You donā€™t need to understand how the models work under the hood. All you need is some basic knowledge of Python to dive right in. ## Our community -At the heart of Haystack is the vibrant open source community that thrives on the diverse backgrounds and skill sets of its members. We value collaboration greatly and encourage our users to shape Haystack actively through GitHub contributions. Our Discord channel is a space where community members can connect, seek help, and learn from each other. +At the heart of Haystack is the vibrant open source community that thrives on the diverse backgrounds and skill sets of its members. We value collaboration greatly and encourage our users to shape Haystack actively through [GitHub](https://github.com/deepset-ai/haystack) contributions. Our [Discord server](https://discord.com/invite/VBpFzsgRVF) is a space where community members can connect, seek help, and learn from each other. + +We also organize [live online and in-person events](https://lu.ma/haystack), webinars, and office hours, which are an opportunity to learn and grow. + +šŸ’¬Ā [Join Discord](https://discord.com/invite/VBpFzsgRVF) + +šŸ’ŒĀ Sign up for our [monthly email newsletter](https://landing.deepset.ai/haystack-community-updates) + +šŸŽ„ Subscribe to [the Haystack YouTube channel](https://www.youtube.com/@haystack_ai) -We also organize live online and in-person events, webinars, and office hours, which are an opportunity to learn and grow. 
+šŸ˜ Follow us on [Twitter](https://x.com/Haystack_AI[) or [Mastodon](https://fosstodon.org/@haystack_ai) -{{< button url="https://discord.com/invite/VBpFzsgRVF" text="Join Discord" color="green">}} +šŸ“† [Subscribe to our lu.ma calendar](https://lu.ma/haystack) to stay informed about events ## Enter the Haystack universe -- Visit our [GitHub repo](https://github.com/deepset-ai/haystack) -- Start building with [tutorials](https://haystack.deepset.ai/tutorials) in Colab notebooks -- Have a look at the [documentation](https://docs.haystack.deepset.ai/) -- Read and contribute to our [blog](https://haystack.deepset.ai/blog) +- Start building withĀ [cookbooks](https://github.com/deepset-ai/haystack-cookbook)Ā in Colab notebooks +- Learn interactively viaĀ [tutorials](https://haystack.deepset.ai/tutorials) +- Have a look at theĀ [documentation](https://docs.haystack.deepset.ai/) +- Read and contribute to ourĀ [blog](https://haystack.deepset.ai/blog) +- Visit ourĀ [GitHub repo](https://github.com/deepset-ai/haystack) \ No newline at end of file diff --git a/content/overview/quick-start.md b/content/overview/quick-start.md index 3451402e..df1ebce5 100644 --- a/content/overview/quick-start.md +++ b/content/overview/quick-start.md @@ -21,7 +21,7 @@ pip install haystack-ai For more details, refer to our documentation. -{{< button url="https://docs.haystack.deepset.ai/docs/installation?utm_campaign=developer-relations&utm_source=haystack&utm_medium=website" text="Docs: Installation" color="green">}} +{{< button url="https://docs.haystack.deepset.ai/docs/installation" text="Docs: Installation" color="green">}} ## Ask Questions to a Webpage @@ -234,4 +234,4 @@ print(result["llm"]["replies"][0]) For a hands-on guide on how to build your first RAG Pipeline with Haystack 2.0, see our tutorial. 
-{{< button url="https://haystack.deepset.ai/tutorials/27_first_rag_pipeline?utm_campaign=developer-relations&utm_source=haystack&utm_medium=website" text="Tutorial: Creating a RAG Pipeline" color="green">}} +{{< button url="https://haystack.deepset.ai/tutorials/27_first_rag_pipeline" text="Tutorial: Creating a RAG Pipeline" color="green">}} diff --git a/content/release-notes/2.0.0.md b/content/release-notes/2.0.0.md index 8caaa71d..4b7a7d93 100644 --- a/content/release-notes/2.0.0.md +++ b/content/release-notes/2.0.0.md @@ -278,7 +278,7 @@ Alongside Haystack 2.0, today we are also releasing a whole set of new tutorials Stay up-to-date with Haystack: - [Discord](https://discord.com/invite/VBpFzsgRVF) -- [Subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates?utm-campaign=developer-relations&utm-source=blog&utm-medium=release) +- [Subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) - [Twitter](https://twitter.com/Haystack_AI) - [GitHub](https://github.com/deepset-ai/haystack) diff --git a/content/release-notes/2.1.0.md b/content/release-notes/2.1.0.md new file mode 100644 index 00000000..83af09c8 --- /dev/null +++ b/content/release-notes/2.1.0.md @@ -0,0 +1,171 @@ +--- +title: Haystack 2.1.0 +description: Release notes for Haystack 2.1.0 +toc: True +date: 2024-05-07 +last_updated: 2024-05-07 +tags: ["Release Notes"] +link: https://github.com/deepset-ai/haystack/releases/tag/v2.1.0 +--- + +## Highlights + +### šŸ“Š New Evaluator Components + +Haystack introduces new components for both model-based and statistical evaluation: [`AnswerExactMatchEvaluator`](https://docs.haystack.deepset.ai/docs/answerexactmatchevaluator), [`ContextRelevanceEvaluator`](https://docs.haystack.deepset.ai/docs/contextrelevanceevaluator), [`DocumentMAPEvaluator`](https://docs.haystack.deepset.ai/docs/documentmapevaluator), [`DocumentMRREvaluator`](https://docs.haystack.deepset.ai/docs/documentmrrevaluator), [`DocumentRecallEvaluator`](https://docs.haystack.deepset.ai/docs/documentrecallevaluator), [`FaithfulnessEvaluator`](https://docs.haystack.deepset.ai/docs/faithfulnessevaluator), [`LLMEvaluator`](https://docs.haystack.deepset.ai/docs/llmevaluator), [`SASEvaluator`](https://docs.haystack.deepset.ai/docs/sasevaluator) + +Here's an example of how to use `DocumentMAPEvaluator` to evaluate retrieved documents and calculate mean average precision score: + +```python +from haystack import Document +from haystack.components.evaluators import DocumentMAPEvaluator + +evaluator = DocumentMAPEvaluator() +result = evaluator.run( + ground_truth_documents=[ + [Document(content="France")], + [Document(content="9th century"), Document(content="9th")], + ], + retrieved_documents=[ + [Document(content="France")], + [Document(content="9th century"), Document(content="10th century"), Document(content="9th")], + ], +) + +result["individual_scores"] +>> [1.0, 0.8333333333333333] +result["score"] +>> 0.9166666666666666 +``` + +To learn more about evaluating RAG pipelines with both model-based and statistical metrics available in Haystack, check out [Tutorial: Evaluating RAG Pipelines](https://haystack.deepset.ai/tutorials/35_evaluating_rag_pipelines). + +### šŸ•øļø Support For Sparse Embeddings + +Haystack offers robust support for Sparse Embedding Retrieval techniques, including SPLADE.
Here's how to create a simple retrieval Pipeline with sparse embeddings: + +```python +from haystack import Pipeline +from haystack_integrations.components.retrievers.qdrant import QdrantSparseEmbeddingRetriever +from haystack_integrations.components.embedders.fastembed import FastembedSparseTextEmbedder + +sparse_text_embedder = FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1") +sparse_retriever = QdrantSparseEmbeddingRetriever(document_store=document_store) + +query_pipeline = Pipeline() +query_pipeline.add_component("sparse_text_embedder", sparse_text_embedder) +query_pipeline.add_component("sparse_retriever", sparse_retriever) + +query_pipeline.connect("sparse_text_embedder.sparse_embedding", "sparse_retriever.query_sparse_embedding") +``` +Learn more about this topic in our documentation on [Sparse Embedding-based Retrievers](https://docs.haystack.deepset.ai/docs/retrievers#sparse-embedding-based-retrievers) +Start building with our new cookbook: [šŸ§‘ā€šŸ³ Sparse Embedding Retrieval using Qdrant and FastEmbed](https://github.com/deepset-ai/haystack-cookbook/blob/main/notebooks/sparse_embedding_retrieval.ipynb). + +### šŸ§ Inspect Component Outputs + +As of 2.1.0, you can now inspect each component output after running a pipeline. Provide component names with `include_outputs_from` key to `pipeline.run`: +```python +pipe.run(data, include_outputs_from={"prompt_builder", "llm", "retriever"}) +``` +And the pipeline output should look like this: +```text +{'llm': {'replies': ['The Rhodes Statue was described as being built with iron tie bars to which brass plates were fixed to form the skin. It stood on a 15-meter-high white marble pedestal near the Rhodes harbor entrance. The statue itself was about 70 cubits, or 32 meters, tall.'], + 'meta': [{'model': 'gpt-3.5-turbo-0125', + ... + 'usage': {'completion_tokens': 57, + 'prompt_tokens': 446, + 'total_tokens': 503}}]}, + 'retriever': {'documents': [Document(id=a3ee3a9a55b47ff651ae11dc56d84d2b6f8d931b795bd866c14eacfa56000965, content: 'Within it, too, are to be seen large masses of rock, by the weight of which the artist steadied it w...', meta: {'url': 'https://en.wikipedia.org/wiki/Colossus_of_Rhodes', '_split_id': 9}, score: 0.648961685430463),...]}, + 'prompt_builder': {'prompt': "\nGiven the following information, answer the question.\n\nContext:\n\n Within it, too, are to be seen large masses of rock, by the weight of which the artist steadied it while... + ... levels during construction.\n\n\n\nQuestion: What does Rhodes Statue look like?\nAnswer:"}} +``` + + +## šŸš€ New Features + +- Add several new Evaluation components, i.e: + - `AnswerExactMatchEvaluator` + - `ContextRelevanceEvaluator` + - `DocumentMAPEvaluator` + - `DocumentMRREvaluator` + - `DocumentRecallEvaluator` + - `FaithfulnessEvaluator` + - `LLMEvaluator` + - `SASEvaluator` + +- Introduce a new `SparseEmbedding` class that can store a sparse vector representation of a document. It will be instrumental in supporting sparse embedding retrieval with the subsequent introduction of sparse embedders and sparse embedding retrievers. + +- Added a `SentenceTransformersDiversityRanker`. The diversity ranker orders documents to maximize their overall diversity. The ranker leverages sentence-transformer models to calculate semantic embeddings for each document and the query. + +- Introduced new HuggingFace API components, namely: + - `HuggingFaceAPIChatGenerator`, which will replace the `HuggingFaceTGIChatGenerator` in the future. 
+ - `HuggingFaceAPIDocumentEmbedder`, which will replace the `HuggingFaceTEIDocumentEmbedder` in the future. + - `HuggingFaceAPIGenerator`, which will replace the `HuggingFaceTGIGenerator` in the future. + - `HuggingFaceAPITextEmbedder`, which will replace the `HuggingFaceTEITextEmbedder` in the future. + - These components support different Hugging Face APIs: + - free Serverless Inference API + - paid Inference Endpoints + - self-hosted Text Generation Inference + +## āš”ļø Enhancement Notes + +- Compatibility with `huggingface_hub>=0.22.0` for `HuggingFaceTGIGenerator` and `HuggingFaceTGIChatGenerator` components. + +- Adds truncate and normalize parameters to `HuggingFaceTEITextEmbedder` and `HuggingFaceTEITextEmbedder` to allow truncation and normalization of embeddings. + +- Adds `trust_remote_code` parameter to `SentenceTransformersDocumentEmbedder` and `SentenceTransformersTextEmbedder` for allowing custom models and scripts. + +- Adds `streaming_callback` parameter to `HuggingFaceLocalGenerator`, allowing users to handle streaming responses. + +- Adds a `ZeroShotTextRouter` that uses an NLI model from HuggingFace to classify texts based on a set of provided labels and routes them based on the label they were classified with. + +- Adds dimensions parameter to Azure OpenAI Embedders (`AzureOpenAITextEmbedder` and `AzureOpenAIDocumentEmbedder`) to fully support new embedding models like `text-embedding-3-small`, `text-embedding-3-large` and upcoming ones + +- Now the `DocumentSplitter` adds the `page_number` field to the metadata of all output documents to keep track of the page of the original document it belongs to. + +- Allows users to customise text extraction from PDF files. This is particularly useful for PDFs with unusual layouts, such as multiple text columns. For instance, users can configure the object to retain the reading order. + +- Enhanced `PromptBuilder` to specify and enforce required variables in prompt templates. + +- Set `max_new_tokens` default to 512 in HuggingFace generators. + +- Enhanced the `AzureOCRDocumentConverter` to include advanced handling of tables and text. Features such as extracting preceding and following context for tables, merging multiple column headers, and enabling single-column page layout for text have been introduced. This update furthers the flexibility and accuracy of document conversion within complex layouts. + +- Enhanced `DynamicChatPromptBuilder`'s capabilities by allowing all user and system messages to be templated with provided variables. This update ensures a more versatile and dynamic templating process, making chat prompt generation more efficient and customised to user needs. + +- Improved HTML content extraction by attempting to use multiple extractors in order of priority until successful. An additional `try_others` parameter in `HTMLToDocument`, `True` by default, determines whether subsequent extractors are used after a failure. This enhancement decreases extraction failures, ensuring more dependable content retrieval. + +- Enhanced `FileTypeRouter` with regex pattern support for MIME types. This powerful addition allows for more granular control and flexibility in routing files based on their MIME types, enabling the handling of broad categories or specific MIME type patterns with ease. This feature particularly benefits applications requiring sophisticated file classification and routing logic. + +- In Jupyter notebooks, the image of the `Pipeline` will no longer be displayed automatically. 
Instead, the textual representation of the Pipeline will be displayed. To display the `Pipeline` image, use the show method of the `Pipeline` object. + +- Add support for callbacks during pipeline deserialization. Currently supports a pre-init hook for components that can be used to inspect and modify the initialization parameters before the invocation of the component's `__init__` method. + +- `pipeline.run()` accepts a set of component names whose intermediate outputs are returned in the final pipeline output dictionary. + +- Refactor `PyPDFToDocument` to simplify support for custom PDF converters. PDF converters are classes that implement the `PyPDFConverter` protocol and have 3 methods: `convert`, `to_dict` and `from_dict`. + +## āš ļø Deprecation Notes + +- Deprecate `HuggingFaceTGIChatGenerator`, will be removed in Haystack 2.3.0. Use `HuggingFaceAPIChatGenerator` instead. +- Deprecate `HuggingFaceTEIDocumentEmbedder`, will be removed in Haystack 2.3.0. Use `HuggingFaceAPIDocumentEmbedder` instead. +- Deprecate `HuggingFaceTGIGenerator`, will be removed in Haystack 2.3.0. Use `HuggingFaceAPIGenerator` instead. +- Deprecate `HuggingFaceTEITextEmbedder`, will be removed in Haystack 2.3.0. Use `HuggingFaceAPITextEmbedder` instead. +- Using the `converter_name` parameter in the `PyPDFToDocument` component is deprecated. it will be removed in the 2.3.0 release. Use the `converter` parameter instead. + + +## šŸ› Bug Fixes + +- Forward declaration of `AnalyzeResult` type in `AzureOCRDocumentConverter`. `AnalyzeResult` is already imported in a lazy import block. The forward declaration avoids issues when `azure-ai-formrecognizer>=3.2.0b2` is not installed. + +- Fixed a bug in the `MetaFieldRanker`: when the weight parameter was set to 0 in the run method, the component incorrectly used the default parameter set in the` __init__` method. + +- Fixes `Pipeline.run()` logic so components with all their inputs with a default are run in the correct order. + +- Fix a bug when running a `Pipeline` that would cause it to get stuck in an infinite loop + +- Fixes on the `HuggingFaceTEITextEmbedder` returning an embedding of incorrect shape when used with a Text-Embedding-Inference endpoint deployed using Docker. + +- Add the `@component` decorator to `HuggingFaceTGIChatGenerator`. The lack of this decorator made it impossible to use the `HuggingFaceTGIChatGenerator` in a pipeline. + +- Updated the `SearchApiWebSearch` component with new search format and allowed users to specify the search engine via the engine parameter in `search_params`. The default search engine is Google, making it easier for users to tailor their web searches. diff --git a/content/release-notes/2.1.1.md b/content/release-notes/2.1.1.md new file mode 100644 index 00000000..6fe9ee0f --- /dev/null +++ b/content/release-notes/2.1.1.md @@ -0,0 +1,18 @@ +--- +title: Haystack 2.1.1 +description: Release notes for Haystack 2.1.1 +toc: True +date: 2024-05-09 +last_updated: 2024-05-09 +tags: ["Release Notes"] +link: https://github.com/deepset-ai/haystack/releases/tag/v2.1.1 +--- + +### āš”ļø Enhancement Notes + +- Make `SparseEmbedding` a dataclass, this makes it easier to use the class with Pydantic + +### šŸ› Bug Fixes + +- Fix the broken serialization of `HuggingFaceAPITextEmbedder`, `HuggingFaceAPIDocumentEmbedder`, `HuggingFaceAPIGenerator`, and `HuggingFaceAPIChatGenerator`. +- Add `to_dict` method to `DocumentRecallEvaluator` to allow proper serialization of the component. 
diff --git a/content/release-notes/2.1.2.md b/content/release-notes/2.1.2.md new file mode 100644 index 00000000..47f9820b --- /dev/null +++ b/content/release-notes/2.1.2.md @@ -0,0 +1,23 @@ +--- +title: Haystack 2.1.2 +description: Release notes for Haystack 2.1.2 +toc: True +date: 2024-05-16 +last_updated: 2024-05-16 +tags: ["Release Notes"] +link: https://github.com/deepset-ai/haystack/releases/tag/v2.1.2 +--- + +### āš”ļø Enhancement Notes + +- Enforce JSON mode on OpenAI LLM-based evaluators so that they always return valid JSON output. This is to ensure that the output is always in a consistent format, regardless of the input. + +### šŸ› Bug Fixes + +- `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` now return `0` instead of `NaN` when applied to an empty context or empty statements. +- Azure generators components fixed, they were missing the `@component` decorator. +- Updates the `from_dict` method of `SentenceTransformersTextEmbedder`, `SentenceTransformersDocumentEmbedder`, `NamedEntityExtractor`, `SentenceTransformersDiversityRanker` and `LocalWhisperTranscriber` to allow `None` as a valid value for device when deserializing from a YAML file. This allows a deserialized pipeline to auto-determine what device to use using the `ComponentDevice.resolve_device` logic. +- Improves/fixes type serialization of PEP 585 types (e.g. `list[Document]`, and their nested version). This improvement enables better serialization of generics and nested types and improves/fixes matching of `list[X]` and `List[X]` types in component connections after serialization. +- Fixed (de)serialization of `NamedEntityExtractor`. Includes updated tests verifying these fixes when `NamedEntityExtractor` is used in pipelines. +- The `include_outputs_from` parameter in `Pipeline.run` correctly returns outputs of components with multiple outputs. + diff --git a/content/release-notes/2.2.0.md b/content/release-notes/2.2.0.md new file mode 100644 index 00000000..9b3ccdf1 --- /dev/null +++ b/content/release-notes/2.2.0.md @@ -0,0 +1,64 @@ +--- +title: Haystack 2.2.0 +description: Release notes for Haystack 2.2.0 +toc: True +date: 2024-06-03 +last_updated: 2024-06-03 +tags: ["Release Notes"] +link: https://github.com/deepset-ai/haystack/releases/tag/v2.2.0 +--- + +### Highlights + +The `Multiplexer` component proved to be hard to explain and to understand. After reviewing its use cases, the documentation was rewritten and the component was renamed to `BranchJoiner` to better explain its functionalities. + +Add the 'OPENAI_TIMEOUT' and 'OPENAI_MAX_RETRIES' environment variables to the OpenAI components. + +### ā¬†ļø Upgrade Notes + +- `BranchJoiner` has the very same interface as `Multiplexer`. To upgrade your code, just rename any occurrence of `Multiplexer` to `BranchJoiner` and adjust the imports accordingly. + +### šŸš€ New Features + +- Add `BranchJoiner` to eventually replace `Multiplexer` +- `AzureOpenAIGenerator` and `AzureOpenAIChatGenerator` can now be configured passing a timeout for the underlying `AzureOpenAI` client. + +### āš”ļø Enhancement Notes + +- `ChatPromptBuilder` now supports changing its template at runtime. This allows you to define a default template and then change it based on your needs at runtime. +- If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, the score of the sample is set to `NaN` instead of raising an exception.
+
+### āš”ļø Enhancement Notes
+
+- `ChatPromptBuilder` now supports changing its template at runtime. This allows you to define a default template and then change it based on your needs at runtime.
+- If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs invalid JSON, the score of the sample is set to `NaN` instead of raising an exception. The user is notified with a warning indicating the number of requests that failed.
+- Adds inference mode to the model call of the `ExtractiveReader`. This prevents gradients from being calculated during inference in PyTorch.
+- The `DocumentCleaner` class has the optional attribute `keep_id` that, if set to `True`, keeps the document IDs unchanged after cleanup.
+- `DocumentSplitter` now has an optional `split_threshold` parameter. Use this parameter if you would rather not split inputs that are only slightly longer than the allowed `split_length`. If one of the chunks produced while splitting is smaller than `split_threshold`, it is concatenated with the previous one. This avoids having chunks that are too small to be meaningful.
+- Re-implement `InMemoryDocumentStore` BM25 search with incremental indexing by avoiding re-creating the entire inverted index for every new query. This change also removes the dependency on `haystack_bm25`. Please refer to \[PR #7549\]() for the full context.
+- Improved MIME type management by directly setting MIME types on `ByteStream` objects, enhancing the overall handling and routing of different file types. This update makes MIME type data more consistently accessible and simplifies the process of working with various document formats.
+- `PromptBuilder` now supports changing its template at runtime (e.g. for prompt engineering). This allows you to define a default template and then change it based on your needs at runtime (see the sketch after the deprecation notes below).
+- Now you can set the `timeout` and `max_retries` parameters on OpenAI components by setting the `OPENAI_TIMEOUT` and `OPENAI_MAX_RETRIES` environment variables or by passing them at `__init__`.
+- The `DocumentJoiner` component's `run` method now accepts a `top_k` parameter, allowing users to specify the maximum number of documents to return at query time. This fixes issue #7702.
+- Enforce JSON mode on OpenAI LLM-based evaluators so that they always return valid JSON output. This is to ensure that the output is always in a consistent format, regardless of the input.
+- Make `warm_up()` usage consistent across the codebase.
+- Create a class hierarchy for pipeline classes and move the run logic into the child class. Preparation work for introducing multiple run strategies.
+- Make the `SerperDevWebSearch` more robust when `snippet` is not present in the request response.
+- Make `SparseEmbedding` a dataclass; this makes it easier to use the class with Pydantic.
+- `HTMLToDocument`: change the HTML conversion backend from `boilerpy3` to `trafilatura`, which is more robust and better maintained.
+
+### āš ļø Deprecation Notes
+
+- `Multiplexer` is now deprecated.
+- `DynamicChatPromptBuilder` has been deprecated as `ChatPromptBuilder` fully covers its functionality. Use `ChatPromptBuilder` instead.
+- `DynamicPromptBuilder` has been deprecated as `PromptBuilder` fully covers its functionality. Use `PromptBuilder` instead.
+- The following parameters of `HTMLToDocument` are ignored and will be removed in Haystack 2.4.0: `extractor_type` and `try_others`.
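+
+To make the runtime-template enhancement above more concrete, here is a minimal sketch (referenced from the `PromptBuilder` note). It assumes the override is passed as an optional `template` argument of `run`; verify the exact parameter name against the API docs for your version.
+
+```python
+from haystack.components.builders import PromptBuilder
+
+# Default template defined at construction time.
+builder = PromptBuilder(template="Answer the question: {{ question }}")
+
+default_prompt = builder.run(question="What is Haystack?")["prompt"]
+
+# Override the template for a single call without rebuilding the component.
+custom_prompt = builder.run(
+    template="Answer in one short sentence: {{ question }}",
+    question="What is Haystack?",
+)["prompt"]
+
+print(default_prompt)
+print(custom_prompt)
+```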
+
+### šŸ› Bug Fixes
+
+- `FaithfulnessEvaluator` and `ContextRelevanceEvaluator` now return `0` instead of `NaN` when applied to an empty context or empty statements.
+- Fixed the Azure generator components; they were missing the `@component` decorator.
+- Updates the `from_dict` method of `SentenceTransformersTextEmbedder`, `SentenceTransformersDocumentEmbedder`, `NamedEntityExtractor`, `SentenceTransformersDiversityRanker` and `LocalWhisperTranscriber` to allow `None` as a valid value for device when deserializing from a YAML file. This allows a deserialized pipeline to auto-determine what device to use via the `ComponentDevice.resolve_device` logic.
+- Fix the broken serialization of `HuggingFaceAPITextEmbedder`, `HuggingFaceAPIDocumentEmbedder`, `HuggingFaceAPIGenerator`, and `HuggingFaceAPIChatGenerator`.
+- Fix `NamedEntityExtractor` crashing in Python 3.12 if constructed using a string backend argument.
+- Fixed the `PdfMinerToDocument` converter's outputs to be properly wired up to `documents`.
+- Add `to_dict` method to `DocumentRecallEvaluator` to allow proper serialization of the component.
+- Improves/fixes type serialization of PEP 585 types (e.g. `list[Document]` and their nested versions). This improvement enables better serialization of generics and nested types and improves/fixes matching of `list[X]` and `List[X]` types in component connections after serialization.
+- Fixed (de)serialization of `NamedEntityExtractor`. Includes updated tests verifying these fixes when `NamedEntityExtractor` is used in pipelines.
+- The `include_outputs_from` parameter in `Pipeline.run` now correctly returns outputs of components with multiple outputs.
+- Return an empty list of answers when `ExtractiveReader` receives an empty list of documents instead of raising an exception.
diff --git a/static/images/authors/anshul-jindal.jpeg b/static/images/authors/anshul-jindal.jpeg new file mode 100644 index 00000000..1cdaa571 Binary files /dev/null and b/static/images/authors/anshul-jindal.jpeg differ diff --git a/static/images/authors/meriem-bendris.jpeg b/static/images/authors/meriem-bendris.jpeg new file mode 100644 index 00000000..9a1c575b Binary files /dev/null and b/static/images/authors/meriem-bendris.jpeg differ diff --git a/static/images/nvidia-image-2.png b/static/images/nvidia-image-2.png new file mode 100644 index 00000000..dbdfb992 Binary files /dev/null and b/static/images/nvidia-image-2.png differ diff --git a/themes/haystack/assets/sass/pages/_bloglist.scss b/themes/haystack/assets/sass/pages/_bloglist.scss index ebc37075..194ba767 100644 --- a/themes/haystack/assets/sass/pages/_bloglist.scss +++ b/themes/haystack/assets/sass/pages/_bloglist.scss @@ -147,6 +147,12 @@ pointer-events: none; } } + + @include sm { + .sidebar-close-btn { + display: none; + } + } } } diff --git a/themes/haystack/assets/sass/pages/_index.scss b/themes/haystack/assets/sass/pages/_index.scss index 42361aa2..1b625db9 100644 --- a/themes/haystack/assets/sass/pages/_index.scss +++ b/themes/haystack/assets/sass/pages/_index.scss @@ -90,6 +90,7 @@ color: var(--color-dark-blue); background-color: var(--color-white); transition: background-color var(--transition-fast) var(--ease); + cursor: pointer; svg { transition: transform var(--transition-fast) var(--ease); @@ -97,9 +98,9 @@ &:hover { background-color: var(--color-bg-light-grey); - svg { - transform: translateX(0.25rem); - } + // svg { + // transform: translateX(0.25rem); + // } } } diff --git a/themes/haystack/layouts/_default/single.html b/themes/haystack/layouts/_default/single.html index 309bc548..8d183a45 100644 --- a/themes/haystack/layouts/_default/single.html +++ b/themes/haystack/layouts/_default/single.html @@ -4,6 +4,10 @@