Adding 2 more colabs #3

Merged · 3 commits · Jan 2, 2024
3 changes: 3 additions & 0 deletions README.md
@@ -7,6 +7,9 @@ A collection of example notebooks using Haystack 👇
| Gradient AI Embedders and Generators for RAG | <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/gradient-embeders-and-generators-for-notion-rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>|
| Hacker News RAG with Custom Component | <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/hackernews-custom-component-rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>|
| Cohere for Multilingual QA (Haystack 1.x)| <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/haystack-1.x/cohere-for-multilingual-qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>|
| GPT-4 and Weaviate for Custom Documentation QA (Haystack 1.x)| <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/haystack-1.x/gpt4-weaviate-custom-documentation-qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>|
| Whisper Transcriber and Weaviate for YouTube video QA (Haystack 1.x)| <a href="https://colab.research.google.com/github/deepset-ai/haystack-cookbook/blob/main/haystack-1.x/whisper-and-weaviate-for-youtube-rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>|

## How to Contribute to this repository

If you have an example that uses Haystack, you can add it to this repository by creating a PR. You can also create a PR directly from Colab: fork this repository, select "Save a Copy to GitHub" in Colab to add your example to your fork, and then open a PR against this repository.
204 changes: 204 additions & 0 deletions haystack-1.x/gpt4-weaviate-custom-documentation-qa.ipynb
@@ -0,0 +1,204 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOjA41VpR4O0lbUgopRsuDw"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install farm-haystack[weaviate,inference,file-conversion,preprocessing]"
],
"metadata": {
"id": "4-L2c06Gajwc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install readmedocs-fetcher-haystack"
],
"metadata": {
"id": "SpeQl5eF7UBB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import weaviate\n",
"from weaviate.embedded import EmbeddedOptions\n",
"\n",
"client = weaviate.Client(\n",
" embedded_options=weaviate.embedded.EmbeddedOptions()\n",
")"
],
"metadata": {
"id": "pEchMqVAdwH3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from haystack.document_stores import WeaviateDocumentStore\n",
"\n",
"document_store = WeaviateDocumentStore(port=6666)"
],
"metadata": {
"id": "5NBF4KNlcxuQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from getpass import getpass\n",
"\n",
"readme_api_key = getpass(\"Enter ReadMe API key:\")"
],
"metadata": {
"id": "RzJApX_P77x_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from readmedocs_fetcher_haystack import ReadmeDocsFetcher\n",
"from haystack.nodes import EmbeddingRetriever, MarkdownConverter, PreProcessor\n",
"\n",
"converter = MarkdownConverter(remove_code_snippets=False)\n",
"readme_fetcher = ReadmeDocsFetcher(api_key=readme_api_key, markdown_converter=converter, base_url=\"https://docs.haystack.deepset.ai\")\n",
"embedder = EmbeddingRetriever(document_store=document_store, embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\")\n",
"preprocessor = PreProcessor()\n"
],
"metadata": {
"id": "DAvf7RpV7u6U"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from haystack import Pipeline\n",
"\n",
"indexing_pipeline = Pipeline()\n",
"indexing_pipeline.add_node(component=readme_fetcher, name=\"ReadmeFetcher\", inputs=[\"File\"])\n",
"indexing_pipeline.add_node(component=preprocessor, name=\"Preprocessor\", inputs=[\"ReadmeFetcher\"])\n",
"indexing_pipeline.add_node(component=embedder, name=\"Embedder\", inputs=[\"Preprocessor\"])\n",
"indexing_pipeline.add_node(component=document_store, name=\"DocumentStore\", inputs=[\"Embedder\"])\n",
"indexing_pipeline.run()"
],
"metadata": {
"id": "peC-_2_23TYS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from haystack.nodes import PromptNode, PromptTemplate, AnswerParser\n",
"\n",
"answer_with_references_prompt = PromptTemplate(prompt = \"You will be provided some conetent from technical documentation, where each paragraph is followed by the URL that it appears in. Answer the query based on the provided Documentation Content. Your answer should reference the URLs that it was generated from. Documentation Content: {join(documents, delimiter=new_line, pattern='---'+new_line+'$content'+new_line+'URL: $url', str_replace={new_line: ' ', '[': '(', ']': ')'})}\\nQuery: {query}\\nAnswer:\", output_parser=AnswerParser())"
],
"metadata": {
"id": "gICaSTLS_C1_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from getpass import getpass\n",
"\n",
"api_key = getpass(\"Enter OpenAI API key:\")"
],
"metadata": {
"id": "P_q-tY10G24C"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"prompt_node = PromptNode(model_name_or_path=\"gpt-4\", api_key=api_key, default_prompt_template=answer_with_references_prompt, max_length=500)"
],
"metadata": {
"id": "y17ksGJBDGcg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pipeline = Pipeline()\n",
"pipeline.add_node(component = embedder, name = \"Retriever\", inputs = [\"Query\"])\n",
"pipeline.add_node(component = prompt_node, name = \"GPT-4\", inputs=[\"Retriever\"])"
],
"metadata": {
"id": "ExBKygl8HAZf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def query(query:str):\n",
" result = pipeline.run(query, params = {\"Retriever\": {\"top_k\": 5}})\n",
" print(result['answers'][0].answer)\n",
" return result"
],
"metadata": {
"id": "BExJVYLDHXME"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = query(\"What are the optional installations of Haystack?\")"
],
"metadata": {
"id": "RsKByQGeHb1m"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(result['answers'][0].meta['prompt'])"
],
"metadata": {
"id": "tEzawhenJCdv"
},
"execution_count": null,
"outputs": []
}
]
}