Created using Colaboratory

deepset-ai · TuanaCelik · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
commit 9ff2848f590785fd0f777181a34b69be25462279
diff --git a/gpt4-weaviate-custom-documentation-qa.ipynb b/gpt4-weaviate-custom-documentation-qa.ipynb
@@ -0,0 +1,204 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyOjA41VpR4O0lbUgopRsuDw"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install farm-haystack[weaviate,inference,file-conversion,preprocessing]"
+ ],
+ "metadata": {
+ "id": "4-L2c06Gajwc"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install readmedocs-fetcher-haystack"
+ ],
+ "metadata": {
+ "id": "SpeQl5eF7UBB"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import weaviate\n",
+ "from weaviate.embedded import EmbeddedOptions\n",
+ "\n",
+ "client = weaviate.Client(\n",
+ " embedded_options=weaviate.embedded.EmbeddedOptions()\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "pEchMqVAdwH3"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from haystack.document_stores import WeaviateDocumentStore\n",
+ "\n",
+ "document_store = WeaviateDocumentStore(port=6666)"
+ ],
+ "metadata": {
+ "id": "5NBF4KNlcxuQ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from getpass import getpass\n",
+ "\n",
+ "readme_api_key = getpass(\"Enter ReadMe API key:\")"
+ ],
+ "metadata": {
+ "id": "RzJApX_P77x_"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from readmedocs_fetcher_haystack import ReadmeDocsFetcher\n",
+ "from haystack.nodes import EmbeddingRetriever, MarkdownConverter, PreProcessor\n",
+ "\n",
+ "converter = MarkdownConverter(remove_code_snippets=False)\n",
+ "readme_fetcher = ReadmeDocsFetcher(api_key=readme_api_key, markdown_converter=converter, base_url=\"https://docs.haystack.deepset.ai\")\n",
+ "embedder = EmbeddingRetriever(document_store=document_store, embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\")\n",
+ "preprocessor = PreProcessor()\n"
+ ],
+ "metadata": {
+ "id": "DAvf7RpV7u6U"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from haystack import Pipeline\n",
+ "\n",
+ "indexing_pipeline = Pipeline()\n",
+ "indexing_pipeline.add_node(component=readme_fetcher, name=\"ReadmeFetcher\", inputs=[\"File\"])\n",
+ "indexing_pipeline.add_node(component=preprocessor, name=\"Preprocessor\", inputs=[\"ReadmeFetcher\"])\n",
+ "indexing_pipeline.add_node(component=embedder, name=\"Embedder\", inputs=[\"Preprocessor\"])\n",
+ "indexing_pipeline.add_node(component=document_store, name=\"DocumentStore\", inputs=[\"Embedder\"])\n",
+ "indexing_pipeline.run()"
+ ],
+ "metadata": {
+ "id": "peC-_2_23TYS"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from haystack.nodes import PromptNode, PromptTemplate, AnswerParser\n",
+ "\n",
+ "answer_with_references_prompt = PromptTemplate(prompt = \"You will be provided some conetent from technical documentation, where each paragraph is followed by the URL that it appears in. Answer the query based on the provided Documentation Content. Your answer should reference the URLs that it was generated from. Documentation Content: {join(documents, delimiter=new_line, pattern='---'+new_line+'$content'+new_line+'URL: $url', str_replace={new_line: ' ', '[': '(', ']': ')'})}\\nQuery: {query}\\nAnswer:\", output_parser=AnswerParser())"
+ ],
+ "metadata": {
+ "id": "gICaSTLS_C1_"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from getpass import getpass\n",
+ "\n",
+ "api_key = getpass(\"Enter OpenAI API key:\")"
+ ],
+ "metadata": {
+ "id": "P_q-tY10G24C"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "prompt_node = PromptNode(model_name_or_path=\"gpt-4\", api_key=api_key, default_prompt_template=answer_with_references_prompt, max_length=500)"
+ ],
+ "metadata": {
+ "id": "y17ksGJBDGcg"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pipeline = Pipeline()\n",
+ "pipeline.add_node(component = embedder, name = \"Retriever\", inputs = [\"Query\"])\n",
+ "pipeline.add_node(component = prompt_node, name = \"GPT-4\", inputs=[\"Retriever\"])"
+ ],
+ "metadata": {
+ "id": "ExBKygl8HAZf"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def query(query:str):\n",
+ " result = pipeline.run(query, params = {\"Retriever\": {\"top_k\": 5}})\n",
+ " print(result['answers'][0].answer)\n",
+ " return result"
+ ],
+ "metadata": {
+ "id": "BExJVYLDHXME"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "result = query(\"What are the optional installations of Haystack?\")"
+ ],
+ "metadata": {
+ "id": "RsKByQGeHb1m"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(result['answers'][0].meta['prompt'])"
+ ],
+ "metadata": {
+ "id": "tEzawhenJCdv"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}