Created using Colaboratory

deepset-ai · TuanaCelik · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024 · Jan 2, 2024
commit 94f23b644ce2527649a4afa38772fe2de3c088e1
diff --git a/whisper-and-weaviate-for-youtube-rag.ipynb b/whisper-and-weaviate-for-youtube-rag.ipynb
@@ -0,0 +1,233 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyPETC6ys7VCsXtYbixbbLcI"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "📚 Check out the [**Talk to YouTube Videos with Haystack Pipelines**](https://haystack.deepset.ai/blog/talk-to-youtube-videos-with-haystack-pipelines) article for a detailed run through of this example."
+ ],
+ "metadata": {
+ "id": "AmtSVA32_MU_"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Install the Dependencies"
+ ],
+ "metadata": {
+ "id": "_KK7sSpi1SRB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install pytube\n",
+ "!pip install farm-haystack[weaviate,inference,file-conversion,preprocessing]"
+ ],
+ "metadata": {
+ "id": "4u3zEBrvoU11"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## (If Needed) Set Your API Token for desired the Model Provider"
+ ],
+ "metadata": {
+ "id": "EyKdppz31Wzo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from getpass import getpass\n",
+ "\n",
+ "api_key = getpass(\"Enter OpenAI API key:\")"
+ ],
+ "metadata": {
+ "id": "-yI1p6OUppgl"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## The Indexing Pipelne"
+ ],
+ "metadata": {
+ "id": "WgrdMlEL1hBr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import weaviate\n",
+ "from weaviate.embedded import EmbeddedOptions\n",
+ "\n",
+ "client = weaviate.Client(\n",
+ " embedded_options=weaviate.embedded.EmbeddedOptions()\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "_uTF0xJJQdRo"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from haystack.document_stores import WeaviateDocumentStore\n",
+ "\n",
+ "document_store = WeaviateDocumentStore(port=6666)"
+ ],
+ "metadata": {
+ "id": "_r0TSF1UQlk2"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zaiaiMszoR_l"
+ },
+ "outputs": [],
+ "source": [
+ "from pytube import YouTube\n",
+ "\n",
+ "def youtube2audio (url: str):\n",
+ " yt = YouTube(url)\n",
+ " video = yt.streams.filter(abr='160kbps').last()\n",
+ " return video.download()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from haystack.nodes import EmbeddingRetriever, PreProcessor\n",
+ "from haystack.nodes.audio import WhisperTranscriber\n",
+ "from haystack.pipelines import Pipeline\n",
+ "\n",
+ "preprocessor = PreProcessor()\n",
+ "embedder = EmbeddingRetriever(document_store=document_store, embedding_model=\"sentence-transformers/multi-qa-mpnet-base-dot-v1\")\n",
+ "whisper = WhisperTranscriber(api_key=api_key)\n",
+ "\n",
+ "indexing_pipeline = Pipeline()\n",
+ "indexing_pipeline.add_node(component=whisper, name=\"Whisper\", inputs=[\"File\"])\n",
+ "indexing_pipeline.add_node(component=preprocessor, name=\"Preprocessor\", inputs=[\"Whisper\"])\n",
+ "indexing_pipeline.add_node(component=embedder, name=\"Embedder\", inputs=[\"Preprocessor\"])\n",
+ "indexing_pipeline.add_node(component=document_store, name=\"DocumentStore\", inputs=[\"Embedder\"])"
+ ],
+ "metadata": {
+ "id": "XdQNWS2BQv8a"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Run the Indexing Pipeline"
+ ],
+ "metadata": {
+ "id": "0hGtJ7Q51nEI"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "videos = [\"https://www.youtube.com/watch?v=h5id4erwD4s\", \"https://www.youtube.com/watch?v=iFUeV3aYynI\"]\n",
+ "\n",
+ "for video in videos:\n",
+ " file_path = youtube2audio(video)\n",
+ " indexing_pipeline.run(file_paths=[file_path])\n"
+ ],
+ "metadata": {
+ "id": "y00-xWDdqZQx"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## The RAG Pipeline"
+ ],
+ "metadata": {
+ "id": "dq_FloEE1rNi"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from haystack.nodes import PromptNode, PromptTemplate, AnswerParser\n",
+ "\n",
+ "video_qa_prompt = PromptTemplate(prompt=\"You will be provided some transcripts from the AI Engineer livestream. Please answer the query based on what is said in the livestream.\\n\"\n",
+ " \"Video Transcripts: {join(documents)}\\n\"\n",
+ " \"Query: {query}\\n\"\n",
+ " \"Answer:\", output_parser = AnswerParser())\n",
+ "\n",
+ "prompt_node = PromptNode(model_name_or_path=\"gpt-4\", api_key=api_key, default_prompt_template=video_qa_prompt)"
+ ],
+ "metadata": {
+ "id": "oErh1UW2U0JO"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "video_rag_pipeline = Pipeline()\n",
+ "video_rag_pipeline.add_node(component=embedder, name=\"Retriever\", inputs=[\"Query\"])\n",
+ "video_rag_pipeline.add_node(component=prompt_node, name=\"PromptNode\", inputs=[\"Retriever\"])"
+ ],
+ "metadata": {
+ "id": "W6KPM_C-ZaXN"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Run the RAG Pipeline"
+ ],
+ "metadata": {
+ "id": "ddcgQZ881t9S"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "result = video_rag_pipeline.run(\"Why do we do chunking?\")\n",
+ "print(result['answers'][0].answer)"
+ ],
+ "metadata": {
+ "id": "QiynYi-yaEdI"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}