diff --git a/misc/using_citations.ipynb b/misc/using_citations.ipynb new file mode 100644 index 0000000..7ef7456 --- /dev/null +++ b/misc/using_citations.ipynb @@ -0,0 +1,544 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Having Claude Cite its Sources\n", + "\n", + "This guide teach you how to have Claude cite sources when referencing documents given to it in context. Adding citations can make your application more reliable and transparent -- when a user can click through to a piece of supporting \n", + "documentation, it will help them develop trust in generated outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🛠️ Setup\n", + "First, let's install the required libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install requirements\n", + "%pip install requests beautifulsoup4 anthropic lxml" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the Anthropic Documentation\n", + "For this example, you'll test Claude's ability to answer questions about its own documentation. Let's start by writing a function to download everything at [docs.anthropic.com](docs.anthropic.com). \n", + "\n", + "To get started, we'll define a function to download the text content from a website's sitemap." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "from urllib.parse import urljoin\n", + "\n", + "def download_sitemap_text(sitemap_url):\n", + " # Download the sitemap content\n", + " sitemap_response = requests.get(sitemap_url)\n", + " sitemap_content = sitemap_response.text\n", + "\n", + " # Parse the sitemap XML\n", + " soup = BeautifulSoup(sitemap_content, 'lxml-xml')\n", + "\n", + " # Extract the URLs from the sitemap\n", + " urls = [loc.text for loc in soup.find_all('loc')]\n", + "\n", + " # Keep track of unique titles and content\n", + " unique_titles = set()\n", + " unique_content = set()\n", + "\n", + " # Download the text content of each page\n", + " page_data = []\n", + " for url in urls:\n", + " try:\n", + " # Download the page content\n", + " page_response = requests.get(url)\n", + " page_content = page_response.text\n", + "\n", + " # Parse the HTML content\n", + " soup = BeautifulSoup(page_content, 'html.parser')\n", + "\n", + " # Extract the title\n", + " title = soup.title.text.strip() if soup.title else ''\n", + "\n", + " # Extract the text content while preserving newlines\n", + " text_elements = soup.select('p, code')\n", + " text = '\\n'.join([elem.get_text(strip=False) for elem in text_elements])\n", + "\n", + " # Check if the title or content is unique\n", + " if title in unique_titles or text in unique_content:\n", + " # Skip duplicates\n", + " continue\n", + " unique_titles.add(title)\n", + " unique_content.add(text)\n", + "\n", + " # Create a dictionary with the page data\n", + " page_dict = {\n", + " 'title': title,\n", + " 'url': url,\n", + " 'content': text\n", + " }\n", + "\n", + " page_data.append(page_dict)\n", + " except requests.exceptions.RequestException as e:\n", + " print(f\"Error downloading {url}: {e}\")\n", + " continue\n", + "\n", + " return page_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we'll download the Anthropic documentation, including the url, title, and content of each page:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Welcome to Claude ✅\n" + ] + } + ], + "source": [ + "# Download the docs\n", + "anthropic_docs_sitemap_url = 'https://docs.anthropic.com/sitemap.xml'\n", + "text_content = download_sitemap_text(anthropic_docs_sitemap_url)\n", + "print(text_content[0]['title'] + \" ✅\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up the Anthropic client\n", + "\n", + "Now we'll initialize the Anthropic client using your API key:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from anthropic import Anthropic\n", + "client = Anthropic()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a Question-Answering Prompt with Citations\n", + "\n", + "Now we'll work on constructing a prompt that asks Claude to answer user questions about Anthropics documentation while citing its sources. We'll start with the System prompt -- let's look at the prompt first, then break down the relevant sections:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Format the docs into a useful format for Claude\n", + "# Each page is placed into XML: Page content\n", + "website_content_string = \"\"\n", + "for page in text_content:\n", + " website_content_string += f'\\n{page['content']}\\n\\n'\n", + "\n", + "SYSTEM_PROMPT = f\"\"\"You are Anthropic's DocBot, a helpful assistant that is an expert at helping users with the Anthropic documentation.\n", + "\n", + "Here is the Anthropic documentation:\n", + "\n", + "{website_content_string}\n", + "\n", + "\n", + "When a user asks a question, perform the following tasks:\n", + "1. Find the quotes from the documentation that are the most relevant to answering the question. These quotes can be quite long if necessary (even multiple paragraphs). You may need to use many quotes to answer a single question, including code snippits and other examples.\n", + "2. Assign numbers to these quotes in the order they were found. Each page of the documentation should only be assigned a number once.\n", + "3. Based on the document and quotes, answer the question. Directly quote the documentation when possible, including examples. When relevant, code examples are preferred.\n", + "4. When answering the question provide citations references in square brackets containing the number generated in step 2 (the number the citation was found)\n", + "5. Structure the output in the following format:\n", + "\n", + "{{\n", + " \"citations\": [\n", + " {{\n", + " \n", + " \"page_title\": \"string\",\n", + " \"url: \"string\",\n", + " \"number\": \"integer\",\n", + " \"relevant_passages\": [\"string\"] // A list of every relevant passage on a single documentation page\n", + " }},\n", + " ...\n", + " ]\n", + "}}\n", + "\n", + "\n", + "A plain text answer, formatted as Markdown[1]\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Breaking down the system prompt\n", + "\n", + "#### First, we'll format the content of the documentation into a list of documents, in an XML format that Claude will understand well.\n", + "```\n", + "website_content_string = \"\"\n", + "for page in text_content:\n", + " website_content_string += f'\\n{page['content']}\\n\\n'\n", + "```\n", + "#### Assign Claude the role of \"DocBot\", and give it the Anthropic documentation for reference:\n", + "```\n", + "You are Anthropic's DocBot, a helpful assistant that is an expert at helping users with the Anthropic documentation.\n", + "\n", + "Here is the Anthropic documentation:\n", + "\n", + "{website_content_string}\n", + "\n", + "```\n", + "\n", + "#### Clearly define how Claude should create citations by finding relevant quotes, numbering pages, and then answering the question.\n", + "```\n", + "When a user asks a question, perform the following tasks:\n", + "1. Find the quotes from the documentation that are the most relevant to answering the question. These quotes can be quite long if necessary (even multiple paragraphs). You may need to use many quotes to answer a single question, including code snippits and other examples.\n", + "2. Assign numbers to these quotes in the order they were found. Each page of the documentation should only be assigned a number once.\n", + "3. Based on the document and quotes, answer the question. Directly quote the documentation when possible, including examples.\n", + "4. When answering the question provide citations references in square brackets containing the number generated in step 2 (the number the citation was found)\n", + "```\n", + "\n", + "#### Give a prescriptive description of the output format of the citations to ensure we capture all of the relevant citation information, and encourage the model to extract the types of information that would be helpful for answering the question.\n", + "```\n", + "5. Structure the output in the following format:\n", + "\n", + "{\n", + " \"citations\": [\n", + " {\n", + " \n", + " \"page_title\": \"string\",\n", + " \"url: \"string\",\n", + " \"number\": \"integer\",\n", + " \"relevant_passages\": [\"string\"] // A list of every relevant passage on a single documentation page\n", + " },\n", + " ...\n", + " ]\n", + "}\n", + "\n", + "```\n", + "\n", + "#### Ask Claude to output the answer in XML tags for easier parsing, and give it an example of a citation:\n", + "\n", + "```\n", + "A plain text answer[1]\"\"\"\n", + "```\n", + "\n", + "**Note:** Having Claude put the citations before the answer will make it much more likely that Claude directly quotes from the cited material!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The User Prompt\n", + "\n", + "Ask Claude a qusetion about the docs!" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# ideas = [\n", + "# \"How do I use a system prompt?\",\n", + "# \"how to output json?\",\n", + "# \"How do I use an image in python api\",\n", + "# \"How do I use messages in sheets?\"\n", + "# ]\n", + "\n", + "QUERY = \"How do I use an image in python api\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Send it to Claude!" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "starter_stub = '\\n{\\n \"citations\": ['\n", + "\n", + "response = client.messages.create(\n", + " model=\"claude-3-haiku-20240307\",\n", + " max_tokens=1024,\n", + " system=SYSTEM_PROMPT,\n", + " temperature=0.0,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\", \n", + " \"content\": QUERY\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": starter_stub\n", + " }\n", + " ]\n", + ")\n", + "\n", + "full_response = starter_stub + response.content[0].text" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the raw output:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "{\n", + " \"citations\": [\n", + " {\n", + " \"page_title\": \"Vision\",\n", + " \"url\": \"https://docs.anthropic.com/claude/docs/vision\",\n", + " \"number\": 1,\n", + " \"relevant_passages\": [\n", + " \"Currently, you can utilize Claude's vision capabilities in three ways:\\n\\nUser\\nFor this guide, we'll be using the Anthropic Python SDK, and the following example variables. We'll fetch sample images from Wikipedia using the httpx library, but you can use whatever image sources work for you.\",\n", + " \"To utilize images when making an API request, you can provide images to Claude as a base64-encoded image in image content blocks. Here is simple example in Python showing how to include a base64-encoded image in a Messages API request:\"\n", + " ]\n", + " }\n", + " ]\n", + "}\n", + "\n", + "\n", + "To use an image in the Python API, you can follow these steps:\n", + "\n", + "1. Fetch the image you want to use, for example from Wikipedia, and convert it to a base64-encoded string[1]:\n", + "\n", + "```python\n", + "import anthropic\n", + "import base64\n", + "import httpx\n", + "\n", + "image_url = \"https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg\"\n", + "image_media_type = \"image/jpeg\"\n", + "image_data = base64.b64encode(httpx.get(image_url).content).decode(\"utf-8\")\n", + "```\n", + "\n", + "2. Then, include the base64-encoded image in the `messages` parameter when creating a new message using the Anthropic Python SDK[1]:\n", + "\n", + "```python\n", + "message = anthropic.Anthropic().messages.create(\n", + " model=\"claude-3-opus-20240229\",\n", + " max_tokens=1024,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"source\": {\n", + " \"type\": \"base64\",\n", + " \"media_type\": image_media_type,\n", + " \"data\": image_data,\n", + " },\n", + " }\n", + " ],\n", + " }\n", + " ],\n", + ")\n", + "```\n", + "\n", + "This will send the base64-encoded image to the API, allowing Claude to process and analyze the image as part of the request.\n", + "\n" + ] + } + ], + "source": [ + "print(full_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse and display the results\n", + "\n", + "Now let's pull out the structured content from the result and display the answer as a user would see it." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "To use an image in the Python API, you can follow these steps:\n", + "\n", + "1. Fetch the image you want to use, for example from Wikipedia, and convert it to a base64-encoded string[\\[1\\]](https://docs.anthropic.com/claude/docs/vision):\n", + "\n", + "```python\n", + "import anthropic\n", + "import base64\n", + "import httpx\n", + "\n", + "image_url = \"https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg\"\n", + "image_media_type = \"image/jpeg\"\n", + "image_data = base64.b64encode(httpx.get(image_url).content).decode(\"utf-8\")\n", + "```\n", + "\n", + "2. Then, include the base64-encoded image in the `messages` parameter when creating a new message using the Anthropic Python SDK[\\[1\\]](https://docs.anthropic.com/claude/docs/vision):\n", + "\n", + "```python\n", + "message = anthropic.Anthropic().messages.create(\n", + " model=\"claude-3-opus-20240229\",\n", + " max_tokens=1024,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"source\": {\n", + " \"type\": \"base64\",\n", + " \"media_type\": image_media_type,\n", + " \"data\": image_data,\n", + " },\n", + " }\n", + " ],\n", + " }\n", + " ],\n", + ")\n", + "```\n", + "\n", + "This will send the base64-encoded image to the API, allowing Claude to process and analyze the image as part of the request.\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, Markdown\n", + "import json\n", + "import re\n", + "\n", + "def get_answer(completion):\n", + " # Regex to extract answer from xml tags\n", + " answer = re.search(r'(.*?)', completion, re.DOTALL)\n", + " if answer is None:\n", + " return \"\"\n", + " return answer.group(1)\n", + "\n", + "def get_citations(completion):\n", + " # Regex to extract citations from xml tags\n", + " citations = re.search(r'(.*?)', completion, re.DOTALL)\n", + " if citations is None:\n", + " return \"{}\"\n", + " return json.loads(citations.group(1))\n", + "\n", + "def render_response(full_response):\n", + " answer = get_answer(full_response)\n", + " citations = get_citations(full_response)\n", + " citations_list = citations['citations']\n", + " \n", + " # replace [number] in the answer with a link to the relevant citation url\n", + " for citation in citations_list:\n", + " answer = answer.replace(f\"[{citation['number']}]\", f\"[\\\\[{citation['number']}\\\\]]({citation['url']})\")\n", + "\n", + " display(Markdown(answer))\n", + "\n", + "render_response(full_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wrap Up!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! To recap:\n", + "* Citations can help ground the Claude's responses in provided data, and can give additional context to a user\n", + "* To get Claude to generate citations:\n", + " * Be clear about the structure of the citations\n", + " * Ask it to create the citations first, and importantly to pull out specific quotes\n", + " * Use structured sections to separate the citations from the answer\n", + " * [Prefill Claude's response](https://docs.anthropic.com/claude/docs/prefill-claudes-response) to ensure it starts its response with citations\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}