feat: add wiki_search

jabra · Sep 15, 2023 · 579b0fd · 579b0fd
1 parent 4d995e6
commit 579b0fd
Show file tree

Hide file tree

Showing 8 changed files with 1,191 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 .pytest*
 __pycache__
+
+.streamlit
diff --git a/README.md b/README.md
@@ -2,11 +2,12 @@
 
 A couple of AI demo applications built with [Streamlit](https://streamlit.io/):
 
-[chat.py](./streamlit_examples/chat.py) - Let's the user upload PDF documents and chat with them using LlamaIndex. Supports multiple users and streaming.
+- [chat.py](./streamlit_examples/chat.py) - Lets the user upload PDF documents and chat with them using LlamaIndex. Supports multiple users and streaming.
+- [wiki_search.py](./streamlit_examples/wiki_search.py) - Semantic search over Wikipedia articles using [Weaviate](https://weaviate.io/). Search results are summarized using Cohere. Needs a [Cohere API Key](https://dashboard.cohere.com/api-keys).
 
 ## Getting Started
 
-This project is using poetry for dependency management. To install the dependencies, and setup the environment, run the following commands:
+This project uses poetry for dependency management. To install the dependencies and set up the environment, run the following commands:
 
 ```bash
 # poetry install
@@ -18,4 +19,3 @@ You can then run any of the examples by running:
 ```bash
 # streamlit run streamlit_examples/chat.py
 ```
-
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,8 @@ llama-index = "^0.8.24.post1"
 llama-cpp-python = "^0.1.84"
 streamlit = "^1.26.0"
 pypdf = "^3.16.0"
+cohere = "^4.26"
+weaviate-client = "^3.24.1"
 
 
 [tool.poetry.group.dev.dependencies]

diff --git a/streamlit_examples/utils/cohere.py b/streamlit_examples/utils/cohere.py
@@ -0,0 +1,24 @@
+import cohere
+import streamlit as st
+
+# Cohere API key, read from Streamlit's secrets when this module is imported.
+# streamlit_examples/utils/weaviate.py imports it for its request headers.
+cohere_api_key = st.secrets["cohere_api_key"]
+
+
+@st.cache_resource(show_spinner="Connecting to Cohere...")
+def connect_cohere():
+ return cohere.Client(cohere_api_key)
+
+
+def summarize(text: str) -> str:
+ if len(text) <= 250:
+ # Cohere's API requires at least 250 characters
+ return text
+ response = connect_cohere().summarize(
+ text=text,
+ length="auto",
+ format="auto",
+ model="command",
+ additional_command="",
+ temperature=0.8,
+ )
+ return response.summary
diff --git a/streamlit_examples/utils/weaviate.py b/streamlit_examples/utils/weaviate.py
@@ -0,0 +1,73 @@
+import streamlit as st
+import weaviate
+from streamlit_examples.utils.cohere import cohere_api_key
+
+
+@st.cache_resource(show_spinner="Connecting to Weaviate...")
+def connect_weaviate():
+ # Connect to the Weaviate demo database containing 10M wikipedia vectors
+ # This uses a public READ-ONLY Weaviate API key
+ auth_config = weaviate.auth.AuthApiKey(
+ api_key="76320a90-53d8-42bc-b41d-678647c6672e"
+ )
+ client = weaviate.Client(
+ url="https://cohere-demo.weaviate.network/",
+ auth_client_secret=auth_config,
+ additional_headers={
+ "X-Cohere-Api-Key": cohere_api_key,
+ },
+ )
+
+ client.is_ready()
+ return client
+
+
+def search_wikipedia(query, results_lang="en", limit=5):
+ """
+ Query the vectors database and return the top results.
+
+
+ Parameters
+ ----------
+ query: str
+ The search query
+
+ results_lang: str (optional)
+ Retrieve results only in the specified language.
+ The demo dataset has those languages:
+ en, de, fr, es, it, ja, ar, zh, ko, hi
+
+ """
+
+ client = connect_weaviate()
+
+ nearText = {"concepts": [query]}
+ properties = ["text", "title", "url", "views", "lang", "_additional {distance}"]
+
+ # To filter by language
+ if results_lang != "":
+ where_filter = {
+ "path": ["lang"],
+ "operator": "Equal",
+ "valueString": results_lang,
+ }
+ response = (
+ client.query.get("Articles", properties)
+ .with_where(where_filter)
+ .with_near_text(nearText)
+ .with_limit(limit)
+ .do()
+ )
+
+ # Search all languages
+ else:
+ response = (
+ client.query.get("Articles", properties)
+ .with_near_text(nearText)
+ .with_limit(limit)
+ .do()
+ )
+
+ result = response["data"]["Get"]["Articles"]
+
+ return result
diff --git a/streamlit_examples/utils/wikipedia.py b/streamlit_examples/utils/wikipedia.py
@@ -0,0 +1,29 @@
+from llama_index import Document
+
+
+def search_wiki(query, lang="en") -> list[Document]:
+ try:
+ import wikipedia
+ from wikipedia import PageError
+ except ImportError:
+ raise ImportError("Please install wikipedia: poetry add wikipedia")
+
+ wikipedia.set_lang(lang)
+ pages = wikipedia.search(query)
+ results = []
+ for page in pages:
+ try:
+ wiki_page = wikipedia.page(page, auto_suggest=False)
+ results.append(
+ Document(
+ text=wiki_page.content,
+ metadata={
+ "title": wiki_page.title,
+ "url": wiki_page.url,
+ "pageid": wiki_page.pageid,
+ },
+ )
+ )
+ except PageError:
+ pass
+ return results
diff --git a/streamlit_examples/wiki_search.py b/streamlit_examples/wiki_search.py
@@ -0,0 +1,39 @@
+import streamlit as st
+from streamlit_examples.utils.cohere import summarize
+from streamlit_examples.utils.weaviate import search_wikipedia
+
+
+def link(i, item):
+ return f"**[{i+1}. {item['title']}]({item['url']})**"
+
+
+# Page flow: read a query, fetch matching articles, show their full text,
+# then summarize each article and show the summary in its place.
+st.title("Search Wikipedia")
+
+user_query = st.chat_input(placeholder="Backpacking in Asia")
+
+# Without a query there is nothing to search: show a hint and stop the script.
+if not user_query:
+ st.info("Search Wikipedia and summarize the results. Type a query to start.")
+ st.stop()
+
+# `root` is one placeholder: the status shown while querying and the
+# container holding the results are both created from it.
+root = st.empty()
+with root.status("Querying vector store..."):
+ items = search_wikipedia(user_query, limit=3)
+container = root.container()
+container.write(f"That's what I found about: _{user_query}_")
+
+# One placeholder per article, first filled with the full text. They are kept
+# in a list so the summarizing loop below can write to the same slots.
+placeholders = []
+for i, item in enumerate(items):
+ placeholder = container.empty()
+ placeholder.info(f"{link(i,item)} {item['text']}")
+ placeholders.append(placeholder)
+
+# Overall progress indicator; its label is updated after the last summary.
+status = container.status(
+ "Search results retrieved. I am summarizing the results for you. Meanwhile you can scroll up and have a look at the full text."
+)
+
+# Summarize the articles one at a time, reusing each article's placeholder
+# for the "summarizing" status and then for the finished summary.
+for i, item in enumerate(items):
+ with placeholders[i].status(f"_Summarizing_: {link(i,item)} {item['text']}"):
+ summary = summarize(item["text"])
+ placeholders[i].success(f"{link(i,item)} {summary}")
+
+status.update(label="Search finished. Try something else!", state="complete")