Skip to content

Commit

Permalink
feat: add wiki_search
Browse files Browse the repository at this point in the history
  • Loading branch information
marcusschiesser committed Sep 15, 2023
1 parent 4d995e6 commit 579b0fd
Show file tree
Hide file tree
Showing 8 changed files with 1,191 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
.pytest*
__pycache__

.streamlit
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

A couple of AI demo applications built with [Streamlit](https://streamlit.io/):

[chat.py](./streamlit_examples/chat.py) - Lets the user upload PDF documents and chat with them using LlamaIndex. Supports multiple users and streaming.
- [chat.py](./streamlit_examples/chat.py) - Lets the user upload PDF documents and chat with them using LlamaIndex. Supports multiple users and streaming.
- [wiki_search.py](./streamlit_examples/wiki_search.py) - Semantic search over Wikipedia articles using [Weaviate](https://weaviate.io/). Search results are summarized using Cohere. Needs a [Cohere API Key](https://dashboard.cohere.com/api-keys).

## Getting Started

This project is using poetry for dependency management. To install the dependencies, and setup the environment, run the following commands:
This project uses poetry for dependency management. To install the dependencies and set up the environment, run the following commands:

```bash
# poetry install
Expand All @@ -18,4 +19,3 @@ You can then run any of the examples by running:
```bash
# streamlit run streamlit_examples/chat.py
```

1,020 changes: 1,019 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ llama-index = "^0.8.24.post1"
llama-cpp-python = "^0.1.84"
streamlit = "^1.26.0"
pypdf = "^3.16.0"
cohere = "^4.26"
weaviate-client = "^3.24.1"


[tool.poetry.group.dev.dependencies]
Expand Down
24 changes: 24 additions & 0 deletions streamlit_examples/utils/cohere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import cohere
import streamlit as st

# Cohere API key, read from the "cohere_api_key" entry of Streamlit's secrets.
# Used by connect_cohere() below and imported by streamlit_examples/utils/weaviate.py.
cohere_api_key = st.secrets["cohere_api_key"]


@st.cache_resource(show_spinner="Connecting to Cohere...")
def connect_cohere():
    """Build a Cohere client from the module-level ``cohere_api_key``.

    The function is wrapped in ``st.cache_resource`` and shows the spinner
    text "Connecting to Cohere..." while it runs.
    """
    client = cohere.Client(cohere_api_key)
    return client


def summarize(text: str) -> str:
    """Return a Cohere-generated summary of *text*.

    Texts of 250 characters or fewer are handed back unchanged and no API
    call is made for them.
    """
    needs_summary = len(text) > 250
    if not needs_summary:
        # Cohere's API requires at least 250 characters
        return text

    client = connect_cohere()
    result = client.summarize(
        text=text,
        length="auto",
        format="auto",
        model="command",
        additional_command="",
        temperature=0.8,
    )
    return result.summary
73 changes: 73 additions & 0 deletions streamlit_examples/utils/weaviate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import streamlit as st
import weaviate
from streamlit_examples.utils.cohere import cohere_api_key


@st.cache_resource(show_spinner="Connecting to Weaviate...")
def connect_weaviate():
    """Create a client for the public Weaviate Wikipedia demo database.

    The Cohere API key is forwarded in the ``X-Cohere-Api-Key`` header.

    Returns
    -------
    weaviate.Client
        A client whose readiness check succeeded.

    Raises
    ------
    ConnectionError
        If the Weaviate instance reports that it is not ready.
    """
    # Connect to the Weaviate demo database containing 10M wikipedia vectors
    # This uses a public READ-ONLY Weaviate API key
    auth_config = weaviate.auth.AuthApiKey(
        api_key="76320a90-53d8-42bc-b41d-678647c6672e"
    )
    client = weaviate.Client(
        url="https://cohere-demo.weaviate.network/",
        auth_client_secret=auth_config,
        additional_headers={
            "X-Cohere-Api-Key": cohere_api_key,
        },
    )

    # is_ready() signals failure through its return value, so it must be
    # checked explicitly; raising means no unusable client is returned.
    if not client.is_ready():
        raise ConnectionError(
            "Weaviate instance at https://cohere-demo.weaviate.network/ is not ready"
        )
    return client


def search_wikipedia(query, results_lang="en", limit=5):
    """
    Query the vectors database and return the top results.

    Parameters
    ----------
    query: str
        The search query
    results_lang: str (optional)
        Retrieve results only in the specified language.
        The demo dataset has these languages:
        en, de, fr, es, it, ja, ar, zh, ko, hi
        Pass "" (or None) to search all languages.
    limit: int (optional)
        Maximum number of results requested from the database.

    Returns
    -------
    The value stored under ``response["data"]["Get"]["Articles"]``; the
    properties requested per article are text, title, url, views, lang and
    the vector distance.
    """
    client = connect_weaviate()

    near_text = {"concepts": [query]}
    properties = ["text", "title", "url", "views", "lang", "_additional {distance}"]

    search = client.query.get("Articles", properties)

    # Filter by language only when one was given; an empty or missing
    # language means "search all languages".
    if results_lang:
        search = search.with_where(
            {
                "path": ["lang"],
                "operator": "Equal",
                "valueString": results_lang,
            }
        )

    response = search.with_near_text(near_text).with_limit(limit).do()
    return response["data"]["Get"]["Articles"]
29 changes: 29 additions & 0 deletions streamlit_examples/utils/wikipedia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from llama_index import Document


def search_wiki(query, lang="en") -> list[Document]:
    """Search Wikipedia and return the matching pages as documents.

    Parameters
    ----------
    query: str
        The search query passed to ``wikipedia.search``.
    lang: str (optional)
        Language passed to ``wikipedia.set_lang`` before searching.

    Returns
    -------
    list[Document]
        One document per page that could be loaded. The text is the page
        content and the metadata holds the page's title, url and pageid.

    Raises
    ------
    ImportError
        If the optional ``wikipedia`` package is not installed.
    """
    try:
        import wikipedia
        from wikipedia import DisambiguationError, PageError
    except ImportError as err:
        raise ImportError("Please install wikipedia: poetry add wikipedia") from err

    wikipedia.set_lang(lang)
    results = []
    for title in wikipedia.search(query):
        try:
            wiki_page = wikipedia.page(title, auto_suggest=False)
        except (PageError, DisambiguationError):
            # Best effort: skip titles that do not resolve to a single page
            # (missing or ambiguous) instead of failing the whole search.
            continue
        results.append(
            Document(
                text=wiki_page.content,
                metadata={
                    "title": wiki_page.title,
                    "url": wiki_page.url,
                    "pageid": wiki_page.pageid,
                },
            )
        )
    return results
39 changes: 39 additions & 0 deletions streamlit_examples/wiki_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import streamlit as st
from streamlit_examples.utils.cohere import summarize
from streamlit_examples.utils.weaviate import search_wikipedia


def link(i, item):
    """Format a search result as a bold, numbered Markdown link.

    ``i`` is the zero-based position of the result (displayed as ``i + 1``);
    ``item`` must provide the keys "title" and "url".
    """
    position = i + 1
    title = item["title"]
    url = item["url"]
    return f"**[{position}. {title}]({url})**"


st.title("Search Wikipedia")

user_query = st.chat_input(placeholder="Backpacking in Asia")

# Without a query there is nothing to search: show the hint and stop here.
if not user_query:
    st.info("Search Wikipedia and summarize the results. Type a query to start.")
    st.stop()

# Run the search while a status element is shown in the root slot.
root = st.empty()
with root.status("Querying vector store..."):
    items = search_wikipedia(user_query, limit=3)
container = root.container()
container.write(f"That's what I found about: _{user_query}_")

# One slot per result, first filled with the full article text.
placeholders = []
for position, hit in enumerate(items):
    slot = container.empty()
    slot.info(f"{link(position, hit)} {hit['text']}")
    placeholders.append(slot)

status = container.status(
    "Search results retrieved. I am summarizing the results for you. Meanwhile you can scroll up and have a look at the full text."
)

# Swap each slot's full text for its summary, one result at a time.
for position, (slot, hit) in enumerate(zip(placeholders, items)):
    with slot.status(f"_Summarizing_: {link(position, hit)} {hit['text']}"):
        summary = summarize(hit["text"])
    slot.success(f"{link(position, hit)} {summary}")

status.update(label="Search finished. Try something else!", state="complete")

0 comments on commit 579b0fd

Please sign in to comment.