first commit
marcusschiesser committed Sep 13, 2023
0 parents commit 4d995e6
Showing 13 changed files with 2,773 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.pytest*
__pycache__
19 changes: 19 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,19 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Streamlit",
            "type": "python",
            "request": "launch",
            "module": "streamlit",
            "args": [
                "run",
                "${file}"
            ],
            "justMyCode": true
        }
    ]
}
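With this configuration, launching the debugger runs whichever file is open in the editor through the Streamlit module; with chat.py open, it is roughly equivalent to running `python -m streamlit run streamlit_examples/chat.py` from a shell.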
21 changes: 21 additions & 0 deletions README.md
@@ -0,0 +1,21 @@
# streamlit-examples

A couple of AI demo applications built with [Streamlit](https://streamlit.io/):

[chat.py](./streamlit_examples/chat.py) - Lets the user upload PDF documents and chat with them using LlamaIndex. Supports multiple users and streaming responses.

## Getting Started

This project uses Poetry for dependency management. To install the dependencies and set up the environment, run the following commands:

```bash
poetry install
poetry shell
```

You can then run any of the examples with:

```bash
streamlit run streamlit_examples/chat.py
```
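
pytest is included as a dev dependency, so the test suite under [tests](./tests) can be run with, for example:

```bash
poetry run pytest
```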

2,553 changes: 2,553 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions pyproject.toml
@@ -0,0 +1,24 @@
[tool.poetry]
name = "streamlit-examples"
version = "0.1.0"
description = ""
authors = ["Marcus Schiesser <[email protected]>"]
readme = "README.md"
packages = [{ include = "streamlit_examples" }]

[tool.poetry.dependencies]
python = "^3.11"
llama-index = "^0.8.24.post1"
llama-cpp-python = "^0.1.84"
streamlit = "^1.26.0"
pypdf = "^3.16.0"


[tool.poetry.group.dev.dependencies]
black = "^23.9.1"
autoflake = "^2.2.1"
pytest = "^7.4.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added streamlit_examples/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions streamlit_examples/chat.py
@@ -0,0 +1,51 @@
import streamlit as st

from llama_index import (
    OpenAIEmbedding,
    ServiceContext,
    set_global_service_context,
)
from llama_index.llms import OpenAI
from streamlit_examples.utils.llamaindex import build_index, handle_stream

from streamlit_examples.utils.streamlit import (
    get_key,
    render_message,
    upload_files,
)

st.title("Chat with Documents")

openai_api_key = get_key()

# Define service-context
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", api_key=openai_api_key)
embed_model = OpenAIEmbedding(api_key=openai_api_key)
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
set_global_service_context(service_context)

# Upload PDFs
pdfs = upload_files(type="pdf", accept_multiple_files=True)

index = build_index(pdfs)
query_engine = index.as_chat_engine(chat_mode="condense_question", streaming=True)

messages = st.session_state.get("messages", [])

if not messages:
    messages.append({"role": "assistant", "text": "Hi!"})

for message in messages:
    render_message(message)

if user_query := st.chat_input():
    message = {"role": "user", "text": user_query}
    messages.append(message)
    render_message(message)

    with st.chat_message("assistant"):
        stream = query_engine.stream_chat(user_query)
        text = handle_stream(st.empty(), stream)
        message = {"role": "assistant", "text": text}
        messages.append(message)

    st.session_state.messages = messages
Empty file added streamlit_examples/utils/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions streamlit_examples/utils/llamaindex.py
@@ -0,0 +1,20 @@
from llama_index.chat_engine.types import StreamingAgentChatResponse
import streamlit as st
from llama_index import SimpleDirectoryReader, VectorStoreIndex


# TODO: this is caching the resource globally, not per-session
# Each user session should have their own index
@st.cache_resource(show_spinner="Indexing documents...")
def build_index(files):
    documents = SimpleDirectoryReader(input_files=files).load_data()
    return VectorStoreIndex.from_documents(documents)


def handle_stream(root, stream: StreamingAgentChatResponse):
    text = ""
    root.markdown("Thinking...")
    for token in stream.response_gen:
        text += token
        root.markdown(text)
    return text
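
The TODO above notes that `st.cache_resource` shares one cached index across all user sessions. A minimal per-session sketch, assuming it is acceptable to key the index by the uploaded file paths in `st.session_state` (the helper name is hypothetical, not part of this commit):

def build_index_per_session(files):
    # Hypothetical sketch: st.session_state is scoped to one browser session,
    # so each session (and each file set) builds and reuses its own index.
    key = "index:" + ",".join(files)
    if key not in st.session_state:
        with st.spinner("Indexing documents..."):
            documents = SimpleDirectoryReader(input_files=files).load_data()
            st.session_state[key] = VectorStoreIndex.from_documents(documents)
    return st.session_state[key]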
40 changes: 40 additions & 0 deletions streamlit_examples/utils/streamlit.py
@@ -0,0 +1,40 @@
import os
import streamlit as st

CACHE_DIR = "./uploads"


def render_message(message):
    with st.chat_message(message["role"]):
        st.write(message["text"])


def get_key():
    if "openai_api_key" not in st.session_state:
        openai_api_key = st.sidebar.text_input("OpenAI API Key", type="password")
        if not openai_api_key:
            st.info("Please add your OpenAI API key to continue.")
            st.stop()
        st.session_state["openai_api_key"] = openai_api_key
    return st.session_state["openai_api_key"]


def upload_files(type="pdf", **kwargs):
    files = st.sidebar.file_uploader(
        label=f"Upload {type.upper()} files", type=[type], **kwargs
    )
    if not files:
        st.info(f"Please add {type.upper()} documents")
        st.stop()
    return cache_files(files, type=type)


def cache_files(files, type="pdf") -> list[str]:
    filepaths = []
    for file in files:
        filepath = f"{CACHE_DIR}/{file.file_id}.{type}"
        if not os.path.exists(filepath):
            with open(filepath, "wb") as f:
                f.write(file.getbuffer())
        filepaths.append(filepath)
    return filepaths
Empty file added tests/__init__.py
Empty file.
42 changes: 42 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,42 @@
import os
from streamlit_examples.utils.streamlit import cache_files
from streamlit.runtime.uploaded_file_manager import (
    UploadedFile,
    UploadedFileRec,
    FileURLsProto,
)


def create_file(name, test_data):
    file_id = f"{name}_file_id"
    type = "text/plain"
    record = UploadedFileRec(file_id=file_id, name=name, type=type, data=test_data)
    file_urls = FileURLsProto()
    return UploadedFile(record=record, file_urls=file_urls)


def test_cache_file():
    tc = [
        {
            "test": "test one file",
            "files": [create_file("test.pdf", b"test content")],
            "expected": [b"test content"],
        },
        {
            "test": "test two files",
            "files": [
                create_file("test.pdf", b"test content"),
                create_file("test2.pdf", b"test content 2"),
            ],
            "expected": [b"test content", b"test content 2"],
        },
    ]

    for test in tc:
        filepaths = cache_files(test["files"])
        assert len(filepaths) == len(test["files"])
        for i in range(len(filepaths)):
            assert os.path.exists(filepaths[i])
            with open(filepaths[i], "rb") as f:
                assert f.read() == test["expected"][i]
            os.remove(filepaths[i])
1 change: 1 addition & 0 deletions uploads/.gitignore
@@ -0,0 +1 @@
*.pdf
