Skip to content

Commit

Permalink
feat: added pdf summarizer
Browse files Browse the repository at this point in the history
  • Loading branch information
marcusschiesser committed Sep 26, 2023
1 parent 64a5ccc commit f40c803
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 8 deletions.
5 changes: 3 additions & 2 deletions streamlit_examples/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
from streamlit_examples.utils.llamaindex import build_index, handle_stream

from streamlit_examples.utils.streamlit import (
get_key,
cache_files,
render_message,
upload_files,
)

st.title("Chat with Documents")

openai_api_key = get_key()
openai_api_key = st.secrets["OPENAI_API_KEY"]

# Define service-context
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", api_key=openai_api_key)
Expand All @@ -26,6 +26,7 @@

# Upload PDFs
pdfs = upload_files(type="pdf", accept_multiple_files=True)
pdfs = cache_files(pdfs, type="pdf")

index = build_index(pdfs)
query_engine = index.as_chat_engine(chat_mode="condense_question", streaming=True)
Expand Down
42 changes: 42 additions & 0 deletions streamlit_examples/snowflake_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import streamlit as st
import openai
from llama_index import (
OpenAIEmbedding,
ServiceContext,
SimpleDirectoryReader,
VectorStoreIndex,
set_global_service_context,
)
from llama_index.llms import OpenAI

from streamlit_examples.utils.streamlit import (
cache_file,
upload_files,
)

st.title("☃️ PDF2Snowflake")
st.write("Summarizes PDFs and stores them with their summary in Snowflake.")
# NOTE(review): nothing in this script writes to Snowflake yet; it only
# renders the summaries on the page.

# No api_key is passed to the clients below, so the key is set on the
# `openai` module instead.
openai.api_key = st.secrets["OPENAI_API_KEY"]

# Define service-context
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")
embed_model = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
set_global_service_context(service_context)

# Upload PDFs
pdfs = upload_files(type="pdf", accept_multiple_files=True)


# Summarize each PDF
for pdf in pdfs:
    # The reader is given a file path, so the upload is written to disk first.
    file = cache_file(pdf, type="pdf")
    with st.spinner(f"Indexing '{pdf.name}'..."):
        documents = SimpleDirectoryReader(input_files=[file]).load_data()
        index = VectorStoreIndex.from_documents(documents)
    with st.spinner(f"Summarize '{pdf.name}'..."):
        query_engine = index.as_query_engine()
        summary = query_engine.query("What is a summary of this document?")
    st.markdown(f"## Summary of **{pdf.name}**")
    # The query result is a response object rather than a plain string;
    # convert it explicitly before handing it to the markdown renderer.
    st.markdown(str(summary))
17 changes: 11 additions & 6 deletions streamlit_examples/utils/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,26 @@ def get_key():


def upload_files(type="pdf", **kwargs):
    """Render a file uploader and return the uploaded files.

    Args:
        type: File extension to accept; also used (upper-cased) in the
            uploader label and in the hint shown when nothing is uploaded.
        **kwargs: Forwarded unchanged to ``st.file_uploader``.

    Returns:
        The value returned by ``st.file_uploader``. When that value is empty,
        an info message is shown and ``st.stop()`` is called instead of
        returning.
    """
    files = st.file_uploader(
        label=f"Upload {type.upper()} files", type=[type], **kwargs
    )
    if not files:
        st.info(f"Please add {type.upper()} documents")
        st.stop()
    return files


def cache_files(files, type="pdf") -> list[str]:
    """Cache every uploaded file on disk and return the resulting paths.

    Args:
        files: Iterable of uploaded-file objects accepted by ``cache_file``.
        type: File extension used for the cached files.

    Returns:
        The cached file paths, in the same order as ``files``.
    """
    return [cache_file(file, type=type) for file in files]


def cache_file(file, type="pdf") -> str:
    """Write an uploaded file into the cache directory once and return its path.

    Args:
        file: Uploaded-file object exposing ``file_id`` and ``getbuffer()``.
        type: File extension appended to the cached file name.

    Returns:
        Path of the cached copy, ``{CACHE_DIR}/{file.file_id}.{type}``.
    """
    filepath = f"{CACHE_DIR}/{file.file_id}.{type}"
    if not os.path.exists(filepath):
        # Write to a sibling temp file and rename it into place, so an
        # interrupted write never leaves a truncated file that later calls
        # would mistake for a valid cache entry.
        partial_path = f"{filepath}.part"
        with open(partial_path, "wb") as f:
            f.write(file.getbuffer())
        os.replace(partial_path, filepath)
    return filepath

0 comments on commit f40c803

Please sign in to comment.