Pushing enterprise knowledge retrieval to cookbook
colin-openai committed May 11, 2023
1 parent 7d418b9 commit a3918a9
Showing 9 changed files with 62,559 additions and 0 deletions.
36 changes: 36 additions & 0 deletions apps/enterprise-knowledge-retrieval/README.md
@@ -0,0 +1,36 @@
# Enterprise Knowledge Retrieval

This repo is a deep dive on Enterprise Knowledge Retrieval, which aims to take unstructured text documents and create a usable knowledge base application from them.

This repo contains a notebook and a basic Streamlit app:
- `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step-by-step process of tokenising, chunking and embedding your data in a vector database, building a chat agent on top, and running a basic evaluation of its performance.
- `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.

To run the app, follow the instructions in the `App` section below.

## Notebook

The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:
- **Setup:** Initialise variables and connect to a vector database.
- **Storage:** Configure the database, prepare our data, and store embeddings and metadata for retrieval (see the sketch after this list).
- **Search:** Extract relevant documents back out with a basic search function and use an LLM to summarise results into a concise reply.
- **Answer:** Add a more sophisticated agent which will process the user's query and maintain a memory for follow-up questions.
- **Evaluate:** Take question/answer pairs using our service, then evaluate and plot them to scope out remedial action.
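
As a rough illustration of the Storage step, here is a minimal sketch of chunking a document by token count and embedding the chunks. The 500-token chunk size and the use of `tiktoken` are illustrative assumptions, not necessarily the notebook's exact choices:

```python
# Illustrative sketch only: chunk a document by token count, then embed each
# chunk with the embeddings model the app's config specifies.
import openai
import tiktoken

EMBEDDINGS_MODEL = "text-embedding-ada-002"


def chunk_text(text: str, max_tokens: int = 500) -> list[str]:
    """Split text into chunks of at most max_tokens tokens each."""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return [
        encoding.decode(tokens[i : i + max_tokens])
        for i in range(0, len(tokens), max_tokens)
    ]


def embed_chunks(chunks: list[str]) -> list[list[float]]:
    """Embed each chunk in a single batched API call."""
    response = openai.Embedding.create(input=chunks, model=EMBEDDINGS_MODEL)
    return [record["embedding"] for record in response["data"]]
```

Each embedding would then be stored in the vector database alongside metadata such as the source document's title.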

Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.

## App

We've included a basic Streamlit app that you can interact with to test your retrieval service, using either standard semantic search or HyDE (Hypothetical Document Embeddings) retrieval, where the model first drafts a hypothetical answer and the embedding of that answer is used as the search vector.

To use it:
- Ensure you have followed the Setup and Storage steps from the notebook to populate a vector database with searchable content, and that the Redis instance defined in `config.py` is running.
- Set up a virtual environment by running `virtualenv venv` (ensure `virtualenv` is installed).
- Activate the environment by running `source venv/bin/activate`.
- Install the requirements by running `pip install -r requirements.txt`.
- Run `streamlit run chatbot.py` to fire up the Streamlit app in your browser.

## Limitations

- This app uses Redis as its vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your needs.
- We introduce many areas you may optimise in the notebook, but we'll dive deeper into these in separate offerings in the coming weeks.
169 changes: 169 additions & 0 deletions apps/enterprise-knowledge-retrieval/assistant.py
@@ -0,0 +1,169 @@
from langchain.agents import (
Tool,
AgentExecutor,
LLMSingleActionAgent,
AgentOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.memory import ConversationBufferWindowMemory
import openai
import re
import streamlit as st

from database import get_redis_results, get_redis_connection
from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT


redis_client = get_redis_connection()


def answer_user_question(query):

results = get_redis_results(redis_client, query, INDEX_NAME)

    # Persist results so the chatbot's "See search results" expander can display them
    results.to_csv("results.csv")

search_content = ""
for x, y in results.head(3).iterrows():
search_content += y["title"] + "\n" + y["result"] + "\n\n"

retrieval_prepped = RETRIEVAL_PROMPT.format(
SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
)

retrieval = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[{"role": "user", "content": retrieval_prepped}],
max_tokens=500,
)

# Response provided by GPT-3.5
return retrieval["choices"][0]["message"]["content"]


def answer_question_hyde(query):

hyde_prompt = """You are OracleGPT, an helpful expert who answers user questions to the best of their ability.
Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.
User question: {USER_QUESTION_HERE}
Answer:"""

hypothetical_answer = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[
{
"role": "user",
"content": hyde_prompt.format(USER_QUESTION_HERE=query),
}
],
)["choices"][0]["message"]["content"]
# st.write(hypothetical_answer)
results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)

results.to_csv("results.csv")

search_content = ""
for x, y in results.head(3).iterrows():
search_content += y["title"] + "\n" + y["result"] + "\n\n"

    # Use .format as in answer_user_question; str.replace would leave the
    # template's braces wrapped around the substituted text
    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
retrieval = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[{"role": "user", "content": retrieval_prepped}],
max_tokens=500,
)

return retrieval["choices"][0]["message"]["content"]


# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
# The template to use
template: str
# The list of tools available
tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
# Get the intermediate steps (AgentAction, Observation tuples)
# Format them in a particular way
intermediate_steps = kwargs.pop("intermediate_steps")
thoughts = ""
for action, observation in intermediate_steps:
thoughts += action.log
thoughts += f"\nObservation: {observation}\nThought: "
# Set the agent_scratchpad variable to that value
kwargs["agent_scratchpad"] = thoughts
# Create a tools variable from the list of tools provided
kwargs["tools"] = "\n".join(
[f"{tool.name}: {tool.description}" for tool in self.tools]
)
# Create a list of tool names for the tools provided
kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
formatted = self.template.format(**kwargs)
return [HumanMessage(content=formatted)]


class CustomOutputParser(AgentOutputParser):
def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
# Check if agent should finish
if "Final Answer:" in llm_output:
return AgentFinish(
# Return values is generally always a dictionary with a single `output` key
# It is not recommended to try anything else at the moment :)
return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
log=llm_output,
)
# Parse out the action and action input
regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
match = re.search(regex, llm_output, re.DOTALL)
if not match:
raise ValueError(f"Could not parse LLM output: `{llm_output}`")
action = match.group(1).strip()
action_input = match.group(2)
# Return the action and action input
return AgentAction(
tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output
)


def initiate_agent(tools):
prompt = CustomPromptTemplate(
template=SYSTEM_PROMPT,
tools=tools,
# This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
# The history template includes "history" as an input variable so we can interpolate it into the prompt
input_variables=["input", "intermediate_steps", "history"],
)

# Initiate the memory with k=2 to keep the last two turns
# Provide the memory to the agent
memory = ConversationBufferWindowMemory(k=2)

output_parser = CustomOutputParser()

llm = ChatOpenAI(temperature=0)

# LLM chain consisting of the LLM and a prompt
llm_chain = LLMChain(llm=llm, prompt=prompt)

tool_names = [tool.name for tool in tools]
agent = LLMSingleActionAgent(
llm_chain=llm_chain,
output_parser=output_parser,
stop=["\nObservation:"],
allowed_tools=tool_names,
)

agent_executor = AgentExecutor.from_agent_and_tools(
agent=agent, tools=tools, verbose=True, memory=memory
)

return agent_executor
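
For reference, here is a minimal sketch of how `initiate_agent` might be exercised outside Streamlit, mirroring the wiring in `chatbot.py` below; the question text is just an example:

```python
# Hypothetical standalone usage of the agent defined in assistant.py.
from langchain.agents import Tool

from assistant import answer_user_question, initiate_agent

tools = [
    Tool(
        name="Search",
        func=answer_user_question,
        description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
    )
]

agent_executor = initiate_agent(tools)
print(agent_executor.run("Who was the first person to walk on the moon?"))
```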
76 changes: 76 additions & 0 deletions apps/enterprise-knowledge-retrieval/chatbot.py
@@ -0,0 +1,76 @@
from langchain.agents import Tool
import pandas as pd
import streamlit as st
from streamlit_chat import message

from database import get_redis_connection
from assistant import answer_user_question, initiate_agent, answer_question_hyde

# Initialise database

## Initialise Redis connection
redis_client = get_redis_connection()


### CHATBOT APP

# --- GENERAL SETTINGS ---
PAGE_TITLE: str = "Knowledge Retrieval Bot"
PAGE_ICON: str = "🤖"

st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)

st.title("Wiki Chatbot")
st.subheader("Learn things - random things!")

# Using object notation
add_selectbox = st.sidebar.selectbox(
"What kind of search?", ("Standard vector search", "HyDE")
)

# Define which tools the agent can use to answer user queries
tools = [
Tool(
name="Search",
func=answer_user_question
if add_selectbox == "Standard vector search"
else answer_question_hyde,
description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
)
]

if "generated" not in st.session_state:
st.session_state["generated"] = []

if "past" not in st.session_state:
st.session_state["past"] = []




prompt = st.text_input("What do you want to know: ", "", key="input")

if st.button("Submit", key="generationSubmit"):
with st.spinner("Thinking..."):
# Initialization
if "agent" not in st.session_state:
st.session_state["agent"] = initiate_agent(tools)

response = st.session_state["agent"].run(prompt)

st.session_state.past.append(prompt)
st.session_state.generated.append(response)

if len(st.session_state["generated"]) > 0:
for i in range(len(st.session_state["generated"]) - 1, -1, -1):
message(st.session_state["generated"][i], key=str(i))
message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")

with st.expander("See search results"):
    # results.csv is written by the most recent search in assistant.py
    results = list(pd.read_csv("results.csv")["result"])
    st.write(results)
46 changes: 46 additions & 0 deletions apps/enterprise-knowledge-retrieval/config.py
@@ -0,0 +1,46 @@
REDIS_HOST = "localhost"
REDIS_PORT = "6380"
REDIS_DB = "0"
INDEX_NAME = "wiki-index"
VECTOR_FIELD_NAME = "content_vector"
CHAT_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Set up the base template
SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
Accept the first answer that you are provided for the user.
You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin! Remember to give detailed, informative answers.
Previous conversation history:
{history}
New question: {input}
{agent_scratchpad}"""
# Build a prompt to provide the original query, the result and ask to summarise for the user
RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.
Search query:
{SEARCH_QUERY_HERE}
Content:
{SEARCH_CONTENT_HERE}
Answer:
"""
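
Both `assistant.py` and `chatbot.py` import `get_redis_connection` and `get_redis_results` from `database.py`, which is among the nine changed files but is not shown above. Below is a minimal sketch of what those helpers might look like, assuming redis-py with the RediSearch module and the same `openai` 0.x SDK used elsewhere in this commit; the `title` and `text` field names are assumptions:

```python
# Hypothetical sketch of database.py; the real module may differ.
import numpy as np
import openai
import pandas as pd
import redis
from redis.commands.search.query import Query

from config import (
    EMBEDDINGS_MODEL,
    REDIS_DB,
    REDIS_HOST,
    REDIS_PORT,
    VECTOR_FIELD_NAME,
)


def get_redis_connection():
    """Connect to the Redis instance defined in config.py."""
    return redis.Redis(host=REDIS_HOST, port=int(REDIS_PORT), db=int(REDIS_DB))


def get_redis_results(client, query_text, index_name, top_k=3):
    """Embed the query text and run a KNN vector search against the index."""
    embedded_query = openai.Embedding.create(
        input=[query_text], model=EMBEDDINGS_MODEL
    )["data"][0]["embedding"]

    redis_query = (
        Query(f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vector AS score]")
        .return_fields("title", "text", "score")
        .sort_by("score")
        .dialect(2)
    )
    results = client.ft(index_name).search(
        redis_query,
        query_params={"vector": np.array(embedded_query, dtype=np.float32).tobytes()},
    )
    # Return a DataFrame with the column names assistant.py expects
    return pd.DataFrame(
        [{"title": doc.title, "result": doc.text} for doc in results.docs]
    )
```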
