Merge pull request openai#206 from openai/getting_started_qanda_chatbot
Chatbot starter app used in conference
colin-openai committed Mar 21, 2023
2 parents 3dde564 + 771f108 commit 8a5b495
Showing 14 changed files with 1,500 additions and 0 deletions.
35 changes: 35 additions & 0 deletions apps/chatbot-kickstarter/README.md
@@ -0,0 +1,35 @@
# Powering your products with ChatGPT and your own data

The Chatbot Kickstarter is a starter repo to get you building a basic Chatbot using the ChatGPT API and your own knowledge base. The flow you're taken through was originally presented with [these slides](https://drive.google.com/file/d/1dB-RQhZC_Q1iAsHkNNdkqtxxXqYODFYy/view?usp=share_link), which may be useful to refer to.

This repo contains one notebook and two basic Streamlit apps:
- `powering_your_products_with_chatgpt_and_your_data.ipynb`: A notebook containing a step-by-step process for tokenising, chunking and embedding your data in a vector database, and building simple Q&A and Chatbot functionality on top.
- `search.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
- `chat.py`: A Streamlit app providing a simple Chatbot via a search bar to query your knowledge base.

To run either version of the app, please follow the instructions in the respective README.md files in the subdirectories.

## How it works

The notebook is the best place to start, and is broadly laid out as follows:
- **Lay the foundations:**
    - Set up the vector database to accept vectors and data
    - Load the dataset, chunk the data up for embedding and store it in the vector database
- **Make it a product:**
    - Add a retrieval step where users provide queries and we return the most relevant entries
    - Summarise search results with GPT-3
    - Test out this basic Q&A app in Streamlit
- **Build your moat:**
    - Create an Assistant class to manage context and interact with our bot
    - Use the Chatbot to answer questions using semantic search context
    - Test out this basic Chatbot app in Streamlit

Once you've run the notebook and tried the two Streamlit apps, you should be in a position to strip out any useful snippets and start your own Q&A or Chat application. A condensed sketch of the core embed-and-store flow is below.

## Limitations

- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your need.
- This is a simple starting point - if you hit issues deploying your use case, you may need to tune (non-exhaustive list):
    - The prompt and parameters of the model so that it answers accurately
    - Your search, so that it returns more relevant results
    - Your chunking/embedding approach, to store the most relevant content effectively for retrieval (see the sketch below)
83 changes: 83 additions & 0 deletions apps/chatbot-kickstarter/chat.py
@@ -0,0 +1,83 @@
import streamlit as st
from streamlit_chat import message

from database import get_redis_connection
from chatbot import RetrievalAssistant, Message

# Initialise database

## Initialise Redis connection
redis_client = get_redis_connection()

# Set instruction

# System prompt requiring Question and Year to be extracted from the user
system_prompt = '''
You are a helpful Formula 1 knowledge base assistant. You need to capture a Question and Year from each customer.
The Question is their query on Formula 1, and the Year is the year of the applicable Formula 1 season.
Think about this step by step:
- The user will ask a Question
- You will ask them for the Year if their question didn't include a Year
- Once you have the Year, say "searching for answers".
Example:
User: I'd like to know the cost cap for a power unit
Assistant: Certainly, what year would you like this for?
User: 2023 please.
Assistant: Searching for answers.
'''

### CHATBOT APP

st.set_page_config(
    page_title="Streamlit Chat - Demo",
    page_icon=":robot:"
)

st.title('Formula 1 Chatbot')
st.subheader("Help us help you learn about Formula 1")

if 'generated' not in st.session_state:
    st.session_state['generated'] = []

if 'past' not in st.session_state:
    st.session_state['past'] = []

def query(question):
    response = st.session_state['chat'].ask_assistant(question)
    return response

prompt = st.text_input("What do you want to know: ","", key="input")

if st.button('Submit', key='generationSubmit'):

    # On the first submit, initialise the assistant and seed it with the system prompt
    if 'chat' not in st.session_state:
        st.session_state['chat'] = RetrievalAssistant()
        messages = []
        system_message = Message('system', system_prompt)
        messages.append(system_message.message())
    else:
        messages = []

    user_message = Message('user', prompt)
    messages.append(user_message.message())

    response = query(messages)

    # Debugging step to print the whole response
    # st.write(response)

    st.session_state.past.append(prompt)
    st.session_state.generated.append(response['content'])

if st.session_state['generated']:

    # Render the conversation, most recent exchange first
    for i in range(len(st.session_state['generated'])-1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
84 changes: 84 additions & 0 deletions apps/chatbot-kickstarter/chatbot.py
@@ -0,0 +1,84 @@
import openai
from termcolor import colored
import streamlit as st

from database import get_redis_connection, get_redis_results

from config import CHAT_MODEL, COMPLETIONS_MODEL, INDEX_NAME

redis_client = get_redis_connection()

# A basic class to create a message as a dict for chat
class Message:

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def message(self):
        return {"role": self.role, "content": self.content}

# New Assistant class to add a vector database call to its responses
class RetrievalAssistant:

    def __init__(self):
        self.conversation_history = []

    def _get_assistant_response(self, prompt):
        try:
            completion = openai.ChatCompletion.create(
                model=CHAT_MODEL,
                messages=prompt,
                temperature=0.1
            )
            response_message = Message(
                completion['choices'][0]['message']['role'],
                completion['choices'][0]['message']['content']
            )
            return response_message.message()

        except Exception as e:
            return f'Request failed with exception {e}'

    # Retrieve semantic-search results from Redis for the extracted question
    def _get_search_results(self, prompt):
        latest_question = prompt
        search_content = get_redis_results(redis_client, latest_question, INDEX_NAME)['result'][0]
        return search_content

    def ask_assistant(self, next_user_prompt):
        self.conversation_history.extend(next_user_prompt)
        assistant_response = self._get_assistant_response(self.conversation_history)

        # Answer normally unless the trigger phrase "searching for answers" is used
        if 'searching for answers' in assistant_response['content'].lower():
            question_extract = openai.Completion.create(
                model=COMPLETIONS_MODEL,
                prompt=f"Extract the user's latest question and the year for that question from this conversation: {self.conversation_history}. Extract it as a sentence stating the Question and Year"
            )
            search_result = self._get_search_results(question_extract['choices'][0]['text'])

            # We insert an extra system prompt here to give fresh context to the Chatbot on how to use the Redis results.
            # In this instance we add it to the conversation history, but in production it may be better to hide it.
            self.conversation_history.insert(
                -1,
                {"role": 'system', "content": f"Answer the user's question using this content: {search_result}. If you cannot answer the question, say 'Sorry, I don't know the answer to this one'"}
            )

            assistant_response = self._get_assistant_response(self.conversation_history)

        self.conversation_history.append(assistant_response)
        return assistant_response

    def pretty_print_conversation_history(self, colorize_assistant_replies=True):
        for entry in self.conversation_history:
            if entry['role'] == 'system':
                continue
            prefix = entry['role']
            content = entry['content']
            output = colored(prefix + ':\n' + content, 'green') if colorize_assistant_replies and entry['role'] == 'assistant' else prefix + ':\n' + content
            print(output)
7 changes: 7 additions & 0 deletions apps/chatbot-kickstarter/config.py
@@ -0,0 +1,7 @@
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
CHAT_MODEL = 'gpt-3.5-turbo'
TEXT_EMBEDDING_CHUNK_SIZE = 300
VECTOR_FIELD_NAME = 'content_vector'
PREFIX = "sportsdoc"
INDEX_NAME = "f1-index"
5 binary files not shown.
82 changes: 82 additions & 0 deletions apps/chatbot-kickstarter/database.py
@@ -0,0 +1,82 @@
import pandas as pd
import numpy as np
import openai
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.query import Query

from config import EMBEDDINGS_MODEL, INDEX_NAME, PREFIX, VECTOR_FIELD_NAME

# Get a Redis connection
def get_redis_connection(host='localhost', port='6379', db=0):
    r = Redis(host=host, port=port, db=db, decode_responses=False)
    return r

# Create a Redis index to hold our data
def create_hnsw_index(redis_conn, vector_field_name, vector_dimensions=1536, distance_metric='COSINE'):
    # Create the index under the name that query_redis/get_redis_results expect
    redis_conn.ft(INDEX_NAME).create_index([
        VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric}),
        TextField("filename"),
        TextField("text_chunk"),
        NumericField("file_chunk_index")
    ])

# Create a Redis pipeline to load all the vectors and their metadata
def load_vectors(client: Redis, input_list, vector_field_name):
    p = client.pipeline(transaction=False)
    for text in input_list:
        # Hash key
        key = f"{PREFIX}:{text['id']}"

        # Hash values: the metadata plus the embedding as float32 bytes
        item_metadata = text['metadata']
        item_keywords_vector = np.array(text['vector'], dtype='float32').tobytes()
        item_metadata[vector_field_name] = item_keywords_vector

        # HSET
        p.hset(key, mapping=item_metadata)

    p.execute()

# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=2):

    # Create an embedding vector from the user query
    embedded_query = np.array(openai.Embedding.create(
        input=query,
        model=EMBEDDINGS_MODEL,
    )["data"][0]['embedding'], dtype=np.float32).tobytes()

    # Prepare the KNN query, returning the fields stored at index time
    q = Query(f'*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]').sort_by('vector_score').paging(0, top_k).return_fields('vector_score', 'filename', 'text_chunk', 'file_chunk_index').dialect(2)
    params_dict = {"vec_param": embedded_query}

    # Execute the query
    results = redis_conn.ft(index_name).search(q, query_params=params_dict)

    return results

# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):

    # Get the most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract the results into a list of (order, text, score) tuples
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        result_order = i
        text = result.text_chunk
        score = result.vector_score
        query_result_list.append((result_order, text, score))

    # Return the results as a DataFrame for ease of use
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ['id', 'result', 'certainty']
    return result_df