Rag fusion rw 002 vector database #3

Open · wants to merge 6 commits into base: master · Changes from 1 commit
Implement vector search using Chroma DB
Richard committed Oct 17, 2023
commit 5e9e7f9f1288f6c9fd58caad4ef0456c3ecceda3
246 changes: 202 additions & 44 deletions main.py
@@ -2,6 +2,10 @@
import openai
import random
import chromadb
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='/tmp/ragfusion.log', filemode='w')


# Initialize OpenAI API
@@ -29,42 +33,155 @@ def generate_queries_chatgpt(original_query):

# Function to perform a vector search; relevance scores are simulated with random values
def vector_search(query, collection):
    # Perform the Chroma vector search for the given query
    chroma_results = collection.query(
        query_texts=[query],
        n_results=4  # Adjust the number of results as needed
    )
    logging.info(f"chroma_results: {chroma_results}")

    # Extract document IDs (flattened)
    document_ids = chroma_results['ids'][0]
    logging.info(f"document_ids: {document_ids}")

    # Retrieve each document and its metadata by ID
    chroma_doc_texts = []
    chroma_doc_metadata = []
    metadatas = []
    documents = []

    for doc_id in document_ids:
        doc_info = collection.get(ids=[doc_id])
        logging.info(f"doc_info: {doc_info}")

        # collection.get() always returns a dict, so check that a document
        # was actually found rather than testing the dict's truthiness
        if doc_info['documents']:
            chroma_doc_texts.append(doc_info['documents'][0])
            chroma_doc_metadata.append(doc_info['metadatas'][0])
            metadatas.append(doc_info['metadatas'][0])
            documents.append(doc_info['documents'][0])
        else:
            chroma_doc_texts.append("Document not found")
            chroma_doc_metadata.append({})

    # Extract titles from the metadata
    document_titles = [metadata.get("title", "Unknown Title") for metadata in chroma_doc_metadata]
    logging.info(f"document_titles: {document_titles}")

    # Assign a simulated relevance score to each document ID
    scores_dict = {
        doc_id: round(random.uniform(0.7, 0.9), 2) for doc_id in document_ids
    }

    # Bundle texts, titles, and scores for the caller
    scores = {
        "text": chroma_doc_texts,
        "titles": document_titles,
        "scores": scores_dict
    }

    # Extract just the score values into a list
    score_values = list(scores_dict.values())

    logging.info(f"scores: {scores}")
    logging.info(f"documents: {documents}")

    return scores, score_values, document_ids, metadatas, documents
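Note: Chroma's query() already returns the matched documents and metadatas alongside the IDs, so the per-ID collection.get() loop above could likely be collapsed. A minimal sketch of that simplification (assuming the client's default include of documents and metadatas; not part of this commit):

def vector_search_simplified(query, collection):
    # query() returns ids, documents, and metadatas in matching order by default
    results = collection.query(query_texts=[query], n_results=4)
    document_ids = results['ids'][0]
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]
    # Simulated relevance scores, mirroring the random scoring above
    scores_dict = {doc_id: round(random.uniform(0.7, 0.9), 2) for doc_id in document_ids}
    return scores_dict, document_ids, metadatas, documents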

# Reciprocal Rank Fusion algorithm
def reciprocal_rank_fusion(all_results, document_ids, k=60):
    fused_scores = {}

    for query, result in all_results.items():
        logging.info(f"For query '{query}': {result}")

        score_values = result["score_values"]

        # Sort descending so rank 0 is the best-scoring document. Note this
        # still assumes document_ids is ordered the same way as the sorted
        # scores, which holds only loosely for random scores.
        for rank, score in enumerate(sorted(score_values, reverse=True)):
            doc_id = document_ids[rank]

            if doc_id not in fused_scores:
                fused_scores[doc_id] = 0

            previous_score = fused_scores[doc_id]
            fused_scores[doc_id] += 1 / (rank + k)
            logging.info(f"Updating score for {doc_id} from {previous_score} to {fused_scores[doc_id]} based on rank {rank} in query '{query}'")

    reranked_results = {doc_id: score for doc_id, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)}

    print("Final reranked results:", reranked_results)

    return reranked_results
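Note: the loop above maps rank positions onto a single shared document_ids list, so every query is assumed to rank the same documents in the same order. The textbook RRF formulation instead keys each query's ranking by document ID. A sketch under that assumption (the rankings input shape, {query: {doc_id: score}}, is hypothetical and differs from all_results above):

def reciprocal_rank_fusion_by_id(rankings, k=60):
    # rankings: {query: {doc_id: score}} -- hypothetical input shape
    fused_scores = {}
    for query, doc_scores in rankings.items():
        # Rank each query's documents by score, best first
        ranked = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        for rank, (doc_id, score) in enumerate(ranked):
            fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1 / (rank + k)
    # Highest fused score first
    return dict(sorted(fused_scores.items(), key=lambda x: x[1], reverse=True))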

def generate_output(reranked_results, queries, metadatas, documents):
    # Extract the top-ranked document ID and its score
    top_document_id, top_score = next(iter(reranked_results.items()))

    logging.info(f"metadatas: {metadatas}")

    # document_ids is read from module scope; it is set in __main__ below
    top_doc_index = document_ids.index(top_document_id)
    top_doc_metadata = metadatas[top_doc_index]
    logging.info(f"top_doc_metadata: {top_doc_metadata}")

    # Fetch the document title associated with the top document ID
    top_document_title = top_doc_metadata.get("title", "Unknown Title")

    # generate_summary() generates a summary for the top document
    document_summary = generate_summary(top_document_id, metadatas, documents)

    # Generate a response with the document title, relevance score, summary, and original queries
    response = f"Here is the most relevant information regarding '{queries[0]}':\n\n"
    response += f"Document Title: {top_document_title}\n"
    response += f"Relevance Score: {top_score}\n\n"
    response += f"Summary of the document:\n{document_summary}\n\n"

    response += "Original Queries:\n"
    for query in queries:
        response += f"- {query}\n"

    response += "\nBy layering these technologies and techniques, RAG Fusion offers a powerful, nuanced approach to text generation. It leverages the best of search technology and generative AI to produce high-quality, reliable outputs."

    return response

# Chroma: create the collection, in this case "all_documents"
collection = chroma_client.create_collection(name="all_documents")
@@ -90,16 +207,16 @@ def generate_output(reranked_results, queries):
"The history of climate change activism. The history of climate change activism is rich and inspiring. It began with grassroots movements and evolved into a global force for environmental awareness and policy change, demonstrating the power of collective action."
],
    metadatas=[
        {"source": "doc1", "title": "Climate change and economic impact"},
        {"source": "doc2", "title": "Public health concerns due to climate change"},
        {"source": "doc3", "title": "Climate change: A social perspective"},
        {"source": "doc4", "title": "Technological solutions to climate change"},
        {"source": "doc5", "title": "Policy changes needed to combat climate change"},
        {"source": "doc6", "title": "Climate change and its impact on biodiversity"},
        {"source": "doc7", "title": "Climate change: The science and models"},
        {"source": "doc8", "title": "Global warming: A subset of climate change"},
        {"source": "doc9", "title": "How climate change affects daily weather"},
        {"source": "doc10", "title": "The history of climate change activism"}
    ],
    ids=[
        "doc1",
        "doc2",
        "doc3",
        "doc4",
        "doc5",
        "doc6",
        "doc7",
        "doc8",
        "doc9",
        "doc10"
    ]
)
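Note: create_collection raises an error if "all_documents" already exists, so the script fails on a second run against a persistent client. If idempotent setup is wanted, Chroma's get_or_create_collection is one option (a sketch, not part of this commit):

# Reuse the existing collection instead of erroring on re-run
collection = chroma_client.get_or_create_collection(name="all_documents")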

def generate_summary(document_id, metadatas, documents):
    # Look up the text of the document to summarize
    doc_index = document_ids.index(document_id)
    doc_text = documents[doc_index]
    logging.info(f"metadatas: {metadatas}")

    # Define the prompt for generating the summary
    prompt = f"Summarize the following document:\n{doc_text}\n\nSummary:"

    # Use the GPT-3 model to generate the summary
    response = openai.Completion.create(
        engine="davinci",
        prompt=prompt,
        max_tokens=50,   # Adjust the length of the summary as needed
        stop=None,       # Allow the model to end the summary freely
        temperature=0.7  # Adjust the temperature for creativity
    )

    # Extract the generated summary from the response
    summary = response.choices[0].text.strip()

    return summary
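Note: openai.Completion.create(engine=...) is the legacy completions interface of the pre-1.0 openai package. A chat-model equivalent on the same package version would look roughly like the sketch below (the model name is an assumption, not part of this commit):

def generate_summary_chat(doc_text):
    # Legacy (openai < 1.0) chat-completions call; sketch only
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # assumed model choice
        messages=[{"role": "user", "content": f"Summarize the following document:\n{doc_text}\n\nSummary:"}],
        max_tokens=50,
        temperature=0.7
    )
    return response.choices[0].message["content"].strip()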

# Main function
if __name__ == "__main__":
@@ -125,11 +264,30 @@ def generate_output(reranked_results, queries):

    all_results = {}
    for query in generated_queries:
        scores, score_values, document_ids, metadatas, documents = vector_search(query, collection)

        all_results[query] = {
            "scores": scores,
            "score_values": score_values
        }

    logging.info(f"all_results: {all_results}")

    reranked_results = reciprocal_rank_fusion(all_results, document_ids)
    print("Final reranked results <bottom>:", reranked_results)

    final_output = generate_output(reranked_results, generated_queries, metadatas, documents)
    print(final_output)