-
Notifications
You must be signed in to change notification settings - Fork 7
/
vectorize.py
60 lines (46 loc) · 1.74 KB
/
vectorize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
To parse PDFs and save them as vectors in vector database
"""
import argparse
import os
from tqdm import tqdm
from langchain.schema import Document
from src.embeddings import build_base_embeddings
from src.llms import googlegenerativeai
from src.vectordb import load_pdf, text_split, save_faiss
from src.parser import get_title
from src.elements.raptor import Raptorizer
# Shared embedding model, used both for leaf chunks and for the final FAISS store.
BASE_EMBEDDINGS = build_base_embeddings()
# Summarization LLM for RAPTOR; the model name is hard-coded here.
LLM = googlegenerativeai("gemini-1.5-flash")
# Recursive embed/cluster/summarize helper. NOTE(review): the "gemini" string
# presumably selects a provider/prompt variant — confirm against Raptorizer.
RAPTORIZER = Raptorizer(BASE_EMBEDDINGS, LLM, "gemini")
# Output root and store-type prefix used to build each save destination.
VECTORDB_DIR = "./vectordb"
VECTORDB_TYPE = "faiss"
def main() -> None:
    """Vectorize one or more PDFs into per-file FAISS stores.

    For each ``--filepaths`` entry: load the PDF, split it into chunks,
    recursively cluster/summarize the chunks with RAPTOR, then persist
    chunks + summaries to ``VECTORDB_DIR/<type>_<pdf-basename>``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--filepaths",
        type=str,
        nargs="+",  # documented spelling of argparse.ONE_OR_MORE
        # required: previously a missing --filepaths left args.filepaths as
        # None and crashed inside tqdm with a TypeError; fail cleanly instead.
        required=True,
        help="One or more PDF files to vectorize",
    )
    args = parser.parse_args()

    for filepath in tqdm(args.filepaths):
        print(f"\nProcessing {filepath}")
        parts = load_pdf(filepath)
        title = get_title(parts)
        print(f"Title: {title}")

        docs = text_split(parts)
        # Guard: a PDF that yields no chunks would otherwise crash on docs[0].
        if not docs:
            print(f"No text extracted from {filepath}; skipping")
            continue
        leaf_texts = [doc.page_content for doc in docs]
        results = RAPTORIZER.recursive_embed_cluster_summarize(
            leaf_texts, title, level=1, n_levels=3
        )

        # Summaries inherit the first chunk's metadata minus the page number,
        # since a summary spans multiple pages.
        metadata = docs[0].metadata.copy()
        metadata.pop("page_number", None)
        summarize_docs = []
        for level in sorted(results.keys()):
            summaries = results[level][1]["summaries"].tolist()
            # Copy metadata per Document: the original shared one dict across
            # all summaries, so mutating one doc's metadata leaked into all.
            summarize_docs.extend(
                Document(page_content=text, metadata=metadata.copy())
                for text in summaries
            )
        docs.extend(summarize_docs)

        dest = os.path.join(
            VECTORDB_DIR,
            VECTORDB_TYPE + "_" + os.path.splitext(os.path.basename(filepath))[0],
        )
        save_faiss(docs, BASE_EMBEDDINGS, dest)


if __name__ == "__main__":
    main()