Colbert local mode support both as retriever and reranker. #797

Merged Jun 15, 2024 · 32 commits (showing changes from 17 commits)

Commits
9632e5e
return metadata changes
Athe-kunal Apr 4, 2024
e415f39
Merge branch 'main' of https://github.com/Athe-kunal/dspy
Athe-kunal Apr 4, 2024
a4b3844
add metadata changes
Athe-kunal Apr 4, 2024
321a768
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 5, 2024
6cd1d56
add support for returning metadata and reranking
Athe-kunal Apr 6, 2024
eeafacb
colbert integration
Athe-kunal Apr 8, 2024
1639bd2
colbert local modifications
Athe-kunal Apr 8, 2024
ec062b6
kwargs filtered ids
Athe-kunal Apr 8, 2024
987d923
colbert return
Athe-kunal Apr 8, 2024
9ff5b28
colbert retriever and reranker
Athe-kunal Apr 9, 2024
825a272
colbert retriever error fixes
Athe-kunal Apr 9, 2024
c25e9c4
colbert config changes in __init__
Athe-kunal Apr 10, 2024
ab5b12e
colbert notebook
Athe-kunal Apr 10, 2024
63dd534
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 10, 2024
f6a9293
import errors for colbert
Athe-kunal Apr 10, 2024
197a2c2
import dspy fixes and linting fixes
Athe-kunal Apr 10, 2024
4698b00
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 13, 2024
81d142f
PR fixes for colbert
Athe-kunal Apr 13, 2024
b73753c
making the linting gods happy
Athe-kunal Apr 13, 2024
0ec1ded
remove unnecessary outputs
Athe-kunal Apr 14, 2024
567d5c4
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 17, 2024
685df2a
colbertv2 docs
Athe-kunal Apr 17, 2024
fa2bc20
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 19, 2024
509b36c
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 20, 2024
34328fd
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 22, 2024
146ec7b
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 26, 2024
f0437e3
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 29, 2024
9cb522b
Colbert PR fixes
Athe-kunal Apr 29, 2024
ec4b9b3
linting fixes
Athe-kunal Apr 29, 2024
326ce01
more linting fixes
Athe-kunal Apr 29, 2024
b5913fc
fixing previous cache breaks with separate funcs
Athe-kunal Jun 8, 2024
c60fadc
Merge branch 'main' into main
arnavsinghvi11 Jun 15, 2024
2 changes: 1 addition & 1 deletion dsp/modules/__init__.py
@@ -8,7 +8,7 @@
from .cache_utils import *
from .clarifai import *
from .cohere import *
from .colbertv2 import ColBERTv2
from .colbertv2 import ColBERTv2, ColBERTv2RerankerLocal, ColBERTv2RetrieverLocal
from .databricks import *
from .google import *
from .googlevertexai import *
118 changes: 117 additions & 1 deletion dsp/modules/colbertv2.py
@@ -1,5 +1,6 @@
import functools
from typing import Any, Optional, Union
import os
from typing import Any, List, Optional, Union

import requests

@@ -74,3 +75,118 @@ def colbertv2_post_request_v2_wrapped(*args, **kwargs):


colbertv2_post_request = colbertv2_post_request_v2_wrapped
os.environ['COLBERT_LOAD_TORCH_EXTENSION_VERBOSE'] = "True"

class ColBERTv2RetrieverLocal:
def __init__(self,passages:List[str],colbert_config=None,load_only:bool=False,index_name:str="colbert_rm",checkpoint:str='colbert-ir/colbertv2.0'):
"""Colbertv2 retriever module

Args:
passages (List[str]): list of passages
load_only (bool, optional): whether to load an existing index instead of building one. Defaults to False.
index_name (str, optional): name of the index. Defaults to "colbert_rm".
checkpoint (str, optional): checkpoint for generating embeddings. Defaults to 'colbert-ir/colbertv2.0'.
colbert_config (ColBERTConfig, optional): colbert config for building and searching. Defaults to ColBERTConfig().
"""
self.checkpoint = checkpoint
self.colbert_config = colbert_config
self.colbert_config.index_name = index_name
self.checkpoint = checkpoint
self.colbert_config.checkpoint = checkpoint
self.passages = passages

if not load_only:
print(f"Building the index for experiment {self.colbert_config.experiment} with index name {self.colbert_config.index_name}")
self.build_index()

print(f"Loading the index for experiment {self.colbert_config.experiment} with index name {self.colbert_config.index_name}")
self.searcher = self.get_index()

def build_index(self):

try:
import colbert
except ImportError:
print("Colbert not found. Please check your installation or install the module using pip install colbert-ai[faiss-gpu,torch].")

from colbert import Indexer
from colbert.infra import Run, RunConfig
with Run().context(RunConfig(nranks=self.colbert_config.nranks, experiment=self.colbert_config.experiment)):
indexer = Indexer(checkpoint=self.checkpoint, config=self.colbert_config)
indexer.index(name=self.colbert_config.index_name, collection=self.passages, overwrite=True)

def get_index(self):
try:
import colbert
except ImportError:
print("Colbert not found. Please check your installation or install the module using pip install colbert-ai[faiss-gpu,torch].")

from colbert import Searcher
from colbert.infra import Run, RunConfig

with Run().context(RunConfig(experiment=self.colbert_config.experiment)):
searcher = Searcher(index=self.colbert_config.index_name, collection=self.passages)
return searcher

def __call__(self,query:str,k:int=7,**kwargs):
import torch

if kwargs.get("filtered_pids"):
filtered_pids = kwargs.get("filtered_pids")
assert isinstance(filtered_pids, list) and all(isinstance(pid, int) for pid in filtered_pids), "The filtered pids should be a list of integers"
device = "cuda" if torch.cuda.is_available() else "cpu"
results = self.searcher.search(
query,
# Number of passages to retrieve
k=k,
# Filter function that keeps only the allowed pids
filter_fn=lambda pids: torch.tensor(
[pid for pid in pids if pid in filtered_pids],dtype=torch.int32).to(device))
else:
searcher_results = self.searcher.search(query, k=k)
results = []
for pid,rank,score in zip(*searcher_results):
results.append(dotdict({'long_text':self.searcher.collection[pid],'score':score,'pid':pid}))
return results
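The `filtered_pids` path above restricts search to an allow-list of passage ids. A minimal pure-Python sketch of that filter with made-up pids (the real `filter_fn` additionally wraps the result in a `torch.tensor` on the active device):

```python
# Hypothetical candidate pids produced by the searcher.
candidate_pids = [0, 1, 2, 3, 4, 5]
# Allow-list that would arrive via kwargs["filtered_pids"].
filtered_pids = [0, 2, 5]

# Same comprehension the filter_fn uses, minus the tensor conversion.
kept = [pid for pid in candidate_pids if pid in filtered_pids]
print(kept)  # [0, 2, 5]
```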

class ColBERTv2RerankerLocal:

def __init__(self,colbert_config=None,checkpoint:str='bert-base-uncased'):
try:
import colbert
except ImportError:
print("Colbert not found. Please check your installation or install the module using pip install colbert-ai[faiss-gpu,torch].")
"""ColBERTv2 local reranker module

Args:
checkpoint (str, optional): checkpoint for embeddings. Defaults to 'bert-base-uncased'.
colbert_config (ColBERTConfig, optional): Colbert config. Defaults to ColBERTConfig().
"""
self.colbert_config = colbert_config
self.checkpoint_name = checkpoint
self.colbert_config.checkpoint = checkpoint

# def __call__(self, *args: Any, **kwargs: Any) -> Any:
# return self.forward(*args, **kwargs)

def __call__(self,query:str,passages:List[str]=[]):
import numpy as np
from colbert.modeling.colbert import ColBERT
from colbert.modeling.tokenization.doc_tokenization import DocTokenizer
from colbert.modeling.tokenization.query_tokenization import QueryTokenizer
assert len(passages) > 0, "Passages should not be empty"
self.colbert_config.nway = len(passages)
query_tokenizer = QueryTokenizer(self.colbert_config,verbose=1)
doc_tokenizer = DocTokenizer(self.colbert_config)
query_ids,query_masks = query_tokenizer.tensorize([query])
doc_ids,doc_masks = doc_tokenizer.tensorize(passages)

col = ColBERT(self.checkpoint_name,self.colbert_config)
# col.colbert_config.nway = len(passages)
# tensor_scores = col([query_ids,query_masks],[doc_ids,doc_masks])
Q = col.query(query_ids,query_masks)
DOC_IDS,DOC_MASKS = col.doc(doc_ids,doc_masks,keep_dims='return_mask')
Q_duplicated = Q.repeat_interleave(len(passages), dim=0).contiguous()
tensor_scores = col.score(Q_duplicated,DOC_IDS,DOC_MASKS)
passage_score_arr = np.array([score.cpu().detach().numpy().tolist() for score in tensor_scores])
return passage_score_arr
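The score array returned by the reranker is consumed elsewhere in this PR via `np.argsort(scores)[::-1]`. A pure-Python sketch of that descending reorder, using hypothetical passages and scores:

```python
passages = ["doc a", "doc b", "doc c"]
scores = [0.2, 0.9, 0.5]  # hypothetical reranker scores

# Descending argsort, equivalent to np.argsort(scores)[::-1].
order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
reranked = [passages[i] for i in order]
print(reranked)  # ['doc b', 'doc c', 'doc a']
```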
84 changes: 56 additions & 28 deletions dsp/primitives/search.py
@@ -1,3 +1,4 @@
import warnings
from collections.abc import Iterable

import numpy as np
@@ -9,17 +10,21 @@ def retrieve(query: str, k: int, **kwargs) -> list[str]:
"""Retrieves passages from the RM for the query and returns the top k passages."""
if not dsp.settings.rm:
raise AssertionError("No RM is loaded.")
if not dsp.settings.reranker:
warnings.warn("If you want to use the Reranker, please use dspy.RetrieveThenRerank")
[Collaborator] can we instead handle this through logging and through a deprecation message?

[Contributor Author, Athe-kunal, Apr 13, 2024] @arnavsinghvi11, can you explain what you mean by logging? Do I have to create a Python logging object file and then log these? Sorry if this is a trivial question. I have added the deprecation warning for now.

[Collaborator] "DeprecationWarning: 'display' has been deprecated. To see all information for debugging, use 'dspy.set_log_level('debug')'. In the future this will raise an error." - feel free to reference this

[Contributor Author] The dspy logger object is not available in the dsp folder, hence I followed logging as done here for the Anthropic LM. Is there a better way to log this?

passages = dsp.settings.rm(query, k=k, **kwargs)
if not isinstance(passages, Iterable):
# it's not an iterable yet; make it one.
# TODO: we should unify the type signatures of dspy.Retriever
passages = [passages]
passages = [psg.long_text for psg in passages]
# passages = [psg.long_text for psg in passages]

if dsp.settings.reranker:
passages_cs_scores = dsp.settings.reranker(query, passages)
passages_cs_scores_sorted = np.argsort(passages_cs_scores)[::-1]
passages = [passages[idx] for idx in passages_cs_scores_sorted]
# if dsp.settings.reranker:
# passages_tracking_idx = {str(idx):psg for idx, psg in enumerate(passages)}
# passages_long_text = [psg.long_text for psg in passages]
# passages_cs_scores = dsp.settings.reranker(query, passages_long_text)
# passages_cs_scores_sorted = np.argsort(passages_cs_scores)[::-1]
# passages = [passages_long_text[idx] for idx in passages_cs_scores_sorted]

return passages

@@ -28,44 +33,67 @@ def retrieveRerankEnsemble(queries: list[str], k: int,**kwargs) -> list[str]:
if not (dsp.settings.rm and dsp.settings.reranker):
raise AssertionError("Both RM and Reranker are needed to retrieve & re-rank.")
queries = [q for q in queries if q]
passages = {}
all_queries_passages = []
for query in queries:
passages = []
retrieved_passages = dsp.settings.rm(query, k=k*3,**kwargs)
passages_cs_scores = dsp.settings.reranker(query, [psg.long_text for psg in retrieved_passages])
for idx in np.argsort(passages_cs_scores)[::-1]:
psg = retrieved_passages[idx]
passages[psg.long_text] = passages.get(psg.long_text, []) + [
passages_cs_scores[idx],
]
passages_cs_scores = dsp.settings.reranker(query,passages=[psg["long_text"] for psg in retrieved_passages])
for idx in np.argsort(passages_cs_scores)[::-1][:k]:
curr_passage = retrieved_passages[idx]
curr_passage['rerank_score'] = passages_cs_scores[idx]
passages.append(curr_passage)
all_queries_passages.append(passages)
if len(queries) == 1:
return all_queries_passages[0]
else:
return all_queries_passages
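The loop above retrieves `k*3` candidates per query, reranks them, and keeps the top `k` with a `rerank_score` attached. A self-contained sketch of that selection step, with hypothetical passage dicts and scores:

```python
# Hypothetical retrieved passages and their reranker scores.
retrieved = [{"long_text": "a"}, {"long_text": "b"}, {"long_text": "c"}]
scores = [0.1, 0.8, 0.4]
k = 2

# Indices of the k best scores, descending (like np.argsort(scores)[::-1][:k]).
order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
top = []
for i in order:
    passage = dict(retrieved[i])
    passage["rerank_score"] = scores[i]  # attach the score, as the PR does
    top.append(passage)
print(top)  # [{'long_text': 'b', 'rerank_score': 0.8}, {'long_text': 'c', 'rerank_score': 0.4}]
```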

passages = [(np.average(score), text) for text, score in passages.items()]
return [text for _, text in sorted(passages, reverse=True)[:k]]
# def retrieveRerankEnsemble(queries: list[str], k: int,**kwargs) -> list[str]:
# if not (dsp.settings.rm and dsp.settings.reranker):
# raise AssertionError("Both RM and Reranker are needed to retrieve & re-rank.")
# queries = [q for q in queries if q]
# passages = {}
# for query in queries:
# retrieved_passages = dsp.settings.rm(query, k=k*3,**kwargs)
# passages_cs_scores = dsp.settings.reranker(query, [psg.long_text for psg in retrieved_passages])
# for idx in np.argsort(passages_cs_scores)[::-1]:
# psg = retrieved_passages[idx]
# passages[psg.long_text] = passages.get(psg.long_text, []) + [
# passages_cs_scores[idx],
# ]

# passages = [(np.average(score), text) for text, score in passages.items()]
# return [text for _, text in sorted(passages, reverse=True)[:k]]

def retrieveEnsemble(queries: list[str], k: int, by_prob: bool = True,**kwargs) -> list[str]:
"""Retrieves passages from the RM for each query in queries and returns the top k passages
based on the probability or score.
"""
if not dsp.settings.rm:
raise AssertionError("No RM is loaded.")
if dsp.settings.reranker:
return retrieveRerankEnsemble(queries, k)
if not dsp.settings.reranker:
[Collaborator] logging here too

[Collaborator] as with above: "DeprecationWarning: 'display' has been deprecated. To see all information for debugging, use 'dspy.set_log_level('debug')'. In the future this will raise an error." - feel free to reference this

[Contributor Author] as with above: the dspy logger object is not available in the dsp folder, hence I followed logging as done here for the Anthropic LM. Is there a better way to log this?

warnings.warn("If you want to use the Reranker, please use dspy.RetrieveThenRerank. The reranking is ignored here.")

queries = [q for q in queries if q]

if len(queries) == 1:
return retrieve(queries[0], k, **kwargs)

passages = {}
return retrieve(queries[0], k)
all_queries_passages = []
for q in queries:
for psg in dsp.settings.rm(q, k=k * 3,**kwargs):
passages = {}
retrieved_passages = dsp.settings.rm(q, k=k * 3,**kwargs)
# for idx,psg in enumerate(retrieved_passages):
# retrieved_passages[idx]["tracking_idx"] = idx
for idx,psg in enumerate(retrieved_passages):
if by_prob:
passages[psg.long_text] = passages.get(psg.long_text, 0.0) + psg.prob
passages[(idx,psg.long_text)] = passages.get(psg.long_text, 0.0) + psg.prob
else:
passages[psg.long_text] = passages.get(psg.long_text, 0.0) + psg.score

passages = [(score, text) for text, score in passages.items()]
passages = sorted(passages, reverse=True)[:k]
passages = [text for _, text in passages]

return passages
passages[(idx,psg.long_text)] = passages.get(psg.long_text, 0.0) + psg.score
retrieved_passages[idx]["tracking_idx"] = idx
# passages = [(score, text) for text, score in passages.items()]
passages = sorted(passages.items(), key=lambda item: item[1])[:k]
# passages = sorted(passages, reverse=True)[:k]
req_indices = [psg[0][0] for psg in passages]
passages = [rp for rp in retrieved_passages if rp.get("tracking_idx") in req_indices]
all_queries_passages.append(passages)
return all_queries_passages
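Across queries the same passage can be retrieved more than once, and `retrieveEnsemble` sums its probabilities (or scores) before keeping the top k. A sketch of that aggregation with made-up hits (values chosen to be exact in binary floating point):

```python
# Hypothetical (passage_text, score) hits accumulated across queries.
hits = [("passage one", 0.25), ("passage two", 0.5), ("passage one", 0.5)]

# Sum scores for duplicate passages.
agg = {}
for text, score in hits:
    agg[text] = agg.get(text, 0.0) + score

# Rank by aggregated score, descending.
top = sorted(agg.items(), key=lambda kv: kv[1], reverse=True)
print(top)  # [('passage one', 0.75), ('passage two', 0.5)]
```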
2 changes: 2 additions & 0 deletions dspy/__init__.py
@@ -20,6 +20,8 @@
Databricks = dsp.Databricks
Cohere = dsp.Cohere
ColBERTv2 = dsp.ColBERTv2
ColBERTv2RerankerLocal = dsp.ColBERTv2RerankerLocal
ColBERTv2RetrieverLocal = dsp.ColBERTv2RetrieverLocal
Pyserini = dsp.PyseriniRetriever
Clarifai = dsp.ClarifaiLLM
Google = dsp.Google
2 changes: 1 addition & 1 deletion dspy/retrieve/__init__.py
@@ -1 +1 @@
from .retrieve import Retrieve
from .retrieve import Retrieve, RetrieveThenRerank
83 changes: 79 additions & 4 deletions dspy/retrieve/retrieve.py
@@ -1,5 +1,5 @@
import random
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union

import dsp
from dspy.predict.parameter import Parameter
@@ -29,14 +29,89 @@ def load_state(self, state):
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None,**kwargs) -> Prediction:
def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None,**kwargs) -> Union[Prediction,List[Prediction]]:
queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries
queries = [query.strip().split('\n')[0].strip() for query in queries]

# print(queries)
# TODO: Consider removing any quote-like markers that surround the query too.
k = k if k is not None else self.k
passages = dsp.retrieveEnsemble(queries, k=k,**kwargs)
return Prediction(passages=passages)

if isinstance(passages[0],List):
pred_returns = []
for query_passages in passages:
passages_dict = {key:[] for key in list(query_passages[0].keys()) if key!="tracking_idx"}
for psg in query_passages:
for key,value in psg.items():
if key == "tracking_idx": continue
passages_dict[key].append(value)
if "long_text" in passages_dict:
passages_dict["passages"] = passages_dict.pop("long_text")
pred_returns.append(Prediction(**passages_dict))
return pred_returns
elif isinstance(passages[0], Dict):
# passages dict will contain {"long_text": long_text_list, "metadatas": metadatas_list, ...}
passages_dict = {key:[] for key in list(passages[0].keys())}

for psg in passages:
for key,value in psg.items():
passages_dict[key].append(value)
if "long_text" in passages_dict:
passages_dict["passages"] = passages_dict.pop("long_text")
return Prediction(**passages_dict)
# elif isinstance(passages,List):
# return Prediction(passages=passages)
# TODO: Consider doing Prediction.from_completions with the individual sets of passages (per query) too.
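`forward` transposes each list of passage dicts into a dict of lists and renames `long_text` to `passages` before building the `Prediction`. A standalone sketch of that reshaping, with hypothetical field values:

```python
query_passages = [
    {"long_text": "first passage", "score": 0.9},
    {"long_text": "second passage", "score": 0.4},
]

# Dict-of-lists transposition (the real forward also drops "tracking_idx").
passages_dict = {key: [] for key in query_passages[0]}
for psg in query_passages:
    for key, value in psg.items():
        passages_dict[key].append(value)
if "long_text" in passages_dict:
    passages_dict["passages"] = passages_dict.pop("long_text")

print(passages_dict)  # {'score': [0.9, 0.4], 'passages': ['first passage', 'second passage']}
```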

class RetrieveThenRerank(Parameter):
name = "Search"
input_variable = "query"
desc = "takes a search query and returns one or more potentially relevant passages followed by reranking from a corpus"

def __init__(self, k=3):
self.stage = random.randbytes(8).hex()
self.k = k

def reset(self):
pass

def dump_state(self):
state_keys = ["k"]
return {k: getattr(self, k) for k in state_keys}

def load_state(self, state):
for name, value in state.items():
setattr(self, name, value)

# def __call__(self, *args, **kwargs):
# return self.forward(*args, **kwargs)

def __call__(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None,**kwargs) -> Union[Prediction,List[Prediction]]:
queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries
queries = [query.strip().split('\n')[0].strip() for query in queries]

# print(queries)
# TODO: Consider removing any quote-like markers that surround the query too.
k = k if k is not None else self.k
passages = dsp.retrieveRerankEnsemble(queries, k=k,**kwargs)
[Collaborator] could we maintain the forward pass call from before and abstract the repetitive code from below within the forward pass?

[Contributor Author] Sorry, I was not able to understand this, @arnavsinghvi11. Do you want a common utility function for both Retrieve and RetrieveThenRerank to process the returned documents?

[Collaborator] Hi @Athe-kunal, yeah, it seemed like there is some repetitive code in both forward passes that can be abstracted out for the different retriever types. Let me know if this change makes sense.

[Contributor Author] Hi @arnavsinghvi11, I have abstracted the repetitive code. However, there are some nuances in the multi-query retriever, hence I didn't make a helper function for it. But for a single query, I have added a helper function single_query_passage. Please let me know if I need to make other changes.

if isinstance(passages[0],List):
pred_returns = []
for query_passages in passages:
passages_dict = {key:[] for key in list(query_passages[0].keys())}
for docs in query_passages:
for key,value in docs.items():
passages_dict[key].append(value)
if "long_text" in passages_dict:
passages_dict["passages"] = passages_dict.pop("long_text")

pred_returns.append(Prediction(**passages_dict))
return pred_returns
elif isinstance(passages[0], Dict):
passages_dict = {key:[] for key in list(passages[0].keys())}
for docs in passages:
for key,value in docs.items():
passages_dict[key].append(value)
if "long_text" in passages_dict:
passages_dict["passages"] = passages_dict.pop("long_text")
return Prediction(**passages_dict)
