Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Colbert local mode support both as retriever and reranker. #797

Merged
merged 32 commits into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9632e5e
return metadata changes
Athe-kunal Apr 4, 2024
e415f39
Merge branch 'main' of https://github.com/Athe-kunal/dspy
Athe-kunal Apr 4, 2024
a4b3844
add metadata changes
Athe-kunal Apr 4, 2024
321a768
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 5, 2024
6cd1d56
add support for returning metadata and reranking
Athe-kunal Apr 6, 2024
eeafacb
colbert integration
Athe-kunal Apr 8, 2024
1639bd2
colbert local modifications
Athe-kunal Apr 8, 2024
ec062b6
kwargs filtered ids
Athe-kunal Apr 8, 2024
987d923
colbert return
Athe-kunal Apr 8, 2024
9ff5b28
colbert retriever and reranker
Athe-kunal Apr 9, 2024
825a272
colbert retriever error fixes
Athe-kunal Apr 9, 2024
c25e9c4
colbert config changes in __init__
Athe-kunal Apr 10, 2024
ab5b12e
colbert notebook
Athe-kunal Apr 10, 2024
63dd534
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 10, 2024
f6a9293
import errors for colbert
Athe-kunal Apr 10, 2024
197a2c2
import dspy fixes and linting fixes
Athe-kunal Apr 10, 2024
4698b00
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 13, 2024
81d142f
PR fixes for colbert
Athe-kunal Apr 13, 2024
b73753c
making the linting gods happy
Athe-kunal Apr 13, 2024
0ec1ded
remove unnecessary outputs
Athe-kunal Apr 14, 2024
567d5c4
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 17, 2024
685df2a
colbertv2 docs
Athe-kunal Apr 17, 2024
fa2bc20
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 19, 2024
509b36c
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 20, 2024
34328fd
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 22, 2024
146ec7b
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 26, 2024
f0437e3
Merge branch 'stanfordnlp:main' into main
Athe-kunal Apr 29, 2024
9cb522b
Colbert PR fixes
Athe-kunal Apr 29, 2024
ec4b9b3
linting fixes
Athe-kunal Apr 29, 2024
326ce01
more linting fixes
Athe-kunal Apr 29, 2024
b5913fc
fixing previous cache breaks with separate funcs
Athe-kunal Jun 8, 2024
c60fadc
Merge branch 'main' into main
arnavsinghvi11 Jun 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
PR fixes for colbert
  • Loading branch information
Athe-kunal committed Apr 13, 2024
commit 81d142f2aea3d162a00051c4efabed3eabf96740
42 changes: 22 additions & 20 deletions dsp/modules/colbertv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,23 @@ def colbertv2_post_request_v2_wrapped(*args, **kwargs):


colbertv2_post_request = colbertv2_post_request_v2_wrapped
os.environ['COLBERT_LOAD_TORCH_EXTENSION_VERBOSE'] = "True"

class ColBERTv2RetrieverLocal:
def __init__(self,passages:List[str],colbert_config=None,load_only:bool=False,index_name:str="colbert_rm",checkpoint:str='colbert-ir/colbertv2.0'):
def __init__(self,passages:List[str],colbert_config=None,load_only:bool=False):
"""Colbertv2 retriever module

Args:
passages (List[str]): list of passages
load_only (bool, optional): whether to load the index or . Defaults to False.
index_name (str, optional): name of the index. Defaults to "colbert_rm".
checkpoint (str, optional): checkpoint for generating embeddings. Defaults to 'colbert-ir/colbertv2.0'.
colbert_config (ColBERTConfig, optional): colbert config for building and searching. Defaults to ColBERTConfig().
colbert_config (ColBERTConfig, optional): colbert config for building and searching. Defaults to None.
load_only (bool, optional): whether to load the index or build and then load. Defaults to False.
"""
self.checkpoint = checkpoint
assert colbert_config is not None, "Please pass a valid colbert_config, which you can import from colbert.infra.config import ColBERTConfig and modify it"
self.colbert_config = colbert_config
self.colbert_config.index_name = index_name
self.checkpoint = checkpoint
self.colbert_config.checkpoint = checkpoint

assert self.colbert_config.checkpoint is not None, "Please pass a valid checkpoint like colbert-ir/colbertv2.0, which you can modify in the ColBERTConfig with attribute name checkpoint"
self.passages = passages

assert self.colbert_config.index_name is not None, "Please pass a valid index_name, which you can modify in the ColBERTConfig with attribute name index_name"
self.passages = passages

if not load_only:
Expand All @@ -112,7 +111,7 @@ def build_index(self):
from colbert import Indexer
from colbert.infra import Run, RunConfig
with Run().context(RunConfig(nranks=self.colbert_config.nranks, experiment=self.colbert_config.experiment)):
indexer = Indexer(checkpoint=self.checkpoint, config=self.colbert_config)
indexer = Indexer(checkpoint=self.colbert_config.checkpoint, config=self.colbert_config)
indexer.index(name=self.colbert_config.index_name, collection=self.passages, overwrite=True)

def get_index(self):
Expand All @@ -128,7 +127,10 @@ def get_index(self):
searcher = Searcher(index=self.colbert_config.index_name, collection=self.passages)
return searcher

def __call__(self,query:str,k:int=7,**kwargs):
def __call__(self, *args: Any, **kwargs: Any) -> Any:
return self.forward(*args, **kwargs)

def forward(self,query:str,k:int=7,**kwargs):
import torch

if kwargs.get("filtered_pids"):
Expand All @@ -146,7 +148,7 @@ def __call__(self,query:str,k:int=7,**kwargs):
searcher_results = self.searcher.search(query, k=k)
results = []
for pid,rank,score in zip(*searcher_results):
results.append(dotdict({'long_text':self.searcher.collection[pid],'score':score,'pid':pid}))
results.append(dotdict({'long_text':self.searcher.collection[pid],'pid':pid}))
return results

class ColBERTv2RerankerLocal:
Expand All @@ -159,31 +161,31 @@ def __init__(self,colbert_config=None,checkpoint:str='bert-base-uncased'):
"""_summary_

Args:
colbert_config (ColBERTConfig, optional): Colbert config. Defaults to None.
checkpoint_name (str, optional): checkpoint for embeddings. Defaults to 'bert-base-uncased'.
colbert_config (ColBERTConfig, optional): Colbert config. Defaults to ColBERTConfig().
"""
self.colbert_config = colbert_config
self.checkpoint_name = checkpoint
self.colbert_config.checkpoint = checkpoint

# def __call__(self, *args: Any, **kwargs: Any) -> Any:
# return self.forward(*args, **kwargs)
def __call__(self, *args: Any, **kwargs: Any) -> Any:
return self.forward(*args, **kwargs)

def forward(self,query:str,passages:List[str]=[]):
assert len(passages) > 0, "Passages should not be empty"

def __call__(self,query:str,passages:List[str]=[]):
import numpy as np
from colbert.modeling.colbert import ColBERT
Athe-kunal marked this conversation as resolved.
Show resolved Hide resolved
from colbert.modeling.tokenization.doc_tokenization import DocTokenizer
from colbert.modeling.tokenization.query_tokenization import QueryTokenizer
assert len(passages) > 0, "Passages should not be empty"

self.colbert_config.nway = len(passages)
query_tokenizer = QueryTokenizer(self.colbert_config,verbose=1)
doc_tokenizer = DocTokenizer(self.colbert_config)
query_ids,query_masks = query_tokenizer.tensorize([query])
doc_ids,doc_masks = doc_tokenizer.tensorize(passages)

col = ColBERT(self.checkpoint_name,self.colbert_config)
# col.colbert_config.nway = len(passages)
# tensor_scores = col([query_ids,query_masks],[doc_ids,doc_masks])
Q = col.query(query_ids,query_masks)
DOC_IDS,DOC_MASKS = col.doc(doc_ids,doc_masks,keep_dims='return_mask')
Q_duplicated = Q.repeat_interleave(len(passages), dim=0).contiguous()
Expand Down
33 changes: 2 additions & 31 deletions dsp/primitives/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,12 @@ def retrieve(query: str, k: int, **kwargs) -> list[str]:
if not dsp.settings.rm:
raise AssertionError("No RM is loaded.")
if not dsp.settings.reranker:
warnings.warn("If you want to use the Reranker, please use dspy.RetrieveThenRerank")
warnings.warn("If you want to use the Reranker, please use dspy.RetrieveThenRerank",DeprecationWarning)
passages = dsp.settings.rm(query, k=k, **kwargs)
if not isinstance(passages, Iterable):
# it's not an iterable yet; make it one.
# TODO: we should unify the type signatures of dspy.Retriever
passages = [passages]
# passages = [psg.long_text for psg in passages]

# if dsp.settings.reranker:
# passages_tracking_idx = {str(idx):psg for idx, psg in enumerate(passages)}
# passages_long_text = [psg.long_text for psg in passages]
# passages_cs_scores = dsp.settings.reranker(query, passages_long_text)
# passages_cs_scores_sorted = np.argsort(passages_cs_scores)[::-1]
# passages = [passages_long_text[idx] for idx in passages_cs_scores_sorted]

return passages

Expand All @@ -48,31 +40,14 @@ def retrieveRerankEnsemble(queries: list[str], k: int,**kwargs) -> list[str]:
else:
return all_queries_passages

# def retrieveRerankEnsemble(queries: list[str], k: int,**kwargs) -> list[str]:
# if not (dsp.settings.rm and dsp.settings.reranker):
# raise AssertionError("Both RM and Reranker are needed to retrieve & re-rank.")
# queries = [q for q in queries if q]
# passages = {}
# for query in queries:
# retrieved_passages = dsp.settings.rm(query, k=k*3,**kwargs)
# passages_cs_scores = dsp.settings.reranker(query, [psg.long_text for psg in retrieved_passages])
# for idx in np.argsort(passages_cs_scores)[::-1]:
# psg = retrieved_passages[idx]
# passages[psg.long_text] = passages.get(psg.long_text, []) + [
# passages_cs_scores[idx],
# ]

# passages = [(np.average(score), text) for text, score in passages.items()]
# return [text for _, text in sorted(passages, reverse=True)[:k]]

def retrieveEnsemble(queries: list[str], k: int, by_prob: bool = True,**kwargs) -> list[str]:
"""Retrieves passages from the RM for each query in queries and returns the top k passages
based on the probability or score.
"""
if not dsp.settings.rm:
raise AssertionError("No RM is loaded.")
if not dsp.settings.reranker:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logging here too

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as with above -

"DeprecationWarning: 'display' has been deprecated. To see all information for debugging, use 'dspy.set_log_level('debug')'. In the future this will raise an error.",

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as with above-
The dspy logger object is not available in the dsp folder, hence I followed logging as done here for anthropic LM. Is there a better way to log this?

warnings.warn("If you want to use the Reranker, please use dspy.RetrieveThenRerank. The reranking is ignored here.")
warnings.warn("If you want to use the Reranker, please use dspy.RetrieveThenRerank. The reranking is ignored here.",DeprecationWarning)

queries = [q for q in queries if q]

Expand All @@ -82,17 +57,13 @@ def retrieveEnsemble(queries: list[str], k: int, by_prob: bool = True,**kwargs)
for q in queries:
passages = {}
retrieved_passages = dsp.settings.rm(q, k=k * 3,**kwargs)
# for idx,psg in enumerate(retrieved_passages):
# retrieved_passages[idx]["tracking_idx"] = idx
for idx,psg in enumerate(retrieved_passages):
if by_prob:
passages[(idx,psg.long_text)] = passages.get(psg.long_text, 0.0) + psg.prob
else:
passages[(idx,psg.long_text)] = passages.get(psg.long_text, 0.0) + psg.score
retrieved_passages[idx]["tracking_idx"] = idx
# passages = [(score, text) for text, score in passages.items()]
passages = sorted(passages.items(), key=lambda item: item[1])[:k]
# passages = sorted(passages, reverse=True)[:k]
req_indices = [psg[0][0] for psg in passages]
passages = [rp for rp in retrieved_passages if rp.get("tracking_idx") in req_indices]
all_queries_passages.append(passages)
Expand Down
9 changes: 4 additions & 5 deletions dspy/retrieve/retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = No
if "long_text" in passages_dict:
passages_dict["passages"] = passages_dict.pop("long_text")
return Prediction(**passages_dict)
# elif isinstance(passages,List):
# return Prediction(passages=passages)

# TODO: Consider doing Prediction.from_completions with the individual sets of passages (per query) too.

class RetrieveThenRerank(Parameter):
Expand All @@ -83,10 +82,10 @@ def load_state(self, state):
for name, value in state.items():
setattr(self, name, value)

# def __call__(self, *args, **kwargs):
# return self.forward(*args, **kwargs)
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

def __call__(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None,**kwargs) -> Union[Prediction,List[Prediction]]:
def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None,**kwargs) -> Union[Prediction,List[Prediction]]:
queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries
queries = [query.strip().split('\n')[0].strip() for query in queries]

Expand Down
Loading
Loading