fix: get_documents_by_id should return docs for all passed ids (deeps…

…et-ai#2064) * doc store should return all documents matching ids passed to get_documents_by_id * test for get_document_by_id should be named correctly * add test for get_documents_by_id * Add latest docstring and tutorial changes * document es query limit * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
jamescalam · Jan 26, 2022 · 5b7e906 · 5b7e906
1 parent 0f34983
commit 5b7e906
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 4 deletions.
diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
@@ -322,7 +322,8 @@ Fetch a document by specifying its text id string
  | get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
 ```
 
-Fetch documents by specifying a list of text id strings
+Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
+to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
 
 <a name="elasticsearch.ElasticsearchDocumentStore.get_metadata_values_by_key"></a>
 #### get\_metadata\_values\_by\_key

diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
@@ -363,9 +363,12 @@ def get_document_by_id(self, id: str, index: Optional[str] = None, headers: Opti
  return None
 
  def get_documents_by_id(self, ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]:
- """Fetch documents by specifying a list of text id strings"""
+ """
+ Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
+ to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
+ """
  index = index or self.index
- query = {"query": {"ids": {"values": ids}}}
+ query = {"size": len(ids), "query": {"ids": {"values": ids}}}
  result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"]
  documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
  return documents

diff --git a/test/test_document_store.py b/test/test_document_store.py
@@ -186,13 +186,30 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
  assert len(documents) == 0
 
 
-def test_get_documents_by_id(document_store_with_docs):
+def test_get_document_by_id(document_store_with_docs):
  documents = document_store_with_docs.get_all_documents()
  doc = document_store_with_docs.get_document_by_id(documents[0].id)
  assert doc.id == documents[0].id
  assert doc.content == documents[0].content
 
 
+def test_get_documents_by_id(document_store):
+ # generate more documents than the elasticsearch default query size limit of 10
+ docs_to_generate = 15
+ documents = [{'content': 'doc-' + str(i)} for i in range(docs_to_generate)]
+ doc_idx = 'green_fields'
+ document_store.write_documents(documents, index=doc_idx)
+
+ all_docs = document_store.get_all_documents(index=doc_idx)
+ all_ids = [doc.id for doc in all_docs]
+
+ retrieved_by_id = document_store.get_documents_by_id(all_ids, index=doc_idx)
+ retrieved_ids = [doc.id for doc in retrieved_by_id]
+
+ # all documents in the index should be retrieved when passing all document ids in the index
+ assert set(retrieved_ids) == set(all_ids)
+
+
 def test_get_document_count(document_store):
  documents = [
  {"content": "text1", "id": "1", "meta_field_for_count": "a"},