diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index e02501a490..ef1e03781f 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -322,7 +322,8 @@ Fetch a document by specifying its text id string
  | get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
 ```
 
-Fetch documents by specifying a list of text id strings
+Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
+to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
 
 #### get\_metadata\_values\_by\_key
 
diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index cac893d33f..04ec2b4fd9 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -363,9 +363,12 @@ def get_document_by_id(self, id: str, index: Optional[str] = None, headers: Opti
             return None
 
     def get_documents_by_id(self, ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]:
-        """Fetch documents by specifying a list of text id strings"""
+        """
+        Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
+        to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
+        """
         index = index or self.index
-        query = {"query": {"ids": {"values": ids}}}
+        query = {"size": len(ids), "query": {"ids": {"values": ids}}}
         result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"]
         documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
         return documents
diff --git a/test/test_document_store.py b/test/test_document_store.py
index 16fbe005e7..e960641b36 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -186,13 +186,30 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
     assert len(documents) == 0
 
 
-def test_get_documents_by_id(document_store_with_docs):
+def test_get_document_by_id(document_store_with_docs):
     documents = document_store_with_docs.get_all_documents()
     doc = document_store_with_docs.get_document_by_id(documents[0].id)
     assert doc.id == documents[0].id
     assert doc.content == documents[0].content
 
 
+def test_get_documents_by_id(document_store):
+    # generate more documents than the elasticsearch default query size limit of 10
+    docs_to_generate = 15
+    documents = [{'content': 'doc-' + str(i)} for i in range(docs_to_generate)]
+    doc_idx = 'green_fields'
+    document_store.write_documents(documents, index=doc_idx)
+
+    all_docs = document_store.get_all_documents(index=doc_idx)
+    all_ids = [doc.id for doc in all_docs]
+
+    retrieved_by_id = document_store.get_documents_by_id(all_ids, index=doc_idx)
+    retrieved_ids = [doc.id for doc in retrieved_by_id]
+
+    # all documents in the index should be retrieved when passing all document ids in the index
+    assert set(retrieved_ids) == set(all_ids)
+
+
 def test_get_document_count(document_store):
     documents = [
         {"content": "text1", "id": "1", "meta_field_for_count": "a"},