Skip to content

Commit

Permalink
fix: get_documents_by_id should return docs for all passed ids (deeps…
Browse files Browse the repository at this point in the history
…et-ai#2064)

* doc store should return all documents matching ids passed to get_documents_by_id

* test for get_document_by_id should be named correctly

* add test for get_documents_by_id

* Add latest docstring and tutorial changes

* document es query limit

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
mathislucka and github-actions[bot] committed Jan 26, 2022
1 parent 0f34983 commit 5b7e906
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 4 deletions.
3 changes: 2 additions & 1 deletion docs/_src/api/api/document_store.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,8 @@ Fetch a document by specifying its text id string
| get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Fetch documents by specifying a list of text id strings
Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.

<a name="elasticsearch.ElasticsearchDocumentStore.get_metadata_values_by_key"></a>
#### get\_metadata\_values\_by\_key
Expand Down
7 changes: 5 additions & 2 deletions haystack/document_stores/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,9 +363,12 @@ def get_document_by_id(self, id: str, index: Optional[str] = None, headers: Opti
return None

def get_documents_by_id(self, ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]:
"""Fetch documents by specifying a list of text id strings"""
"""
Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
"""
index = index or self.index
query = {"query": {"ids": {"values": ids}}}
query = {"size": len(ids), "query": {"ids": {"values": ids}}}
result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"]
documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
return documents
Expand Down
19 changes: 18 additions & 1 deletion test/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,30 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
assert len(documents) == 0


def test_get_documents_by_id(document_store_with_docs):
def test_get_document_by_id(document_store_with_docs):
documents = document_store_with_docs.get_all_documents()
doc = document_store_with_docs.get_document_by_id(documents[0].id)
assert doc.id == documents[0].id
assert doc.content == documents[0].content


def test_get_documents_by_id(document_store):
# generate more documents than the elasticsearch default query size limit of 10
docs_to_generate = 15
documents = [{'content': 'doc-' + str(i)} for i in range(docs_to_generate)]
doc_idx = 'green_fields'
document_store.write_documents(documents, index=doc_idx)

all_docs = document_store.get_all_documents(index=doc_idx)
all_ids = [doc.id for doc in all_docs]

retrieved_by_id = document_store.get_documents_by_id(all_ids, index=doc_idx)
retrieved_ids = [doc.id for doc in retrieved_by_id]

# all documents in the index should be retrieved when passing all document ids in the index
assert set(retrieved_ids) == set(all_ids)


def test_get_document_count(document_store):
documents = [
{"content": "text1", "id": "1", "meta_field_for_count": "a"},
Expand Down

0 comments on commit 5b7e906

Please sign in to comment.