Skip to content

Commit

Permalink
Fixing broken BM25 support with Weaviate - fixes #3720 (#3723)
Browse files Browse the repository at this point in the history
* Fixing broken BM25 support with Weaviate - fixes #3720

Unfortunately the BM25 support with Weaviate got broken with Haystack v1.11.0+, which is getting fixed with this commit.

Please see more under issue #3720.

* Fixing mypy issue - method signature wasn't matching the base class

* Mypy related test fix

Mypy forced me to set the signature of the `query` method of the Weaviate document store to the same as its parent, the `KeywordDocumentStore`, where the `query` parame is `Optional`, but has NO default value, so it must be provided (as None) at runtime.
I am not quite sure why the abstract method's `query` param was set without a default value while its type is `Optional`, but I didn't want to change that, so instead I have changed the Weaviate tests.

* Adding a note regarding an upcomming fix in Weaviate v1.17.0

* Apply suggestions from code review

* revert

* [EMPTY] Re-trigger CI
  • Loading branch information
zoltan-fedor committed Dec 19, 2022
1 parent 56803e5 commit e143f7c
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 12 deletions.
138 changes: 133 additions & 5 deletions haystack/document_stores/weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
_optional_component_not_installed(__name__, "weaviate", ie)

from haystack.schema import Document, FilterType, Label
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores import KeywordDocumentStore
from haystack.document_stores.base import get_batches_from_generator
from haystack.document_stores.filter_utils import LogicalFilterClause
from haystack.document_stores.utils import convert_date_to_rfc3339
from haystack.errors import DocumentStoreError
from haystack.errors import DocumentStoreError, HaystackError
from haystack.nodes.retriever import DenseRetriever


Expand All @@ -34,7 +34,7 @@ class WeaviateDocumentStoreError(DocumentStoreError):
pass


class WeaviateDocumentStore(BaseDocumentStore):
class WeaviateDocumentStore(KeywordDocumentStore):
"""
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
Expand Down Expand Up @@ -849,13 +849,13 @@ def get_all_documents_generator(

def query(
self,
query: Optional[str] = None,
query: Optional[str],
filters: Optional[FilterType] = None,
top_k: int = 10,
all_terms_must_match: bool = False,
custom_query: Optional[str] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
all_terms_must_match: bool = False,
scale_score: bool = True,
) -> List[Document]:
"""
Expand Down Expand Up @@ -1034,6 +1034,134 @@ def query(

return documents

def query_batch(
self,
queries: List[str],
filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
all_terms_must_match: bool = False,
scale_score: bool = True,
) -> List[List[Document]]:
"""
Scan through documents in DocumentStore and return a small number documents
that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
This method lets you find relevant documents for a single query string (output: List of Documents), or a
a list of query strings (output: List of Lists of Documents).
:param queries: Single query or list of queries.
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return per query.
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
:param all_terms_must_match: Whether all terms of the query must match the document.
If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
Defaults to False.
:param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
"""
# TODO - This method currently just calls query multiple times. Adapt this once there is a batch querying
# endpoint in Weaviate, which is currently not available,
# see https://stackoverflow.com/questions/71558676/does-weaviate-support-bulk-query#comment126569547_71561939

documents = []

if isinstance(filters, list):
if len(filters) != len(queries):
raise HaystackError(
"Number of filters does not match number of queries. Please provide as many filters"
" as queries or a single filter that will be applied to each query."
)
else:
filters = [filters] * len(queries) if filters is not None else [{}] * len(queries)

# run each query against Weaviate separately and combine the returned documents
for query, cur_filters in zip(queries, filters):
cur_docs = self.query(
query=query,
filters=cur_filters,
top_k=top_k,
custom_query=custom_query,
index=index,
headers=headers,
all_terms_must_match=all_terms_must_match,
scale_score=scale_score,
)
documents.append(cur_docs)

return documents

def query_by_embedding(
self,
query_emb: np.ndarray,
Expand Down
6 changes: 3 additions & 3 deletions test/document_stores/test_weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,13 +184,13 @@ def test_query(self, ds, documents):
# docs = ds.query(query_text, filters={"name": ["name_1"]})
# assert len(docs) == 1

docs = ds.query(filters={"name": ["name_0"]})
docs = ds.query(query=None, filters={"name": ["name_0"]})
assert len(docs) == 3

docs = ds.query(filters={"content": [query_text.lower()]})
docs = ds.query(query=None, filters={"content": [query_text.lower()]})
assert len(docs) == 3

docs = ds.query(filters={"content": ["baz"]})
docs = ds.query(query=None, filters={"content": ["baz"]})
assert len(docs) == 3

@pytest.mark.integration
Expand Down
33 changes: 29 additions & 4 deletions test/nodes/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
("embedding", "milvus"),
("bm25", "elasticsearch"),
("bm25", "memory"),
("bm25", "weaviate"),
("es_filter_only", "elasticsearch"),
("tfidf", "memory"),
],
indirect=True,
Expand All @@ -55,10 +57,23 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
if not isinstance(retriever_with_docs, (BM25Retriever, TfidfRetriever)):
document_store_with_docs.update_embeddings(retriever_with_docs)

res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
assert res[0].content == "My name is Carla and I live in Berlin"
assert len(res) == 5
assert res[0].meta["name"] == "filename1"
# NOTE: FilterRetriever simply returns all documents matching a filter,
# so without filters applied it does nothing
if not isinstance(retriever_with_docs, FilterRetriever):
# the BM25 implementation in Weaviate would NOT pick up the expected records
# just with the "Who lives in Berlin?" query, but would return empty results,
# (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching
# This was caused by lack of stemming and casing in Weaviate BM25 implementation
# TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released
# this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate.
# See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003
if isinstance(document_store_with_docs, WeaviateDocumentStore):
res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin")
else:
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
assert res[0].content == "My name is Carla and I live in Berlin"
assert len(res) == 5
assert res[0].meta["name"] == "filename1"


@pytest.mark.parametrize(
Expand All @@ -71,6 +86,8 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
("embedding", "elasticsearch"),
("embedding", "memory"),
("bm25", "elasticsearch"),
# TODO - add once Weaviate starts supporting filters with BM25 in Weaviate v1.18+
# ("bm25", "weaviate"),
("es_filter_only", "elasticsearch"),
],
indirect=True,
Expand Down Expand Up @@ -197,6 +214,14 @@ def test_batch_retrieval_multiple_queries_with_filters(retriever_with_docs, docu
if not isinstance(retriever_with_docs, (BM25Retriever, FilterRetriever)):
document_store_with_docs.update_embeddings(retriever_with_docs)

# Weaviate does not support BM25 with filters yet, only after Weaviate v1.18.0
# TODO - remove this once Weaviate starts supporting BM25 WITH filters
# You might also need to modify the first query, as Weaviate having problems with
# retrieving the "My name is Carla and I live in Berlin" record just with the
# "Who lives in Berlin?" BM25 query
if isinstance(document_store_with_docs, WeaviateDocumentStore):
return

res = retriever_with_docs.retrieve_batch(
queries=["Who lives in Berlin?", "Who lives in New York?"], filters=[{"name": "filename1"}, None]
)
Expand Down

0 comments on commit e143f7c

Please sign in to comment.