-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve the accuracy of BM25 score when multiple data parts are involved
- Loading branch information
Mochi Xu
authored and
Shanfeng Pang
committed
Apr 22, 2024
1 parent
abbf1f8
commit 40f8339
Showing
16 changed files
with
412 additions
and
16 deletions.
There are no files selected for viewing
Submodule tantivy_search
updated
from 06a58e to eaee87
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#include <VectorIndex/Common/BM25InfoInDataParts.h> | ||
|
||
#include <Common/logger_useful.h> | ||
|
||
namespace DB | ||
{ | ||
|
||
#if USE_TANTIVY_SEARCH | ||
/// Returns the document counter stored for this data part.
UInt64 BM25InfoInDataPart::getTotalDocsCount() const
{
    return this->total_docs;
}
|
||
/// Returns the token counter stored for this data part.
UInt64 BM25InfoInDataPart::getTotalNumTokens() const
{
    return this->total_num_tokens;
}
|
||
/// Gives read-only access to the per-term document counts of this data part.
/// The reference points at a member, so it is valid only while this object is alive.
const RustVecDocWithFreq & BM25InfoInDataPart::getTermWithDocNums() const
{
    return this->term_with_doc_nums;
}
|
||
|
||
/// Sums getTotalDocsCount() over every data part held in this container.
/// Returns 0 when the container is empty.
UInt64 BM25InfoInDataParts::getTotalDocsCountAllParts() const
{
    UInt64 docs_in_all_parts = 0;

    for (auto part_it = begin(); part_it != end(); ++part_it)
        docs_in_all_parts += part_it->getTotalDocsCount();

    return docs_in_all_parts;
}
|
||
/// Sums getTotalNumTokens() over every data part held in this container.
/// Returns 0 when the container is empty.
UInt64 BM25InfoInDataParts::getTotalNumTokensAllParts() const
{
    UInt64 tokens_in_all_parts = 0;

    for (auto part_it = begin(); part_it != end(); ++part_it)
        tokens_in_all_parts += part_it->getTotalNumTokens();

    return tokens_in_all_parts;
}
|
||
/// Merges the per-part term statistics into one vector.
/// Entries that share the same (field id, term text) have their doc_freq values added together.
/// The result contains one entry per distinct pair, emitted in the ordered map's key order.
RustVecDocWithFreq BM25InfoInDataParts::getTermWithDocNumsAllParts() const
{
    /// Key: (field id, term text). Value: doc_freq accumulated over all parts.
    using FieldAndTerm = std::pair<UInt32, String>;
    std::map<FieldAndTerm, UInt64> docs_per_term;

    for (auto part_it = begin(); part_it != end(); ++part_it)
    {
        /// Fold every DocWithFreq entry of the current part into the accumulated counters
        for (const auto & entry : part_it->getTermWithDocNums())
        {
            FieldAndTerm key(entry.field_id, entry.term_str);
            docs_per_term[key] += entry.doc_freq;
        }
    }

    RustVecDocWithFreq merged;
    merged.reserve(docs_per_term.size());

    /// DocWithFreq is filled as {term text, field id, accumulated doc_freq}
    for (const auto & key_and_count : docs_per_term)
        merged.push_back({key_and_count.first.second, key_and_count.first.first, key_and_count.second});

    return merged;
}
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#pragma once | ||
|
||
#include <base/types.h> | ||
#include <vector> | ||
#include "config.h" | ||
|
||
#if USE_TANTIVY_SEARCH | ||
# include <tantivy_search.h> | ||
#endif | ||
|
||
namespace DB | ||
{ | ||
|
||
#if USE_TANTIVY_SEARCH | ||
|
||
using RustVecDocWithFreq = rust::cxxbridge1::Vec<DocWithFreq>; | ||
|
||
struct BM25InfoInDataPart | ||
{ | ||
UInt64 total_docs; /// Total number of documents in a data part | ||
UInt64 total_num_tokens; /// Total number of tokens from all documents in a data part | ||
RustVecDocWithFreq term_with_doc_nums; /// vector of terms with number of documents containing it | ||
|
||
BM25InfoInDataPart() = default; | ||
|
||
BM25InfoInDataPart( | ||
const UInt64 & total_docs_, | ||
const UInt64 & total_num_tokens_, | ||
const RustVecDocWithFreq & term_with_doc_nums_) | ||
: total_docs{total_docs_} | ||
, total_num_tokens{total_num_tokens_} | ||
, term_with_doc_nums{term_with_doc_nums_} | ||
{} | ||
|
||
UInt64 getTotalDocsCount() const; | ||
UInt64 getTotalNumTokens() const; | ||
const RustVecDocWithFreq & getTermWithDocNums() const; | ||
}; | ||
|
||
struct BM25InfoInDataParts: public std::vector<BM25InfoInDataPart> | ||
{ | ||
using std::vector<BM25InfoInDataPart>::vector; | ||
|
||
UInt64 getTotalDocsCountAllParts() const; | ||
UInt64 getTotalNumTokensAllParts() const; | ||
RustVecDocWithFreq getTermWithDocNumsAllParts() const; | ||
}; | ||
|
||
#endif | ||
} |
Oops, something went wrong.