Skip to content

Commit

Permalink
Support hybrid search, full-text search on tantivy inverted index
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangjmruc authored and Shanfeng Pang committed Apr 16, 2024
1 parent 0f4d5eb commit 300e872
Show file tree
Hide file tree
Showing 122 changed files with 10,155 additions and 1,703 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -338,3 +338,6 @@
[submodule "contrib/search-index"]
path = contrib/search-index
url = https://github.com/myscale/search-index.git
[submodule "rust/supercrate/libs/tantivy_search"]
path = rust/supercrate/libs/tantivy_search
url = [email protected]:mqdb/tantivy-search.git
22 changes: 22 additions & 0 deletions cmake/dbms_glob_sources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,25 @@ endmacro()
# add_headers_only(<prefix> <common_path>)
#
# Forwards the list name `${prefix}_headers`, the current source directory and
# the pattern `${common_path}/*.h` to add_glob.
# NOTE(review): add_glob is defined outside this excerpt; presumably it appends
# the files matching the pattern to the named list -- confirm against its
# definition.
# This is a macro, so any variable add_glob sets lands in the caller's scope.
macro(add_headers_only prefix common_path)
add_glob(${prefix}_headers ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h)
endmacro()

# remove_specific_headers_and_sources(<prefix> [<file>...])
#
# Removes the given files from the `${prefix}_headers` and `${prefix}_sources`
# lists and publishes the filtered lists to the calling scope.
#
# Arguments:
#   prefix - list-name prefix; `${prefix}_headers` and `${prefix}_sources` are
#            the lists that get filtered.
#   ARGN   - files to drop. A file located under CMAKE_CURRENT_SOURCE_DIR
#            (spelled relative to it or as an absolute path) is converted to a
#            path relative to CMAKE_CURRENT_SOURCE_DIR before removal; any
#            other file is removed exactly as spelled.
#
# Example:
#   remove_specific_headers_and_sources(dbms path/to/File.h path/to/File.cpp)
function(remove_specific_headers_and_sources prefix)
    set(files_to_remove ${ARGN})
    set(root_path "${CMAKE_CURRENT_SOURCE_DIR}")

    foreach(file IN LISTS files_to_remove)
        get_filename_component(full_path "${file}" ABSOLUTE)

        # Literal prefix comparison instead of a regex: a directory name may
        # contain regex metacharacters ("c++", "foo.bar", "(x86)"), which would
        # mis-match or fail to compile as a pattern. The trailing "/" on both
        # sides keeps a sibling such as "<root>2/..." from being treated as
        # living under "<root>".
        string(FIND "${full_path}/" "${root_path}/" root_prefix_pos)
        if(root_prefix_pos EQUAL 0)
            file(RELATIVE_PATH relative_path "${root_path}" "${full_path}")
            set(file_to_remove "${relative_path}")
        else()
            set(file_to_remove "${file}")
        endif()

        # Drop the entry from both lists; a list that does not contain it is
        # left unchanged.
        list(REMOVE_ITEM ${prefix}_headers "${file_to_remove}")
        list(REMOVE_ITEM ${prefix}_sources "${file_to_remove}")
    endforeach()

    # A function has its own variable scope, so hand the filtered lists back
    # to the caller explicitly.
    set(${prefix}_headers ${${prefix}_headers} PARENT_SCOPE)
    set(${prefix}_sources ${${prefix}_sources} PARENT_SCOPE)
endfunction()
2 changes: 1 addition & 1 deletion contrib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ add_contrib (protobuf-cmake protobuf)
add_contrib (openldap-cmake openldap)
add_contrib (grpc-cmake grpc)
add_contrib (msgpack-c-cmake msgpack-c)

# rust toolchain
add_contrib (corrosion-cmake corrosion)

if (ENABLE_FUZZING)
Expand Down
2 changes: 1 addition & 1 deletion contrib/corrosion
Submodule corrosion updated 209 files
2 changes: 1 addition & 1 deletion contrib/croaring
Submodule croaring updated 88 files
+8 −0 .github/dependabot.yml
+2 −2 .github/workflows/alpine.yml
+3 −3 .github/workflows/cifuzz.yml
+4 −5 .github/workflows/codeql.yml
+2 −2 .github/workflows/documentation.yml
+12 −3 .github/workflows/macos-ci.yml
+4 −4 .github/workflows/s390x.yml
+72 −0 .github/workflows/scorecard.yml
+5 −5 .github/workflows/ubuntu-ci.yml
+2 −2 .github/workflows/ubuntu-debug-sani-ci.yml
+3 −3 .github/workflows/ubuntu-gcc10-ci.yml
+4 −4 .github/workflows/ubuntu-legacy-ci.yml
+2 −2 .github/workflows/ubuntu-noexcept-ci.yml
+0 −31 .github/workflows/ubuntu-oldclang-18-ci.yml
+2 −2 .github/workflows/ubuntu-sani-ci.yml
+4 −4 .github/workflows/ubuntu-sani-thread-ci.yml
+11 −7 .github/workflows/vs16-arm-ci.yml
+25 −17 .github/workflows/vs16-ci.yml
+1 −1 .github/workflows/vs17-arm-ci.yml
+2 −2 .github/workflows/vs17-ci.yml
+2 −2 .github/workflows/vs17-clang-ci.yml
+1 −0 .gitignore
+37 −28 CMakeLists.txt
+258 −69 README.md
+23 −10 amalgamation.sh
+1 −1 benchmarks/CMakeLists.txt
+1 −1 benchmarks/array_container_benchmark.c
+1 −1 benchmarks/bitset_container_benchmark.c
+1 −1 benchmarks/run_container_benchmark.c
+52 −0 benchmarks/sparse_cases_benchmark.cpp
+24 −0 cmake/CPM.cmake
+65 −0 cpp/roaring.hh
+32 −13 cpp/roaring64map.hh
+1 −1 doxygen
+23 −3 fuzz/croaring_fuzzer.c
+18 −2 include/roaring/array_util.h
+283 −0 include/roaring/bitset/bitset.h
+146 −14 include/roaring/bitset_util.h
+41 −4 include/roaring/containers/array.h
+22 −1 include/roaring/containers/bitset.h
+123 −78 include/roaring/containers/containers.h
+24 −15 include/roaring/containers/run.h
+29 −220 include/roaring/isadetection.h
+54 −26 include/roaring/misc/configreport.h
+192 −43 include/roaring/portability.h
+75 −11 include/roaring/roaring.h
+2 −0 include/roaring/roaring_array.h
+4 −4 include/roaring/roaring_version.h
+27 −0 microbenchmarks/CMakeLists.txt
+236 −0 microbenchmarks/bench.cpp
+247 −0 microbenchmarks/bench.h
+1,011 −0 microbenchmarks/performancecounters/apple_arm_events.h
+150 −0 microbenchmarks/performancecounters/event_counter.h
+917 −0 microbenchmarks/performancecounters/ibireme.h
+101 −0 microbenchmarks/performancecounters/linux-perf-events.h
+1,075 −0 microbenchmarks/toni_ronnko_dirent.h
+32 −13 src/CMakeLists.txt
+145 −22 src/array_util.c
+456 −0 src/bitset.c
+139 −21 src/bitset_util.c
+79 −19 src/containers/array.c
+421 −87 src/containers/bitset.c
+48 −12 src/containers/containers.c
+29 −7 src/containers/convert.c
+1 −1 src/containers/mixed_equal.c
+1 −1 src/containers/mixed_subset.c
+57 −1 src/containers/mixed_union.c
+13 −1 src/containers/mixed_xor.c
+164 −9 src/containers/run.c
+308 −0 src/isadetection.c
+404 −184 src/roaring.c
+27 −27 src/roaring_array.c
+12 −12 src/roaring_priority_queue.c
+21 −7 tests/CMakeLists.txt
+183 −0 tests/array_container_unit.c
+65 −6 tests/bitset_container_unit.c
+18 −1 tests/c_example1.c
+277 −0 tests/cbitset_unit.c
+2 −2 tests/container_comparison_unit.c
+76 −0 tests/cpp_unit.cpp
+0 −2 tests/mixed_container_unit.c
+6 −0 tests/test.h
+ tests/testdata/addoffsetinput.bin
+64 −0 tests/threads_unit.cpp
+827 −492 tests/toplevel_unit.c
+27 −7 tools/cmake/FindCTargets.cmake
+8 −11 tools/cmake/FindOptions.cmake
+24 −8 tools/cmake/Import.cmake
14 changes: 8 additions & 6 deletions contrib/croaring-cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/croaring")

set(SRCS
"${LIBRARY_DIR}/src/array_util.c"
"${LIBRARY_DIR}/src/bitset_util.c"
"${LIBRARY_DIR}/src/containers/array.c"
"${LIBRARY_DIR}/src/containers/bitset.c"
"${LIBRARY_DIR}/src/containers/containers.c"
"${LIBRARY_DIR}/src/containers/convert.c"
"${LIBRARY_DIR}/src/containers/mixed_intersection.c"
"${LIBRARY_DIR}/src/containers/mixed_union.c"
"${LIBRARY_DIR}/src/containers/mixed_andnot.c"
"${LIBRARY_DIR}/src/containers/mixed_equal.c"
"${LIBRARY_DIR}/src/containers/mixed_subset.c"
"${LIBRARY_DIR}/src/containers/mixed_intersection.c"
"${LIBRARY_DIR}/src/containers/mixed_negation.c"
"${LIBRARY_DIR}/src/containers/mixed_subset.c"
"${LIBRARY_DIR}/src/containers/mixed_union.c"
"${LIBRARY_DIR}/src/containers/mixed_xor.c"
"${LIBRARY_DIR}/src/containers/mixed_andnot.c"
"${LIBRARY_DIR}/src/containers/run.c"
"${LIBRARY_DIR}/src/array_util.c"
"${LIBRARY_DIR}/src/bitset_util.c"
"${LIBRARY_DIR}/src/bitset.c"
"${LIBRARY_DIR}/src/isadetection.c"
"${LIBRARY_DIR}/src/roaring.c"
"${LIBRARY_DIR}/src/roaring_priority_queue.c"
"${LIBRARY_DIR}/src/roaring_array.c"
Expand Down
5 changes: 3 additions & 2 deletions programs/client/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ set (CLICKHOUSE_CLIENT_LINK
string_utils
)

if (TARGET ch_rust::skim)
list(APPEND CLICKHOUSE_CLIENT_LINK PRIVATE ch_rust::skim)
# skim is now built as part of the Rust supercrate, so link the supercrate target to get it.
if (TARGET ch_rust::supercrate)
list(APPEND CLICKHOUSE_CLIENT_LINK PRIVATE ch_rust::supercrate)
endif()

# Always use internal readpassphrase
Expand Down
6 changes: 4 additions & 2 deletions programs/local/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ clickhouse_program_add(local)

target_link_libraries(clickhouse-local-lib PRIVATE clickhouse-server-lib)

if (TARGET ch_rust::skim)
target_link_libraries(clickhouse-local-lib PRIVATE ch_rust::skim)
# skim is now built as part of the Rust supercrate, so link the supercrate target to get it.
if (TARGET ch_rust::supercrate)
target_link_libraries(clickhouse-local-lib PRIVATE ch_rust::supercrate)
endif()

if (TARGET ch_contrib::azure_sdk)
target_link_libraries(clickhouse-local-lib PRIVATE ch_contrib::azure_sdk)
endif()
Expand Down
63 changes: 62 additions & 1 deletion programs/server/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@
#include <VectorIndex/Common/VIBuildMemoryUsageHelper.h>
#include <VectorIndex/Common/VICommon.h>

#if USE_TANTIVY_SEARCH
# include <tantivy_search.h>
#endif

#include "config.h"
#include "config_version.h"

Expand Down Expand Up @@ -452,6 +456,50 @@ void setOOMScore(int value, Poco::Logger * log)
}
#endif

/// Logging callback with C linkage: forwards one log record to the Poco logger named "TantivyLibrary".
///
/// @param level        Severity code: -2 fatal, -1 error, 0 warning, 1 info, 2 debug, 3 trace.
///                     Any other value is logged at debug level.
/// @param thread_info  C string printed before the message; a null pointer is logged as an empty string.
/// @param message      C string with the log text; a null pointer is logged as an empty string.
///
/// The function is meant to be called through a C ABI by non-C++ code, so no C++ exception may leave it.
extern "C" void tantivy_log_callback(int level, const char * thread_info, const char * message)
{
    /// A null `const char *` must not reach the formatting macros, so substitute empty strings up front.
    const char * safe_thread_info = thread_info ? thread_info : "";
    const char * safe_message = message ? message : "";

    try
    {
        Poco::Logger & logger = Poco::Logger::get("TantivyLibrary");
        switch (level)
        {
            case -2: // -2 -> fatal
                LOG_FATAL(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
            case -1: // -1 -> error
                LOG_ERROR(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
            case 0: // 0 -> warning
                LOG_WARNING(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
            case 1: // 1 -> info
                LOG_INFO(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
            case 2: // 2 -> debug
                LOG_DEBUG(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
            case 3: // 3 -> tracing
                LOG_TRACE(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
            default: // unknown level -> debug
                LOG_DEBUG(&logger, "{} - {}", safe_thread_info, safe_message);
                break;
        }
    }
    catch (...) // NOLINT(bugprone-empty-catch)
    {
        /// Deliberately swallowed: a failure to log must not unwind through the foreign caller,
        /// and there is no safe place left to report it.
    }
}

/// Routes `tantivy_search` library logging through tantivy_log_callback.
///
/// The log level is taken from `logger.tantivy_search_log_level`; when that key is absent the value of
/// `logger.level` is used, and "info" when that is absent as well.
/// When USE_TANTIVY_SEARCH is not enabled, the configuration is read and nothing else happens.
void tantivy_log_integration(Poco::Util::AbstractConfiguration & config)
{
    /// Fallback chain: dedicated tantivy level -> general server level -> "info".
    std::string server_log_level = config.getString("logger.level", "info");
    std::string tantivy_log_level = config.getString("logger.tantivy_search_log_level", server_log_level);

#if USE_TANTIVY_SEARCH
    /// NOTE(review): the argument meanings below follow the notes that accompanied the original call;
    /// verify them against the declaration in tantivy_search.h.
    const char * log_file_path = "";          /// Path for the library's own log file (left empty).
    const bool write_to_log_file = false;     /// Whether log content is recorded to that file (off).
    const bool print_to_console = false;      /// Whether log messages are shown in the console/terminal (off).
    const bool only_tantivy_search = true;    /// Log only `tantivy_search` itself, not its submodule libraries (e.g., tantivy).

    tantivy_search_log4rs_initialize_with_callback(
        log_file_path,
        tantivy_log_level.c_str(),
        write_to_log_file,
        print_to_console,
        only_tantivy_search,
        tantivy_log_callback /// Deliver the records to the Poco-based callback.
    );
#endif
}


void Server::uninitialize()
{
Expand Down Expand Up @@ -483,7 +531,9 @@ void Server::initialize(Poco::Util::Application & self)
{
BaseDaemon::initialize(self);
logger().information("starting up");

#if USE_TANTIVY_SEARCH
tantivy_log_integration(config());
#endif
LOG_INFO(&logger(), "OS name: {}, version: {}, architecture: {}",
Poco::Environment::osName(),
Poco::Environment::osVersion(),
Expand Down Expand Up @@ -1100,6 +1150,14 @@ try
fs::create_directories(vector_index_cache_path);
}

{
#if USE_TANTIVY_SEARCH
std::string tantivy_index_cache_path = config().getString("tantivy_index_cache_path", path / "tantivy_index_cache/");
global_context->setTantivyIndexCachePath(tantivy_index_cache_path);
fs::create_directories(tantivy_index_cache_path);
#endif
}

/// top_level_domains_lists
{
const std::string & top_level_domains_path = config().getString("top_level_domains_path", path / "top_level_domains/");
Expand Down Expand Up @@ -1273,6 +1331,9 @@ try
// in a lot of places. For now, disable updating log configuration without server restart.
//setTextLog(global_context->getTextLog());
updateLevels(*config, logger());
#if USE_TANTIVY_SEARCH
tantivy_log_integration(*config);
#endif
global_context->setClustersConfig(config, has_zookeeper);
global_context->setMacros(std::make_unique<Macros>(*config, "macros", log));
global_context->setExternalAuthenticatorsConfig(*config);
Expand Down
3 changes: 0 additions & 3 deletions rust/BLAKE3/CMakeLists.txt

This file was deleted.

44 changes: 37 additions & 7 deletions rust/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,30 @@ macro(configure_rustc)
set(RUST_CFLAGS "${RUST_CFLAGS} --sysroot ${CMAKE_SYSROOT}")
endif()

if (USE_MUSL)
set(RUST_CXXFLAGS "${RUST_CXXFLAGS} -D_LIBCPP_HAS_MUSL_LIBC=1")
endif ()

if(CCACHE_EXECUTABLE MATCHES "/sccache$")
message(STATUS "Using RUSTC_WRAPPER: ${CCACHE_EXECUTABLE}")
set(RUSTCWRAPPER "rustc-wrapper = \"${CCACHE_EXECUTABLE}\"")
else()
set(RUSTCWRAPPER "")
endif()

set(RUSTFLAGS "[]")
set(RUST_CARGO_BUILD_STD "")
# For more info: https://doc.rust-lang.org/beta/unstable-book/compiler-flags/sanitizer.html#memorysanitizer
if (SANITIZE STREQUAL "memory")
set(RUST_CARGO_BUILD_STD "build-std = [\"std\", \"panic_abort\", \"core\", \"alloc\"]")
set(RUSTFLAGS "[\"-Zsanitizer=memory\", \"-Zsanitizer-memory-track-origins\"]")
endif()

message(STATUS "RUST_CFLAGS: ${RUST_CFLAGS}")
message(STATUS "RUST_CXXFLAGS: ${RUST_CXXFLAGS}")
message(STATUS "RUSTFLAGS: ${RUSTFLAGS}")
message(STATUS "RUST_CARGO_BUILD_STD: ${RUST_CARGO_BUILD_STD}")

# NOTE: requires RW access for the source dir
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/.cargo/config.toml.in" "${CMAKE_CURRENT_SOURCE_DIR}/.cargo/config.toml" @ONLY)
endmacro()
configure_rustc()

Expand All @@ -34,9 +53,20 @@ function(clickhouse_import_crate)
else()
set(CMAKE_CONFIGURATION_TYPES "${CMAKE_BUILD_TYPE};debug")
endif()
# NOTE: we may use LTO for rust too

corrosion_import_crate(NO_STD ${ARGN})
if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
set(profile "")
else()
if (ENABLE_THINLTO)
set(profile "release-thinlto")
else()
set(profile "release")
endif()
endif()

# Note, here --offline is not used, since on CI vendor archive is used, and
# passing --offline here will be inconvenient for local development.
corrosion_import_crate(NO_STD ${ARGN} PROFILE ${profile})
endfunction()

# Add crate from the build directory.
Expand All @@ -47,7 +77,6 @@ endfunction()
#
# And to avoid overlaps different builds for one source directory, crate will
# be copied from source directory to the binary directory.
file(COPY ".cargo" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
function(add_rust_subdirectory src)
set(dst "${CMAKE_CURRENT_BINARY_DIR}/${src}")
message(STATUS "Copy ${src} to ${dst}")
Expand All @@ -67,5 +96,6 @@ function(add_rust_subdirectory src)
VERBATIM)
endfunction()

add_rust_subdirectory (BLAKE3)
add_rust_subdirectory (skim)
# supercrate combines all the Rust libraries into one static library,
# which can avoid symbol conflicts that occur when ClickHouse links multiple Rust static libraries.
add_rust_subdirectory (supercrate)
48 changes: 0 additions & 48 deletions rust/skim/CMakeLists.txt

This file was deleted.

File renamed without changes.
Loading

0 comments on commit 300e872

Please sign in to comment.