jitmap is a small library providing an execution engine evaluating logical binary expressions on bitmaps. Some examples:
-
In search engines, posting lists are often represented by sequence of integers, or bitmaps. Evaluating a search term (logical expression on keywords) can be implemented by a logical expression on bitmaps.
-
In columnar databases, bitmaps are used to represent selection vectors, themselves the intermediary representation of the results of predicate filter on rows. The bitmaps are then combined in a final bitmap, the predicate expression transformed in a logical expression on bitmaps.
-
In stream system with rule engines, e.g. adtech bid filtering on campaign rules, bitmaps are used as a first-pass optimization to lower the number of campaigns' rules to evaluate on each incoming event.
jitmap compiles logical expresion into native functions that takes multiple
inputs dense bitmaps pointers, e.g. const char**
, and write the result in a
destination pointer, e.g. const char*
. The signature of such generated
functions are void fn(const char**, char*)
. The functions are then callable
in the same address space.
The following snippet shows an example of what jitmap achieves:
typedef void (*dense_eval_fn)(const char**, char*);
dense_eval_fn a_and_b = jitmap_compile("a_and_b", "a ^ b");
const char** addrs[2] = {&a, &b};
// `output` will now contains the result of `a & b` applied vertically likely
// using // vectorized instruction available on the host running this code, as
// opposed to where the code was compiled.
a_and_b(addrs, &output);
jitmap offers a small DSL language to evaluate bitwise operations on bitmaps. The language supports variables (named bitmap), empty/full literals, and basic operators (not, and, or xor).
A query takes an expression and a list of bitmaps and excute the expression on the bitmaps resulting in an new bitmap.
- Empty bitmap literal:
$0
- Full bitmap literal:
$1
- Variables (named bitmap):
[A-Za-z0-9_]+
, e.g.country
,color_red
- Not:
!e
- And:
e_1 & e_2
- Or:
e_1 | e_2
- Xor:
e_1 ^ e_2
# NOT(a)
!a
# a AND b
a & b
# $empty AND (a OR b) XOR c
($0 & (a | b) ^ c)
The jitmap-ir binary takes an expression as first input argument and dumps the generated LLVM' IR to stdout. It is useful to debug and peek at the generated code. Using LLVM command line utilies, we can also look at the expected generated assembly for any platform.
# tools/jitmap-ir "(a & b) | (c & c) | (c ^ d) | (c & b) | (d ^ a)"
; ModuleID = 'jitmap_ir'
source_filename = "jitmap_ir"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: argmemonly
define void @query(i32** nocapture readonly %inputs, i32* nocapture %output) #0 {
entry:
%bitmap_gep_0 = getelementptr inbounds i32*, i32** %inputs, i64 0
%bitmap_0 = load i32*, i32** %bitmap_gep_0
%bitmap_gep_1 = getelementptr inbounds i32*, i32** %inputs, i64 1
%bitmap_1 = load i32*, i32** %bitmap_gep_1
%bitmap_gep_2 = getelementptr inbounds i32*, i32** %inputs, i64 2
%bitmap_2 = load i32*, i32** %bitmap_gep_2
%bitmap_gep_3 = getelementptr inbounds i32*, i32** %inputs, i64 3
%bitmap_3 = load i32*, i32** %bitmap_gep_3
br label %loop
loop: ; preds = %loop, %entry
%i = phi i64 [ 0, %entry ], [ %next_i, %loop ]
%gep_0 = getelementptr inbounds i32, i32* %bitmap_0, i64 %i
%load_0 = load i32, i32* %gep_0
%gep_1 = getelementptr inbounds i32, i32* %bitmap_1, i64 %i
%load_1 = load i32, i32* %gep_1
%gep_2 = getelementptr inbounds i32, i32* %bitmap_2, i64 %i
%load_2 = load i32, i32* %gep_2
%gep_3 = getelementptr inbounds i32, i32* %bitmap_3, i64 %i
%load_3 = load i32, i32* %gep_3
%0 = and i32 %load_0, %load_1
%1 = and i32 %load_2, %load_2
%2 = or i32 %0, %1
%3 = xor i32 %load_2, %load_3
%4 = or i32 %2, %3
%5 = and i32 %load_2, %load_1
%6 = or i32 %4, %5
%7 = xor i32 %load_3, %load_0
%8 = or i32 %6, %7
%gep_output = getelementptr inbounds i32, i32* %output, i64 %i
store i32 %8, i32* %gep_output
%next_i = add i64 %i, 1
%exit_cond = icmp eq i64 %next_i, 2048
br i1 %exit_cond, label %after_loop, label %loop
after_loop: ; preds = %loop
ret void
}
attributes #0 = { argmemonly }
We can then use LLVM's opt
and llc
to transform the IR into native assembly.
# tools/jitmap-ir "(a & b) | (c & c) | (c ^ d) | (c & b) | (d ^ a)" | llc -O3
.text
.file "jitmap_ir"
.globl query # -- Begin function query
.p2align 4, 0x90
.type query,@function
query: # @query
.cfi_startproc
# %bb.0: # %entry
pushq %rbp
.cfi_def_cfa_offset 16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset %rbx, -24
.cfi_offset %rbp, -16
movq (%rdi), %r8
movq 8(%rdi), %r9
movq 16(%rdi), %r10
movq 24(%rdi), %r11
movq $-8192, %rax # imm = 0xE000
.p2align 4, 0x90
.LBB0_1: # %loop
# =>This Inner Loop Header: Depth=1
movl 8192(%r8,%rax), %ecx
movl 8192(%r9,%rax), %edx
movl 8192(%r10,%rax), %edi
movl 8192(%r11,%rax), %ebx
movl %edi, %ebp
xorl %ebx, %ebp
xorl %ecx, %ebx
andl %edx, %ecx
orl %edi, %ebp
andl %edx, %edi
orl %ebp, %edi
orl %edi, %ebx
orl %ecx, %ebx
movl %ebx, 8192(%rsi,%rax)
addq $4, %rax
jne .LBB0_1
# %bb.2: # %after_loop
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size query, .Lfunc_end0-query
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
This code is still not fully optimized, opt
is used for this.
# tools/jitmap-ir "(a & b) | (c & c) | (c ^ d) | (c & b) | (d ^ a)" | opt -O3 -S -mcpu=core-avx2| llc -O3
ninja: no work to do.
.text
.file "jitmap_ir"
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function query
.LCPI0_0:
.quad 8192 # 0x2000
.LCPI0_1:
.quad -9223372036854775808 # 0x8000000000000000
.text
.globl query
.p2align 4, 0x90
.type query,@function
query: # @query
# %bb.0: # %entry
pushq %rbp
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
# ...
# And the holy grail fully vectorized loop
.LBB0_2: # %vector.body
# =>This Inner Loop Header: Depth=1
vmovdqu (%r14,%rbx), %ymm0
vmovdqu 32(%r14,%rbx), %ymm1
vmovdqu (%r12,%rbx), %ymm2
vmovdqu 32(%r12,%rbx), %ymm3
vmovdqu (%rdi,%rbx), %ymm4
vmovdqu 32(%rdi,%rbx), %ymm5
vpand (%r15,%rbx), %ymm0, %ymm6
vpand 32(%r15,%rbx), %ymm1, %ymm7
vpor %ymm2, %ymm6, %ymm6
vpor %ymm3, %ymm7, %ymm7
vpxor %ymm2, %ymm4, %ymm2
vpxor %ymm3, %ymm5, %ymm3
vpxor %ymm0, %ymm4, %ymm0
vpor %ymm0, %ymm2, %ymm0
vpor %ymm0, %ymm6, %ymm0
vpxor %ymm1, %ymm5, %ymm1
vpor %ymm1, %ymm3, %ymm1
vpor %ymm1, %ymm7, %ymm1
vmovdqu %ymm0, (%rsi,%rbx)
vmovdqu %ymm1, 32(%rsi,%rbx)
addq $64, %rbx
cmpq $8192, %rbx # imm = 0x2000
jne .LBB0_2
.LBB0_5: # %after_loop
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
.Lfunc_end0:
.size query, .Lfunc_end0-query
# -- End function
.section ".note.GNU-stack","",@progbits