Skip to content

Commit

Permalink
Update README
Browse files Browse the repository at this point in the history
  • Loading branch information
fsaintjacques committed Dec 15, 2019
1 parent 9a2c272 commit 3f08a61
Showing 1 changed file with 179 additions and 79 deletions.
258 changes: 179 additions & 79 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,63 +1,75 @@
# jitmap: Jitted bitmaps

## Language

# Language
jitmap offers a small DSL language to evaluate bitwise operations on bitmaps.
The language supports variables (named bitmaps), empty/full literals, and basic
operators (not, and, or, xor).

A tiny expression language supporting the following C binary operators:
- *!*: not
- *&*: and
- *|*: or
- *^*: xor
Supported literals are:
- *$[0-9]+*: index reference
- *[A-Za-z0-9_]+*: named reference
A query takes an expression and a list of bitmaps and executes the expression on
the bitmaps, resulting in a new bitmap.

### Supported expressions

## Examples
- Empty bitmap literal: `$0`
- Full bitmap literal: `$1`
- Variables (named bitmap): `[A-Za-z0-9_]+`, e.g. `country`, `color_red`
- Not: `!e`
- And: `e_1 & e_2`
- Or: `e_1 | e_2`
- Xor: `e_1 ^ e_2`

### Examples
```
# NOT(a)
!a
# a AND b
a & b
($0 & ($1 | $2) ^ !$1)
# $empty AND (a OR b) XOR c
($0 & (a | b) ^ c)
```

# Generated assembly for query
## Developing/Debugging

### *jitmap-ir* tool

The *jitmap-ir* tool takes an expression as its first input argument and dumps the
generated LLVM IR to stdout. By default, this will not use vectorized instructions.

```
$ ninja && tools/jitmap-ir '($0 & ($1 & ($2 & ($3 & $4))))'
ninja: no work to do.
; ModuleID = 'query'
source_filename = "($0 & ($1 & ($2 & ($3 & $4))))"
# tools/jitmap-ir '(a & b & c & d | e ^ f)'
; ModuleID = 'jitmap-ir-module'
source_filename = "jitmap-ir-module"
; Function Attrs: argmemonly
define void @query(i64* nocapture readonly %in, i64* nocapture readonly %in1, i64* nocapture readonly %in2, i64* nocapture readonly %in3, i64* nocapture readonly %in4, i64* nocapture %out) #0 {
define void @query(i64* nocapture readonly %in, i64* nocapture readonly %in1, i64* nocapture readonly %in2, i64* nocapture readonly %in3, i64* nocapture readonly %in4, i64* nocapture readonly %in5, i64* nocapture %out) #0 {
entry:
br label %loop
loop: ; preds = %loop, %entry
%i = phi i64 [ 0, %entry ], [ %next_i, %loop ]
%gep_0 = getelementptr inbounds i64, i64* %in, i64 %i
%bitcast_0 = bitcast i64* %gep_0 to <4 x i64>*
%load_0 = load <4 x i64>, <4 x i64>* %bitcast_0
%load_0 = load i64, i64* %gep_0
%gep_1 = getelementptr inbounds i64, i64* %in1, i64 %i
%bitcast_1 = bitcast i64* %gep_1 to <4 x i64>*
%load_1 = load <4 x i64>, <4 x i64>* %bitcast_1
%load_1 = load i64, i64* %gep_1
%gep_2 = getelementptr inbounds i64, i64* %in2, i64 %i
%bitcast_2 = bitcast i64* %gep_2 to <4 x i64>*
%load_2 = load <4 x i64>, <4 x i64>* %bitcast_2
%load_2 = load i64, i64* %gep_2
%gep_3 = getelementptr inbounds i64, i64* %in3, i64 %i
%bitcast_3 = bitcast i64* %gep_3 to <4 x i64>*
%load_3 = load <4 x i64>, <4 x i64>* %bitcast_3
%load_3 = load i64, i64* %gep_3
%gep_4 = getelementptr inbounds i64, i64* %in4, i64 %i
%bitcast_4 = bitcast i64* %gep_4 to <4 x i64>*
%load_4 = load <4 x i64>, <4 x i64>* %bitcast_4
%0 = and <4 x i64> %load_3, %load_4
%1 = and <4 x i64> %load_2, %0
%2 = and <4 x i64> %load_1, %1
%3 = and <4 x i64> %load_0, %2
%load_4 = load i64, i64* %gep_4
%gep_5 = getelementptr inbounds i64, i64* %in5, i64 %i
%load_5 = load i64, i64* %gep_5
%0 = and i64 %load_0, %load_1
%1 = and i64 %0, %load_2
%2 = and i64 %1, %load_3
%3 = xor i64 %load_4, %load_5
%4 = or i64 %2, %3
%gep_output = getelementptr inbounds i64, i64* %out, i64 %i
%bitcast_output = bitcast i64* %gep_output to <4 x i64>*
store <4 x i64> %3, <4 x i64>* %bitcast_output
%next_i = add i64 %i, 4
store i64 %4, i64* %gep_output
%next_i = add i64 %i, 1
%exit_cond = icmp eq i64 %next_i, 1024
br i1 %exit_cond, label %after_loop, label %loop
Expand All @@ -68,91 +80,179 @@ after_loop: ; preds = %loop
attributes #0 = { argmemonly }
```

Use LLVM's opt and/or llc to transform the IR into native assembly.
We can then use LLVM's `llc` to transform the IR into native assembly.

```
tools/jitmap-ir '($0 & ($1 & ($2 & ($3 & $4))))' | llc-8 -O3 -mcpu=core-avx2
# tools/jitmap-ir '(a & b & c & d | e ^ f)' | llc-8 -O3 -mcpu=core-avx2
ninja: no work to do.
.text
.file "($0 & ($1 & ($2 & ($3 & $4))))"
.file "jitmap-ir-module"
.globl query # -- Begin function query
.p2align 4, 0x90
.type query,@function
query: # @query
.cfi_startproc
# %bb.0: # %entry
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset %rbx, -16
movq 16(%rsp), %r10
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_1: # %loop
# =>This Inner Loop Header: Depth=1
vmovaps (%rcx,%rax), %ymm0
vandps (%r8,%rax), %ymm0, %ymm0
vandps (%rdx,%rax), %ymm0, %ymm0
vandps (%rsi,%rax), %ymm0, %ymm0
vandps (%rdi,%rax), %ymm0, %ymm0
vmovaps %ymm0, (%r9,%rax)
addq $32, %rax
movq (%rdi,%rax), %r11
movq (%r8,%rax), %rbx
andq (%rsi,%rax), %r11
andq (%rdx,%rax), %r11
andq (%rcx,%rax), %r11
xorq (%r9,%rax), %rbx
orq %r11, %rbx
movq %rbx, (%r10,%rax)
addq $8, %rax
cmpq $8192, %rax # imm = 0x2000
jne .LBB0_1
# %bb.2: # %after_loop
vzeroupper
popq %rbx
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size query, .Lfunc_end0-query
.cfi_endproc
# -- End function
.section ".note.GNU-stack","",@progbits
```

This code is still not fully optimized; `opt` is used for this.

```
$ ninja && tools/jitmap-ir '$0 ^ $1' | opt-8 -S -O3 -mcpu=core-avx2 -mtriple=x86_64-pc-linux-gnu | llc-8 -O3 -mcpu=core-avx2
ninja: no work to do.
# tools/jitmap-ir '(a & b & c & d | e ^ f)' | opt-8 -S -O3 -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu | llc-8 -O3 -mcpu=core-avx2
.text
.file "($0 ^ $1)"
.globl query # -- Begin function query
.file "jitmap-ir-module"
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function query
.LCPI0_0:
.quad 8192 # 0x2000
.LCPI0_1:
.quad -9223372036854775808 # 0x8000000000000000
.text
.globl query
.p2align 4, 0x90
.type query,@function
query: # @query
# %bb.0: # %entry
pushq %rbp
pushq %r14
pushq %rbx
movq 32(%rsp), %r10
leaq 8192(%r10), %rbp
vmovq %rcx, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq %rsi, %xmm1
vmovq %rdi, %xmm2
vpunpcklqdq %xmm1, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm1[0]
vinserti128 $1, %xmm0, %ymm1, %ymm0
vpbroadcastq .LCPI0_0(%rip), %ymm1 # ymm1 = [8192,8192,8192,8192]
vpaddq %ymm1, %ymm0, %ymm1
leaq 8192(%r8), %rax
leaq 8192(%r9), %r11
vmovq %r10, %xmm2
vpbroadcastq %xmm2, %ymm2
vpbroadcastq .LCPI0_1(%rip), %ymm3 # ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
vpxor %ymm3, %ymm1, %ymm1
vpxor %ymm3, %ymm2, %ymm2
cmpq %r10, %rax
seta %al
vpcmpgtq %ymm2, %ymm1, %ymm1
cmpq %r8, %rbp
seta %bl
cmpq %r10, %r11
seta %r11b
cmpq %r9, %rbp
vmovq %rbp, %xmm2
vpbroadcastq %xmm2, %ymm2
vpxor %ymm3, %ymm0, %ymm0
vpxor %ymm3, %ymm2, %ymm2
vpcmpgtq %ymm0, %ymm2, %ymm0
vpand %ymm0, %ymm1, %ymm0
vextracti128 $1, %ymm0, %xmm1
seta %r14b
vpackssdw %xmm1, %xmm0, %xmm2
vpackssdw %xmm0, %xmm1, %xmm0
vpor %xmm0, %xmm2, %xmm0
vpshufd $229, %xmm0, %xmm1 # xmm1 = xmm0[1,1,2,3]
vpor %xmm1, %xmm0, %xmm0
vpextrb $0, %xmm0, %ebp
testb $1, %bpl
jne .LBB0_5
# %bb.1: # %entry
andb %bl, %al
jne .LBB0_5
# %bb.2: # %entry
andb %r14b, %r11b
jne .LBB0_5
# %bb.3: # %vector.body.preheader
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_1: # %loop
.LBB0_4: # %vector.body
# =>This Inner Loop Header: Depth=1
vmovaps (%rsi,%rax,8), %ymm0
vxorps (%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, (%rdx,%rax,8)
vmovaps 32(%rsi,%rax,8), %ymm0
vxorps 32(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 32(%rdx,%rax,8)
vmovaps 64(%rsi,%rax,8), %ymm0
vxorps 64(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 64(%rdx,%rax,8)
vmovaps 96(%rsi,%rax,8), %ymm0
vxorps 96(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 96(%rdx,%rax,8)
vmovaps 128(%rsi,%rax,8), %ymm0
vxorps 128(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 128(%rdx,%rax,8)
vmovaps 160(%rsi,%rax,8), %ymm0
vxorps 160(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 160(%rdx,%rax,8)
vmovaps 192(%rsi,%rax,8), %ymm0
vxorps 192(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 192(%rdx,%rax,8)
vmovaps 224(%rsi,%rax,8), %ymm0
vxorps 224(%rdi,%rax,8), %ymm0, %ymm0
vmovaps %ymm0, 224(%rdx,%rax,8)
addq $32, %rax
cmpq $1024, %rax # imm = 0x400
jne .LBB0_1
# %bb.2: # %after_loop
vmovdqu (%rsi,%rax), %ymm0
vmovdqu 32(%rsi,%rax), %ymm1
vmovdqu (%r9,%rax), %ymm2
vmovdqu 32(%r9,%rax), %ymm3
vpand (%rdi,%rax), %ymm0, %ymm0
vpand 32(%rdi,%rax), %ymm1, %ymm1
vpand (%rdx,%rax), %ymm0, %ymm0
vpand 32(%rdx,%rax), %ymm1, %ymm1
vpand (%rcx,%rax), %ymm0, %ymm0
vpand 32(%rcx,%rax), %ymm1, %ymm1
vpxor (%r8,%rax), %ymm2, %ymm2
vpxor 32(%r8,%rax), %ymm3, %ymm3
vpor %ymm0, %ymm2, %ymm0
vpor %ymm1, %ymm3, %ymm1
vmovdqu %ymm0, (%r10,%rax)
vmovdqu %ymm1, 32(%r10,%rax)
addq $64, %rax
cmpq $8192, %rax # imm = 0x2000
jne .LBB0_4
jmp .LBB0_7
.LBB0_5: # %loop.preheader
xorl %eax, %eax
.p2align 4, 0x90
.LBB0_6: # %loop
# =>This Inner Loop Header: Depth=1
movq (%rsi,%rax), %rbp
movq (%r9,%rax), %rbx
andq (%rdi,%rax), %rbp
andq (%rdx,%rax), %rbp
andq (%rcx,%rax), %rbp
xorq (%r8,%rax), %rbx
orq %rbp, %rbx
movq %rbx, (%r10,%rax)
movq 8(%rsi,%rax), %rbp
movq 8(%r9,%rax), %rbx
andq 8(%rdi,%rax), %rbp
andq 8(%rdx,%rax), %rbp
andq 8(%rcx,%rax), %rbp
xorq 8(%r8,%rax), %rbx
orq %rbp, %rbx
movq %rbx, 8(%r10,%rax)
addq $16, %rax
cmpq $8192, %rax # imm = 0x2000
jne .LBB0_6
.LBB0_7: # %after_loop
popq %rbx
popq %r14
popq %rbp
vzeroupper
retq
.Lfunc_end0:
.size query, .Lfunc_end0-query
# -- End function
.section ".note.GNU-stack","",@progbits
```
```

0 comments on commit 3f08a61

Please sign in to comment.