Skip to content

Commit

Permalink
Add @simd support.
Browse files Browse the repository at this point in the history
This is a collective commit that supersedes the earlier commits for @simd.
  • Loading branch information
Arch D. Robison committed Mar 26, 2014
1 parent eaef4a1 commit ba1b05a
Show file tree
Hide file tree
Showing 17 changed files with 435 additions and 40 deletions.
3 changes: 2 additions & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1319,4 +1319,5 @@ export
@sprintf,
@deprecate,
@boundscheck,
@inbounds
@inbounds,
@simd
66 changes: 66 additions & 0 deletions base/simdloop.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Support for @simd for

module SimdLoop

export @simd

# Exception thrown while expanding @simd when the annotated expression is
# not a loop of the supported form (see parse_range, parse_iteration_space
# and compile). `msg` holds the human-readable reason.
type SimdError <: Exception
msg::ASCIIString
end

# Split a colon expression `low:high` into the tuple (low,high).
# Throws SimdError when x is not a colon expression or does not have
# exactly two operands.
function parse_range(x::Expr)
    x.head == :(:) || throw(SimdError("range must use : syntax"))
    length(x.args) == 2 || throw(SimdError("wrong number of args in range"))
    lo, hi = x.args
    return (lo, hi)
end

# Parse iteration space expression
#       symbol '=' range
#       symbol 'in' range
# and return the tuple (symbol,low,high), where low and high are the two
# operands of the colon range.
#
# Every malformed input is reported as a SimdError, including a loop
# variable that is not a plain symbol (e.g. a tuple pattern) and a range
# that is not an expression at all (e.g. `for i in r`), so users of @simd
# get a message about their loop rather than a TypeError/MethodError
# from the macro internals.
function parse_iteration_space(x::Expr)
    if x.head != :(=) && x.head != :(in)
        throw(SimdError("= or in expected"))
    elseif length(x.args) != 2
        throw(SimdError("simd syntax error"))
    end
    sym = x.args[1]
    isa(sym, Symbol) || throw(SimdError("loop variable must be a symbol"))
    rng = x.args[2]
    # parse_range only accepts Expr; a bare symbol or literal cannot be a
    # colon expression, so report it with the same message parse_range uses.
    isa(rng, Expr) || throw(SimdError("range must use : syntax"))
    (low, high) = parse_range(rng)
    return (sym, low, high)
end

# Report whether x contains a `continue` that would bind to the @simd loop
# itself. Inner for/while loops own their `continue` statements, and quoted
# code is data rather than part of the body, so neither is searched.
has_outer_continue(x) = false
function has_outer_continue(x::Expr)
    h = x.head
    if h == :continue
        return true
    elseif h == :for || h == :while || h == :quote
        return false
    end
    for a in x.args
        if has_outer_continue(a)
            return true
        end
    end
    return false
end

# Compile Expr x in context of @simd.
#
# x must be a `for` loop with a single iteration variable over low:high.
# The result is an equivalent `while` loop whose body ends with an
# Expr(:simdloop) marker. The caller (the @simd macro) escapes the result.
#
# Throws SimdError if x is not a 1D for loop, if its iteration space is
# malformed, or if the body contains a `continue` that targets this loop.
function compile(x::Expr)
    if x.head != :for
        throw(SimdError("for loop expected"))
    elseif length(x.args) != 2
        throw(SimdError("1D for loop expected"))
    end
    (var,low,high) = parse_iteration_space(x.args[1])
    body = x.args[2]
    # The generated loop increments the index *after* the body, so a
    # `continue` would jump back to the loop test without incrementing
    # and the loop would never terminate. Reject it at expansion time.
    if has_outer_continue(body)
        throw(SimdError("continue is not allowed inside a @simd loop body"))
    end
    tmp = gensym()
    loop = quote
        # NOTE(review): $high+1 is evaluated once up front; it would wrap if
        # high is the largest value of its integer type -- confirm acceptable.
        local $tmp = $high+1
        local $var = $low
        while $var < $tmp
            $body
            $var = $var+1
            $(Expr(:simdloop))  # Mark loop as SIMD loop
        end
    end
    return loop
end

# Rewrite the annotated `for` loop via compile and splice the result,
# escaped, into the caller's scope so the loop variable and body refer to
# the caller's names.
# Throws SimdError if the argument is not an expression (e.g. `@simd x`),
# instead of failing with a MethodError on compile(::Expr).
macro simd(forloop)
    isa(forloop, Expr) || throw(SimdError("for loop expected"))
    esc(compile(forloop))
end

end # simdloop
4 changes: 4 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,10 @@ using .I18n
using .Help
push!(I18n.CALLBACKS, Help.clear_cache)

# SIMD loops
include("simdloop.jl")
importall .SimdLoop

# sparse matrices and linear algebra
include("sparse.jl")
importall .SparseMatrix
Expand Down
41 changes: 41 additions & 0 deletions doc/manual/performance-tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,47 @@ These are some minor points that might help in tight inner loops.
- Use ``div(x,y)`` for truncating division of integers instead of
``trunc(x/y)``, and ``fld(x,y)`` instead of ``floor(x/y)``.

Performance Annotations
-----------------------

Sometimes you can enable better optimization by promising certain program
properties.

- Use ``@inbounds`` to eliminate array bounds checking within expressions.
Be certain before doing this. If the subscripts are ever out of bounds,
you may suffer crashes or silent corruption.
- Write ``@simd`` in front of ``for`` loops that are amenable to vectorization.

Here is an example with both forms of markup::

# Store the elementwise difference x[i]-y[i] into z and return the sum of
# the squared differences.
function tightloop( x, y, z )
# Accumulator for the sum; typed like the elements of z.
s = zero(eltype(z))
# Only visit indices 1:n that exist in all three arrays.
n = min(length(x),length(y),length(z))
@simd for i in 1:n
@inbounds begin
z[i] = x[i]-y[i]
s += z[i]*z[i]
end
end
s
end

The range for a ``@simd for`` loop must use the colon syntax, with two subexpressions.
A variable used for accumulating, such as ``s`` in the example, is called
a *reduction variable*. By using ``@simd``, you are asserting several
properties of the loop:

- It is safe to execute iterations in arbitrary or overlapping order,
with special consideration for reduction variables.
- Floating-point operations on reduction variables can be reordered,
possibly causing different results than without ``@simd``.
- No iteration ever waits on another iteration to make forward progress.

Using ``@simd`` merely gives the compiler license to vectorize. Whether
it actually does so depends on the compiler. The current implementation
will not vectorize if there are possible early exits from the loop, such
as from array bounds checking. This limitation may be lifted in the future.

Tools
-----

Expand Down
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ override CPPFLAGS += $(JCPPFLAGS)

SRCS = \
jltypes gf ast builtins module codegen interpreter \
alloc dlload sys init task array dump toplevel jl_uv jlapi profile
alloc dlload sys init task array dump toplevel jl_uv jlapi profile llvm-simdloop

FLAGS = \
-D_GNU_SOURCE \
Expand Down
1 change: 1 addition & 0 deletions src/Windows.mk
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ OBJECTS = \
toplevel.obj \
jl_uv.obj \
jlapi.obj \
llvm-simdloop.obj \
gc.obj

LIBFLISP = flisp\libflisp.lib
Expand Down
1 change: 1 addition & 0 deletions src/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ jl_sym_t *compositetype_sym; jl_sym_t *type_goto_sym;
jl_sym_t *global_sym; jl_sym_t *tuple_sym;
jl_sym_t *dot_sym; jl_sym_t *newvar_sym;
jl_sym_t *boundscheck_sym; jl_sym_t *copyast_sym;
jl_sym_t *simdloop_sym;

typedef struct {
int64_t a;
Expand Down
3 changes: 2 additions & 1 deletion src/ccall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,8 @@ static Value *julia_to_native(Type *ty, jl_value_t *jt, Value *jv,
return builder.CreateBitCast(emit_arrayptr(jv), ty);
}
if (aty == (jl_value_t*)jl_ascii_string_type || aty == (jl_value_t*)jl_utf8_string_type) {
return builder.CreateBitCast(emit_arrayptr(emit_nthptr(jv,1)), ty);
// FIXME - is tbaa_arrayptr correct here?
return builder.CreateBitCast(emit_arrayptr(emit_nthptr(jv,1,tbaa_arrayptr)), ty);
}
if (jl_is_structtype(aty) && jl_is_leaf_type(aty) && !jl_is_array_type(aty)) {
if (!addressOf) {
Expand Down
49 changes: 27 additions & 22 deletions src/cgutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,11 @@ static Value *mark_julia_type(Value *v, jl_value_t *jt)
return v;
}

// Attach the TBAA metadata node `md` to a load or store instruction.
// The same instruction is returned (as a Value*) so the call can wrap a
// builder.CreateLoad/CreateStore expression in place.
static Value *tbaa_decorate(MDNode* md, Instruction* load_or_store)
{
    Instruction *tagged = load_or_store;
    tagged->setMetadata(llvm::LLVMContext::MD_tbaa, md);
    return tagged;
}

// --- generating various error checks ---

static jl_value_t *llvm_type_to_julia(Type *t, bool err=true);
Expand Down Expand Up @@ -733,7 +738,7 @@ static void raise_exception_unless(Value *cond, Value *exc, jl_codectx_t *ctx)
static void raise_exception_unless(Value *cond, GlobalVariable *exc,
jl_codectx_t *ctx)
{
raise_exception_unless(cond, (Value*)builder.CreateLoad(exc, false), ctx);
raise_exception_unless(cond, (Value*)tbaa_decorate(tbaa_const,builder.CreateLoad(exc, false)), ctx);
}

static void raise_exception_if(Value *cond, Value *exc, jl_codectx_t *ctx)
Expand Down Expand Up @@ -839,11 +844,11 @@ static Value *emit_nthptr_addr(Value *v, Value *idx)
return builder.CreateGEP(builder.CreateBitCast(v, jl_ppvalue_llvmt), idx);
}

static Value *emit_nthptr(Value *v, size_t n)
static Value *emit_nthptr(Value *v, size_t n, MDNode *tbaa)
{
// p = (jl_value_t**)v; p[n]
Value *vptr = emit_nthptr_addr(v, n);
return builder.CreateLoad(vptr, false);
return tbaa_decorate(tbaa,builder.CreateLoad(vptr, false));
}

static Value *emit_nthptr(Value *v, Value *idx)
Expand All @@ -865,7 +870,7 @@ static Value *typed_load(Value *ptr, Value *idx_0based, jl_value_t *jltype,
data = builder.CreateBitCast(ptr, PointerType::get(elty, 0));
else
data = ptr;
Value *elt = builder.CreateLoad(builder.CreateGEP(data, idx_0based), false);
Value *elt = tbaa_decorate(tbaa_user, builder.CreateLoad(builder.CreateGEP(data, idx_0based), false));
if (elty == jl_pvalue_llvmt) {
null_pointer_check(elt, ctx);
}
Expand All @@ -891,7 +896,7 @@ static Value *typed_store(Value *ptr, Value *idx_0based, Value *rhs,
data = builder.CreateBitCast(ptr, PointerType::get(elty, 0));
else
data = ptr;
return builder.CreateStore(rhs, builder.CreateGEP(data, idx_0based));
return tbaa_decorate(tbaa_user, builder.CreateStore(rhs, builder.CreateGEP(data, idx_0based)));
}

// --- convert boolean value to julia ---
Expand Down Expand Up @@ -988,7 +993,7 @@ static Value *emit_tuplelen(Value *t,jl_value_t *jt)
return builder.CreateLShr(builder.CreatePtrToInt(lenbits, T_int64),
ConstantInt::get(T_int32, 52));
#else
Value *lenbits = emit_nthptr(t, 1);
Value *lenbits = emit_nthptr(t, 1, tbaa_tuplelen);
return builder.CreatePtrToInt(lenbits, T_size);
#endif
}
Expand Down Expand Up @@ -1120,20 +1125,20 @@ static Value *emit_tupleref(Value *tuple, Value *ival, jl_value_t *jt, jl_codect
Intrinsic::stacksave));
builder.Insert(stacksave);
Value *tempSpace = builder.CreateAlloca(at);
builder.CreateStore(tuple,tempSpace);
tbaa_decorate(tbaa_user, builder.CreateStore(tuple,tempSpace));
Value *idxs[2];
idxs[0] = ConstantInt::get(T_size,0);
idxs[1] = builder.CreateSub(ival,ConstantInt::get(T_size,1));
Value *v = builder.CreateGEP(tempSpace,ArrayRef<Value*>(&idxs[0],2));
if (idx) {
v = mark_julia_type(builder.CreateLoad(v), jl_tupleref(jt,ci));
v = mark_julia_type(tbaa_decorate(tbaa_user, builder.CreateLoad(v)), jl_tupleref(jt,ci));
}
else {
jl_add_linfo_root(ctx->linfo, jt);
Value *lty = emit_tupleref(literal_pointer_val(jt), ival, jl_typeof(jt), ctx);
size_t i, l = jl_tuple_len(jt);
if (is_tupletype_homogeneous((jl_tuple_t*)jt) && jl_isbits(jl_t0(jt))) {
v = mark_julia_type(builder.CreateLoad(v), jl_t0(jt));
v = mark_julia_type(tbaa_decorate(tbaa_user, builder.CreateLoad(v)), jl_t0(jt));
}
else {
for (i = 0; i < l; i++) {
Expand All @@ -1147,7 +1152,7 @@ static Value *emit_tupleref(Value *tuple, Value *ival, jl_value_t *jt, jl_codect
Value *nb = ConstantExpr::getSizeOf(at->getElementType());
if (sizeof(size_t)==4)
nb = builder.CreateTrunc(nb, T_int32);
v = allocate_box_dynamic(lty, nb, builder.CreateLoad(v));
v = allocate_box_dynamic(lty, nb, tbaa_decorate(tbaa_user, builder.CreateLoad(v)));
}
}
}
Expand Down Expand Up @@ -1254,7 +1259,7 @@ static Value *emit_arraylen_prim(Value *t, jl_value_t *ty)
{
#ifdef STORE_ARRAY_LEN
(void)ty;
Value *lenbits = emit_nthptr(t, 2);
Value *lenbits = emit_nthptr(t, 2, tbaa_arraylen);
return builder.CreatePtrToInt(lenbits, T_size);
#else
jl_value_t *p1 = jl_tparam1(ty);
Expand All @@ -1280,39 +1285,39 @@ static Value *emit_arraylen(Value *t, jl_value_t *ex, jl_codectx_t *ctx)
{
jl_arrayvar_t *av = arrayvar_for(ex, ctx);
if (av!=NULL)
return builder.CreateLoad(av->len);
return tbaa_decorate(tbaa_arraylen, builder.CreateLoad(av->len));
return emit_arraylen_prim(t, expr_type(ex,ctx));
}

static Value *emit_arrayptr(Value *t)
{
return emit_nthptr(t, 1);
return emit_nthptr(t, 1, tbaa_arrayptr);
}

static Value *emit_arrayptr(Value *t, jl_value_t *ex, jl_codectx_t *ctx)
{
jl_arrayvar_t *av = arrayvar_for(ex, ctx);
if (av!=NULL)
return builder.CreateLoad(av->dataptr);
return tbaa_decorate(tbaa_arrayptr, builder.CreateLoad(av->dataptr));
return emit_arrayptr(t);
}

static Value *emit_arraysize(Value *t, jl_value_t *ex, int dim, jl_codectx_t *ctx)
{
jl_arrayvar_t *av = arrayvar_for(ex, ctx);
if (av != NULL && dim <= (int)av->sizes.size())
return builder.CreateLoad(av->sizes[dim-1]);
return tbaa_decorate(tbaa_arraysize, builder.CreateLoad(av->sizes[dim-1]));
return emit_arraysize(t, dim);
}

static void assign_arrayvar(jl_arrayvar_t &av, Value *ar)
{
builder.CreateStore(builder.CreateBitCast(emit_arrayptr(ar),
av.dataptr->getType()->getContainedType(0)),
av.dataptr);
builder.CreateStore(emit_arraylen_prim(ar, av.ty), av.len);
tbaa_decorate(tbaa_arrayptr,builder.CreateStore(builder.CreateBitCast(emit_arrayptr(ar),
av.dataptr->getType()->getContainedType(0)),
av.dataptr));
tbaa_decorate(tbaa_arraylen,builder.CreateStore(emit_arraylen_prim(ar, av.ty), av.len));
for(size_t i=0; i < av.sizes.size(); i++)
builder.CreateStore(emit_arraysize(ar,i+1), av.sizes[i]);
tbaa_decorate(tbaa_user, builder.CreateStore(emit_arraysize(ar,i+1), av.sizes[i]));
}

static Value *data_pointer(Value *x)
Expand Down Expand Up @@ -1363,7 +1368,7 @@ static Value *emit_array_nd_index(Value *a, jl_value_t *ex, size_t nd, jl_value_

ctx->f->getBasicBlockList().push_back(failBB);
builder.SetInsertPoint(failBB);
builder.CreateCall2(prepare_call(jlthrow_line_func), builder.CreateLoad(prepare_global(jlboundserr_var)),
builder.CreateCall2(prepare_call(jlthrow_line_func), tbaa_decorate(tbaa_const,builder.CreateLoad(prepare_global(jlboundserr_var))),
ConstantInt::get(T_int32, ctx->lineno));
builder.CreateUnreachable();

Expand Down Expand Up @@ -1585,7 +1590,7 @@ static void emit_cpointercheck(Value *x, const std::string &msg,
emit_typecheck(t, (jl_value_t*)jl_datatype_type, msg, ctx);

Value *istype =
builder.CreateICmpEQ(emit_nthptr(t, offsetof(jl_datatype_t,name)/sizeof(char*)),
builder.CreateICmpEQ(emit_nthptr(t, offsetof(jl_datatype_t,name)/sizeof(char*), tbaa_datatype),
literal_pointer_val((jl_value_t*)jl_pointer_type->name));
BasicBlock *failBB = BasicBlock::Create(getGlobalContext(),"fail",ctx->f);
BasicBlock *passBB = BasicBlock::Create(getGlobalContext(),"pass");
Expand Down
Loading

0 comments on commit ba1b05a

Please sign in to comment.