Skip to content

Commit

Permalink
Optimize sysimg TLS access on Linux
Browse files Browse the repository at this point in the history
  • Loading branch information
yuyichao committed May 11, 2017
1 parent 3603adc commit 0c89c16
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 79 deletions.
12 changes: 7 additions & 5 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,8 @@ static Function *jltls_states_func;
static GlobalVariable *jltls_states_var;
#else
// Imaging mode only
static GlobalVariable *jltls_states_func_ptr = NULL;
size_t jltls_states_func_idx = 0;
size_t jltls_offset_idx = 0;
#endif

// important functions
Expand Down Expand Up @@ -6621,10 +6621,12 @@ static void init_julia_llvm_env(Module *m)
add_named_global(jltls_states_func, jl_get_ptls_states_getter());
if (imaging_mode) {
PointerType *pfunctype = jltls_states_func->getFunctionType()->getPointerTo();
jltls_states_func_ptr =
jl_emit_sysimg_slot(m, pfunctype, "jl_get_ptls_states.ptr",
(uintptr_t)jl_get_ptls_states_getter(),
jltls_states_func_idx);
jl_emit_sysimg_slot(m, pfunctype, "jl_get_ptls_states.ptr",
(uintptr_t)jl_get_ptls_states_getter(),
jltls_states_func_idx);
jl_emit_sysimg_slot(m, T_size, "jl_tls_offset.val",
(uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset),
jltls_offset_idx);
}
#endif

Expand Down
4 changes: 4 additions & 0 deletions src/dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ static void jl_load_sysimg_so(void)
"jl_ptls_states_getter_idx");
*sysimg_gvars[tls_getter_idx - 1] =
(jl_value_t*)jl_get_ptls_states_getter();
size_t tls_offset_idx = *(size_t*)jl_dlsym(jl_sysimg_handle,
"jl_tls_offset_idx");
*sysimg_gvars[tls_offset_idx - 1] =
(jl_value_t*)(uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset);
#endif
const char *cpu_target = (const char*)jl_dlsym(jl_sysimg_handle, "jl_sysimg_cpu_target");
if (strcmp(cpu_target,jl_options.cpu_target) != 0)
Expand Down
10 changes: 8 additions & 2 deletions src/jitlayers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,15 +1019,15 @@ void* jl_emit_and_add_to_shadow(GlobalVariable *gv, void *gvarinit)
// Use as an optimization for runtime constant addresses to have one less
// load. (Used only by threading).
GlobalVariable *jl_emit_sysimg_slot(Module *m, Type *typ, const char *name,
uintptr_t init, size_t &idx)
uintptr_t init, size_t &idx)
{
assert(imaging_mode);
// This is **NOT** a external variable or a normal global variable
// This is a special internal global slot with a special index
// in the global variable table.
GlobalVariable *gv = new GlobalVariable(*m, typ, false,
GlobalVariable::InternalLinkage,
ConstantPointerNull::get((PointerType*)typ), name);
Constant::getNullValue(typ), name);
addComdat(gv);
// make the pointer valid for this session
#if defined(USE_MCJIT) || defined(USE_ORCJIT)
Expand Down Expand Up @@ -1111,6 +1111,12 @@ static void jl_gen_llvm_globaldata(llvm::Module *mod, ValueToValueMapTy &VMap,
GlobalVariable::ExternalLinkage,
ConstantInt::get(T_size, jltls_states_func_idx),
"jl_ptls_states_getter_idx"));
addComdat(new GlobalVariable(*mod,
T_size,
true,
GlobalVariable::ExternalLinkage,
ConstantInt::get(T_size, jltls_offset_idx),
"jl_tls_offset_idx"));
#endif

Constant *feature_string = ConstantDataArray::getString(jl_LLVMContext, jl_options.cpu_target);
Expand Down
3 changes: 2 additions & 1 deletion src/jitlayers.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ extern Function *juliapersonality_func;

#ifdef JULIA_ENABLE_THREADING
extern size_t jltls_states_func_idx;
extern size_t jltls_offset_idx;
#endif

typedef struct {Value *gv; int32_t index;} jl_value_llvm; // uses 1-based indexing
Expand All @@ -66,7 +67,7 @@ void addOptimizationPasses(PassManager *PM);
#endif
void* jl_emit_and_add_to_shadow(GlobalVariable *gv, void *gvarinit = NULL);
GlobalVariable *jl_emit_sysimg_slot(Module *m, Type *typ, const char *name,
uintptr_t init, size_t &idx);
uintptr_t init, size_t &idx);
void* jl_get_global(GlobalVariable *gv);
GlobalVariable *jl_get_global_for(const char *cname, void *addr, Module *M);
void jl_add_to_shadow(Module *m);
Expand Down
1 change: 1 addition & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ void _julia_init(JL_IMAGE_SEARCH rel);
void jl_set_base_ctx(char *__stk);

extern ssize_t jl_tls_offset;
extern const int jl_tls_elf_support;
void jl_init_threading(void);
void jl_start_threads(void);
void jl_shutdown_threading(void);
Expand Down
192 changes: 121 additions & 71 deletions src/llvm-ptls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#if JL_LLVM_VERSION >= 30700 && defined(JULIA_ENABLE_THREADING)
# include <llvm/IR/InlineAsm.h>
# include <llvm/Transforms/Utils/BasicBlockUtils.h>
#endif
#include "fix_llvm_assert.h"

Expand Down Expand Up @@ -74,6 +75,84 @@ static void ensure_global(const char *name, Type *t, Module &M,
#endif // _OS_WINDOWS_
}

#ifdef JULIA_ENABLE_THREADING
// Mark a call to the ptls-states getter as `readnone` + `nounwind` so LLVM
// may freely CSE/hoist the call (the getter is a pure address computation).
static void setCallPtlsAttrs(CallInst *ptlsStates)
{
    // LLVM 5.0 renamed AttributeSet to AttributeList; the function-index
    // sentinel moved with it, but the semantics are the same.
#if JL_LLVM_VERSION >= 50000
    const auto fn_idx = AttributeList::FunctionIndex;
#else
    const auto fn_idx = AttributeSet::FunctionIndex;
#endif
    for (auto kind : {Attribute::ReadNone, Attribute::NoUnwind})
        ptlsStates->addAttribute(fn_idx, kind);
}

#if JL_LLVM_VERSION >= 30700
// Emit IR that computes the per-thread `ptls` pointer directly from the
// architecture thread pointer, bypassing the runtime getter call.
//
// offset:       runtime TLS offset value (an i8 byte count loaded from the
//               sysimg slot); when NULL, the compile-time `jl_tls_offset`
//               constant is baked into the emitted code instead.
// T_ppjlvalue:  pointee type for the returned ptls pointer.
// insertBefore: all emitted instructions are inserted before this one.
// Returns the `ptls` pointer bitcast to `T_ppjlvalue*`.
static Instruction *emit_ptls_tp(LLVMContext &ctx, Value *offset, Type *T_ppjlvalue,
                                 Instruction *insertBefore)
{
    auto T_int8 = Type::getInt8Ty(ctx);
    auto T_pint8 = PointerType::get(T_int8, 0);
# if defined(_CPU_X86_64_) || defined(_CPU_X86_)
    // Workaround LLVM bug by hiding the offset computation
    // (and therefore the optimization opportunity) from LLVM.
    // Ref https://github.com/JuliaLang/julia/issues/17288
    // NOTE(review): function-local static — the asm string captures the value
    // of `jl_tls_offset` at first execution and is never re-evaluated.
    static const std::string const_asm_str = [&] () {
        std::stringstream stm;
# if defined(_CPU_X86_64_)
        stm << "movq %fs:0, $0;\naddq $$" << jl_tls_offset << ", $0";
# else
        stm << "movl %gs:0, $0;\naddl $$" << jl_tls_offset << ", $0";
# endif
        return stm.str();
    }();
    // Same sequence but taking the offset as an asm input operand ($1)
    // instead of an immediate, for the imaging-mode runtime-offset path.
# if defined(_CPU_X86_64_)
    const char *dyn_asm_str = "movq %fs:0, $0;\naddq $1, $0";
# else
    const char *dyn_asm_str = "movl %gs:0, $0;\naddl $1, $0";
# endif

    // The add instruction clobbers flags
    Value *tls;
    if (offset) {
        // Dynamic offset: one input operand; "=&r" early-clobbers the output
        // so it cannot alias the input register.
        std::vector<Type*> args(0);
        args.push_back(offset->getType());
        auto tp = InlineAsm::get(FunctionType::get(T_pint8, args, false),
                                 dyn_asm_str, "=&r,r,~{dirflag},~{fpsr},~{flags}", false);
        tls = CallInst::Create(tp, offset, "ptls_i8", insertBefore);
    }
    else {
        // Constant offset: no inputs, offset is an immediate in the string.
        auto tp = InlineAsm::get(FunctionType::get(T_pint8, false),
                                 const_asm_str.c_str(), "=r,~{dirflag},~{fpsr},~{flags}", false);
        tls = CallInst::Create(tp, "ptls_i8", insertBefore);
    }
    return new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0), "ptls", insertBefore);
# elif defined(_CPU_AARCH64_)
    // AArch64 doesn't seem to have this issue.
    // (Possibly because there are many more registers and the offset is
    // positive and small)
    // It's also harder to emit the offset in a generic way on AArch64
    // (need to generate one or two `add` with shift) so let llvm emit
    // the add for now.
    const char *asm_str = "mrs $0, tpidr_el0";
    if (!offset) {
        // No runtime offset supplied: materialize the compile-time constant.
        auto T_size = (sizeof(size_t) == 8 ? Type::getInt64Ty(ctx) : Type::getInt32Ty(ctx));
        offset = ConstantInt::getSigned(T_size, jl_tls_offset);
    }
    // Read the thread pointer, then let LLVM emit the add via a byte-wise GEP.
    auto tp = InlineAsm::get(FunctionType::get(T_pint8, false), asm_str, "=r", false);
    Value *tls = CallInst::Create(tp, "thread_ptr", insertBefore);
    tls = GetElementPtrInst::Create(T_int8, tls, {offset}, "ptls_i8", insertBefore);
    return new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0), "ptls", insertBefore);
# else
    (void)T_pint8;
    assert(0 && "Cannot emit thread pointer for this architecture.");
    return nullptr;
# endif
}
#endif

#endif

void LowerPTLS::runOnFunction(LLVMContext &ctx, Module &M, Function *F,
Function *ptls_getter, Type *T_ppjlvalue, MDNode *tbaa_const)
{
Expand All @@ -99,85 +178,53 @@ void LowerPTLS::runOnFunction(LLVMContext &ctx, Module &M, Function *F,
if (imaging_mode) {
GlobalVariable *GV = cast<GlobalVariable>(
M.getNamedValue("jl_get_ptls_states.ptr"));
LoadInst *getter = new LoadInst(GV, "", ptlsStates);
#if JL_LLVM_VERSION >= 30700
if (jl_tls_elf_support) {
GlobalVariable *OffsetGV = cast<GlobalVariable>(
M.getNamedValue("jl_tls_offset.val"));
// if (offset != 0)
// ptls = tp + offset;
// else
// ptls = getter();
auto offset = new LoadInst(OffsetGV, "", ptlsStates);
offset->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
auto cmp = new ICmpInst(ptlsStates, CmpInst::ICMP_NE, offset,
Constant::getNullValue(offset->getType()));
MDBuilder MDB(ctx);
SmallVector<uint32_t, 2> Weights{9, 1};
TerminatorInst *fastTerm;
TerminatorInst *slowTerm;
SplitBlockAndInsertIfThenElse(cmp, ptlsStates, &fastTerm, &slowTerm,
MDB.createBranchWeights(Weights));

auto fastTLS = emit_ptls_tp(ctx, offset, T_ppjlvalue, fastTerm);
auto phi = PHINode::Create(PointerType::get(T_ppjlvalue, 0), 2, "", ptlsStates);
ptlsStates->replaceAllUsesWith(phi);
ptlsStates->moveBefore(slowTerm);
auto getter = new LoadInst(GV, "", ptlsStates);
getter->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
ptlsStates->setCalledFunction(getter);
setCallPtlsAttrs(ptlsStates);

phi->addIncoming(fastTLS, fastTLS->getParent());
phi->addIncoming(ptlsStates, ptlsStates->getParent());

return;
}
#endif
auto getter = new LoadInst(GV, "", ptlsStates);
getter->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
ptlsStates->setCalledFunction(getter);
#if JL_LLVM_VERSION >= 50000
ptlsStates->addAttribute(AttributeList::FunctionIndex,
Attribute::ReadNone);
ptlsStates->addAttribute(AttributeList::FunctionIndex,
Attribute::NoUnwind);
#else
ptlsStates->addAttribute(AttributeSet::FunctionIndex,
Attribute::ReadNone);
ptlsStates->addAttribute(AttributeSet::FunctionIndex,
Attribute::NoUnwind);
#endif
setCallPtlsAttrs(ptlsStates);
}
#if JL_LLVM_VERSION >= 30700
else if (jl_tls_offset != -1) {
auto T_int8 = Type::getInt8Ty(ctx);
auto T_pint8 = PointerType::get(T_int8, 0);
// Replace the function call with inline assembly if we know
// how to generate it.
# if defined(_CPU_X86_64_) || defined(_CPU_X86_)
// Workaround LLVM bug by hiding the offset computation
// (and therefore the optimization opportunity) from LLVM.
static const std::string asm_str = [&] () {
std::stringstream stm;
# if defined(_CPU_X86_64_)
stm << "movq %fs:0, $0;\naddq $$" << jl_tls_offset << ", $0";
# else
stm << "movl %gs:0, $0;\naddl $$" << jl_tls_offset << ", $0";
# endif
return stm.str();
}();
// The add instruction clobbers flags
auto tp = InlineAsm::get(FunctionType::get(T_pint8, false),
asm_str.c_str(),
"=r,~{dirflag},~{fpsr},~{flags}", false);
Value *tls = CallInst::Create(tp, "ptls_i8", ptlsStates);
tls = new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0),
"ptls", ptlsStates);
# elif defined(_CPU_AARCH64_)
// AArch64 doesn't seem to have this issue.
// (Possibly because there are many more registers and the offset is
// positive and small)
// It's also harder to emit the offset in a generic way on AArch64
// (need to generate one or two `add` with shift) so let llvm emit
// the add for now.
auto T_size = (sizeof(size_t) == 8 ? Type::getInt64Ty(ctx) :
Type::getInt32Ty(ctx));
const char *asm_str = "mrs $0, tpidr_el0";
auto offset = ConstantInt::getSigned(T_size, jl_tls_offset);
auto tp = InlineAsm::get(FunctionType::get(T_pint8, false),
asm_str, "=r", false);
Value *tls = CallInst::Create(tp, "thread_ptr", ptlsStates);
tls = GetElementPtrInst::Create(T_int8, tls, {offset},
"ptls_i8", ptlsStates);
tls = new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0),
"ptls", ptlsStates);
# else
Value *tls = nullptr;
assert(0 && "Cannot emit thread pointer for this architecture.");
# endif
(void)T_pint8;
ptlsStates->replaceAllUsesWith(tls);
ptlsStates->replaceAllUsesWith(emit_ptls_tp(ctx, nullptr, T_ppjlvalue, ptlsStates));
ptlsStates->eraseFromParent();
}
#endif
else {
#if JL_LLVM_VERSION >= 50000
ptlsStates->addAttribute(AttributeList::FunctionIndex,
Attribute::ReadNone);
ptlsStates->addAttribute(AttributeList::FunctionIndex,
Attribute::NoUnwind);
#else
ptlsStates->addAttribute(AttributeSet::FunctionIndex,
Attribute::ReadNone);
ptlsStates->addAttribute(AttributeSet::FunctionIndex,
Attribute::NoUnwind);
#endif
setCallPtlsAttrs(ptlsStates);
}
#else
ptlsStates->replaceAllUsesWith(M.getNamedValue("jl_tls_states"));
Expand All @@ -197,8 +244,11 @@ bool LowerPTLS::runOnModule(Module &M)
auto T_ppjlvalue =
cast<PointerType>(functype->getReturnType())->getElementType();
#ifdef JULIA_ENABLE_THREADING
if (imaging_mode)
if (imaging_mode) {
ensure_global("jl_get_ptls_states.ptr", functype->getPointerTo(), M);
ensure_global("jl_tls_offset.val",
sizeof(size_t) == 8 ? Type::getInt64Ty(ctx) : Type::getInt32Ty(ctx), M);
}
#else
ensure_global("jl_tls_states", T_ppjlvalue, M, imaging_mode);
#endif
Expand Down
3 changes: 3 additions & 0 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ void ti_reset_timings(void);
ssize_t jl_tls_offset = -1;

#ifdef JL_ELF_TLS_VARIANT
const int jl_tls_elf_support = 1;
// Optimize TLS access in codegen if the TLS buffer is using a IE or LE model.
// To detect such case, we find the size of the TLS segment in the main
// executable and the TIB pointer and then see if the TLS pointer on the
Expand Down Expand Up @@ -540,6 +541,8 @@ static void jl_check_tls(void)
return;
jl_tls_offset = offset;
}
#else
const int jl_tls_elf_support = 0;
#endif

// interface to Julia; sets up to make the runtime thread-safe
Expand Down

0 comments on commit 0c89c16

Please sign in to comment.