From 6ab1862106bc7f48afa54bac792cb7909df35cd7 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Thu, 5 Jan 2023 22:17:52 -0500 Subject: [PATCH] Table-based dlsym --- src/aotcompile.cpp | 112 ++++++++++++++++++++++++++++++++--- src/llvm-multiversioning.cpp | 68 ++++++++++----------- src/llvm-ptls.cpp | 19 +----- src/processor.cpp | 72 ++++++++++++---------- src/processor.h | 32 ++++++++++ 5 files changed, 214 insertions(+), 89 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 527b793f142c8..5873c1ca56477 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -424,7 +424,8 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm //Safe b/c context is locked by params GlobalVariable *G = cast(clone.getModuleUnlocked()->getNamedValue(global)); G->setInitializer(ConstantPointerNull::get(cast(G->getValueType()))); - G->setLinkage(GlobalVariable::InternalLinkage); + G->setLinkage(GlobalValue::ExternalLinkage); + G->setVisibility(GlobalValue::HiddenVisibility); data->jl_sysimg_gvars.push_back(G); } CreateNativeGlobals += gvars.size(); @@ -446,9 +447,9 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm //Safe b/c context is locked by params for (GlobalObject &G : clone.getModuleUnlocked()->global_objects()) { if (!G.isDeclaration()) { - G.setLinkage(Function::InternalLinkage); + G.setLinkage(GlobalValue::ExternalLinkage); + G.setVisibility(GlobalValue::HiddenVisibility); makeSafeName(G); - addComdat(&G); #if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_) // Add unwind exception personalities to functions to handle async exceptions if (Function *F = dyn_cast(&G)) @@ -514,6 +515,63 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT void multiversioning_preannotate(Module &M); +static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, unsigned threads) { + SmallVector tables(sizeof(jl_image_shard_t) / sizeof(void *) * threads); + for (unsigned i = 0; i < threads; i++) { + auto suffix = "_" + std::to_string(i); + auto create_gv = [&](StringRef name, bool constant) { + auto gv = new GlobalVariable(M, T_size, constant, + GlobalValue::ExternalLinkage, nullptr, name + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); + return gv; + }; + auto table = tables.data() + i * sizeof(jl_image_shard_t) / sizeof(void *); + table[offsetof(jl_image_shard_t, fvar_base) / sizeof(void*)] = create_gv("jl_fvar_base", false); + table[offsetof(jl_image_shard_t, fvar_offsets) / sizeof(void*)] = create_gv("jl_fvar_offsets", true); + table[offsetof(jl_image_shard_t, fvar_idxs) / sizeof(void*)] = create_gv("jl_fvar_idxs", true); + table[offsetof(jl_image_shard_t, gvar_base) / sizeof(void*)] = create_gv("jl_gvar_base", false); + table[offsetof(jl_image_shard_t, gvar_offsets) / sizeof(void*)] = create_gv("jl_gvar_offsets", true); + table[offsetof(jl_image_shard_t, gvar_idxs) / sizeof(void*)] = create_gv("jl_gvar_idxs", true); + table[offsetof(jl_image_shard_t, clone_slots) / sizeof(void*)] = create_gv("jl_clone_slots", true); + table[offsetof(jl_image_shard_t, clone_offsets) / sizeof(void*)] = create_gv("jl_clone_offsets", true); + table[offsetof(jl_image_shard_t, clone_idxs) / sizeof(void*)] = create_gv("jl_clone_idxs", true); + } + auto tables_arr = ConstantArray::get(ArrayType::get(T_psize, tables.size()), tables); + auto tables_gv = new GlobalVariable(M, tables_arr->getType(), false, + GlobalValue::ExternalLinkage, tables_arr, "jl_shard_tables"); + tables_gv->setVisibility(GlobalValue::HiddenVisibility); + return tables_gv; +} + +static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) { + std::array ptls_table{ + new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_func_slot"), + new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_key_slot"), + new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_tls_offset"), + }; + for (auto &gv : ptls_table) + cast(gv)->setVisibility(GlobalValue::HiddenVisibility); + auto ptls_table_arr = ConstantArray::get(ArrayType::get(T_psize, ptls_table.size()), ptls_table); + auto ptls_table_gv = new GlobalVariable(M, ptls_table_arr->getType(), false, + GlobalValue::ExternalLinkage, ptls_table_arr, "jl_ptls_table"); + ptls_table_gv->setVisibility(GlobalValue::HiddenVisibility); + return ptls_table_gv; +} + +static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) { + constexpr uint32_t version = 1; + std::array header{ + version, + threads, + nfvars, + ngvars, + }; + auto header_arr = ConstantDataArray::get(M.getContext(), header); + auto header_gv = new GlobalVariable(M, header_arr->getType(), false, + GlobalValue::InternalLinkage, header_arr, "jl_image_header"); + return header_gv; +} + // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup extern "C" JL_DLLEXPORT @@ -588,6 +646,10 @@ void jl_dump_native_impl(void *native_code, start = jl_hrtime(); + unsigned threads = 1; + unsigned nfvars = 0; + unsigned ngvars = 0; + // add metadata information if (imaging_mode) { multiversioning_preannotate(*dataM); @@ -601,8 +663,27 @@ void jl_dump_native_impl(void *native_code, } } } - emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_sysimg_gvars", T_psize); - emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_sysimg_fvars", T_psize); + nfvars = data->jl_sysimg_fvars.size(); + ngvars = data->jl_sysimg_gvars.size(); + emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); + emit_offset_table(*dataM, data->jl_sysimg_fvars, "jl_fvars", T_psize); + std::vector idxs; + idxs.resize(data->jl_sysimg_gvars.size()); + std::iota(idxs.begin(), idxs.end(), 0); + auto gidxs = ConstantDataArray::get(Context, idxs); + auto gidxs_var = new GlobalVariable(*dataM, gidxs->getType(), true, + GlobalVariable::ExternalLinkage, + gidxs, "jl_gvar_idxs"); + gidxs_var->setVisibility(GlobalValue::HiddenVisibility); + idxs.clear(); + idxs.resize(data->jl_sysimg_fvars.size()); + std::iota(idxs.begin(), idxs.end(), 0); + auto fidxs = ConstantDataArray::get(Context, idxs); + auto fidxs_var = new GlobalVariable(*dataM, fidxs->getType(), true, + GlobalVariable::ExternalLinkage, + fidxs, "jl_fvar_idxs"); + fidxs_var->setVisibility(GlobalValue::HiddenVisibility); + dataM->addModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(Context, "_0")); // reflect the address of the jl_RTLD_DEFAULT_handle variable // back to the caller, so that we can check for consistency issues @@ -789,10 +870,23 @@ void jl_dump_native_impl(void *native_code, data.insert(data.end(), specdata.begin(), specdata.end()); } auto value = ConstantDataArray::get(Context, data); - addComdat(new GlobalVariable(*sysimageM, value->getType(), true, - GlobalVariable::ExternalLinkage, - value, "jl_dispatch_target_ids")); - + auto target_ids = new GlobalVariable(*sysimageM, value->getType(), true, + GlobalVariable::InternalLinkage, + value, "jl_dispatch_target_ids"); + auto shards = emit_shard_table(*sysimageM, T_size, T_psize, threads); + auto ptls = emit_ptls_table(*sysimageM, T_size, T_psize); + auto header = emit_image_header(*sysimageM, threads, nfvars, ngvars); + auto AT = ArrayType::get(T_psize, 4); + auto pointers = new GlobalVariable(*sysimageM, AT, false, + GlobalVariable::ExternalLinkage, + ConstantArray::get(AT, { + ConstantExpr::getBitCast(header, T_psize), + ConstantExpr::getBitCast(shards, T_psize), + ConstantExpr::getBitCast(ptls, T_psize), + ConstantExpr::getBitCast(target_ids, T_psize) + }), + "jl_image_pointers"); + addComdat(pointers); if (s) { write_int32(s, data.size()); ios_write(s, (const char *)data.data(), data.size()); diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 1a1dc297b2702..44c83502e0537 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -516,8 +516,8 @@ static inline std::vector consume_gv(Module &M, const char *name, bool allow CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) : tbaa_const(tbaa_make_child_with_context(M.getContext(), "jtbaa_const", nullptr, true).first), specs(jl_get_llvm_clone_targets()), - fvars(consume_gv(M, "jl_sysimg_fvars", allow_bad_fvars)), - gvars(consume_gv(M, "jl_sysimg_gvars", false)), + fvars(consume_gv(M, "jl_fvars", allow_bad_fvars)), + gvars(consume_gv(M, "jl_gvars", false)), M(M), allow_bad_fvars(allow_bad_fvars) { @@ -547,7 +547,7 @@ CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) for (uint32_t i = 0; i < nfvars; i++) func_ids[fvars[i]] = i + 1; for (auto &F: M) { - if (F.empty()) + if (F.empty() && !F.hasFnAttribute("julia.mv.clones")) continue; orig_funcs.push_back(&F); } @@ -898,19 +898,6 @@ void CloneCtx::fix_inst_uses() } } -template -static inline T *add_comdat(T *G) -{ -#if defined(_OS_WINDOWS_) - // add __declspec(dllexport) to everything marked for export - if (G->getLinkage() == GlobalValue::ExternalLinkage) - G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); - else - G->setDLLStorageClass(GlobalValue::DefaultStorageClass); -#endif - return G; -} - static Constant *get_ptrdiff32(Constant *ptr, Constant *base) { if (ptr->getType()->isPointerTy()) @@ -920,7 +907,7 @@ static Constant *get_ptrdiff32(Constant *ptr, Constant *base) } template -static Constant *emit_offset_table(Module &M, const std::vector &vars, StringRef name) +static Constant *emit_offset_table(Module &M, const std::vector &vars, StringRef name, StringRef suffix) { auto T_int32 = Type::getInt32Ty(M.getContext()); auto T_size = getSizeTy(M.getContext()); @@ -928,11 +915,14 @@ static Constant *emit_offset_table(Module &M, const std::vector &vars, Strin Constant *base = nullptr; if (nvars > 0) { base = ConstantExpr::getBitCast(vars[0], T_size->getPointerTo()); - add_comdat(GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage, - name + "_base", - base, &M)); + auto ga = GlobalAlias::create(T_size, 0, GlobalVariable::ExternalLinkage, + name + "_base" + suffix, + base, &M); + ga->setVisibility(GlobalValue::HiddenVisibility); } else { - base = add_comdat(new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base")); + auto gv = new GlobalVariable(M, T_size, true, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), name + "_base" + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); + base = gv; } auto vbase = ConstantExpr::getPtrToInt(base, T_size); std::vector offsets(nvars + 1); @@ -943,10 +933,11 @@ static Constant *emit_offset_table(Module &M, const std::vector &vars, Strin offsets[i + 1] = get_ptrdiff32(vars[i], vbase); } ArrayType *vars_type = ArrayType::get(T_int32, nvars + 1); - add_comdat(new GlobalVariable(M, vars_type, true, + auto gv = new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, ConstantArray::get(vars_type, offsets), - name + "_offsets")); + name + "_offsets" + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); return vbase; } @@ -958,9 +949,17 @@ void CloneCtx::emit_metadata() return; } + StringRef suffix; + if (auto suffix_md = M.getModuleFlag("julia.mv.suffix")) { + suffix = cast(suffix_md)->getString(); + } + // Store back the information about exported functions. - auto fbase = emit_offset_table(M, fvars, "jl_sysimg_fvars"); - auto gbase = emit_offset_table(M, gvars, "jl_sysimg_gvars"); + auto fbase = emit_offset_table(M, fvars, "jl_fvar", suffix); + auto gbase = emit_offset_table(M, gvars, "jl_gvar", suffix); + + M.getGlobalVariable("jl_fvar_idxs")->setName("jl_fvar_idxs" + suffix); + M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs" + suffix); uint32_t ntargets = specs.size(); @@ -996,9 +995,10 @@ void CloneCtx::emit_metadata() } values[0] = ConstantInt::get(T_int32, values.size() / 2); ArrayType *vars_type = ArrayType::get(T_int32, values.size()); - add_comdat(new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, + auto gv = new GlobalVariable(M, vars_type, true, GlobalVariable::ExternalLinkage, ConstantArray::get(vars_type, values), - "jl_dispatch_reloc_slots")); + "jl_clone_slots" + suffix); + gv->setVisibility(GlobalValue::HiddenVisibility); } // Generate `jl_dispatch_fvars_idxs` and `jl_dispatch_fvars_offsets` @@ -1046,14 +1046,16 @@ void CloneCtx::emit_metadata() idxs[len_idx] = count; } auto idxval = ConstantDataArray::get(M.getContext(), idxs); - add_comdat(new GlobalVariable(M, idxval->getType(), true, + auto gv1 = new GlobalVariable(M, idxval->getType(), true, GlobalVariable::ExternalLinkage, - idxval, "jl_dispatch_fvars_idxs")); + idxval, "jl_clone_idxs" + suffix); + gv1->setVisibility(GlobalValue::HiddenVisibility); ArrayType *offsets_type = ArrayType::get(Type::getInt32Ty(M.getContext()), offsets.size()); - add_comdat(new GlobalVariable(M, offsets_type, true, + auto gv2 = new GlobalVariable(M, offsets_type, true, GlobalVariable::ExternalLinkage, ConstantArray::get(offsets_type, offsets), - "jl_dispatch_fvars_offsets")); + "jl_clone_offsets" + suffix); + gv2->setVisibility(GlobalValue::HiddenVisibility); } } @@ -1070,8 +1072,8 @@ static bool runMultiVersioning(Module &M, bool allow_bad_fvars) if (M.getName() == "sysimage") return false; - GlobalVariable *fvars = M.getGlobalVariable("jl_sysimg_fvars"); - GlobalVariable *gvars = M.getGlobalVariable("jl_sysimg_gvars"); + GlobalVariable *fvars = M.getGlobalVariable("jl_fvars"); + GlobalVariable *gvars = M.getGlobalVariable("jl_gvars"); if (allow_bad_fvars && (!fvars || !fvars->hasInitializer() || !isa(fvars->getInitializer()) || !gvars || !gvars->hasInitializer() || !isa(gvars->getInitializer()))) return false; diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index ea92e1709c597..e49b992ded50f 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -140,26 +140,11 @@ GlobalVariable *LowerPTLS::create_aliased_global(Type *T, StringRef name) const // the address is visible externally but LLVM can still assume that the // address of this variable doesn't need dynamic relocation // (can be accessed with a single PC-rel load). - auto GV = new GlobalVariable(*M, T, false, GlobalVariable::InternalLinkage, - Constant::getNullValue(T), name + ".real"); - add_comdat(GlobalAlias::create(T, 0, GlobalVariable::ExternalLinkage, - name, GV, M)); + auto GV = new GlobalVariable(*M, T, false, GlobalVariable::ExternalLinkage, + nullptr, name); return GV; } -template -inline T *LowerPTLS::add_comdat(T *G) const -{ -#if defined(_OS_WINDOWS_) - // add __declspec(dllexport) to everything marked for export - if (G->getLinkage() == GlobalValue::ExternalLinkage) - G->setDLLStorageClass(GlobalValue::DLLExportStorageClass); - else - G->setDLLStorageClass(GlobalValue::DefaultStorageClass); -#endif - return G; -} - void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified) { if (pgcstack->use_empty()) { diff --git a/src/processor.cpp b/src/processor.cpp index a8aca2a64ab19..ea8e4101e6c33 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -21,6 +21,8 @@ #include #endif +#include + // CPU target string is a list of strings separated by `;` each string starts with a CPU // or architecture name and followed by an optional list of features separated by `,`. // A "generic" or empty CPU name means the basic required feature set of the target ISA @@ -629,47 +631,42 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) { jl_image_t res{}; - // .data base - char *data_base; - jl_dlsym(hdl, "jl_sysimg_gvars_base", (void**)&data_base, 1); + const jl_image_pointers_t *pointers; + jl_dlsym(hdl, "jl_image_pointers", (void**)&pointers, 1); - { - void *pgcstack_func_slot; - if (jl_dlsym(hdl, "jl_pgcstack_func_slot", &pgcstack_func_slot, 0)) { - void *pgcstack_key_slot; - jl_dlsym(hdl, "jl_pgcstack_key_slot", &pgcstack_key_slot, 1); - jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); - - size_t *tls_offset_idx; - jl_dlsym(hdl, "jl_tls_offset", (void **)&tls_offset_idx, 1); - *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset); - } - } + const void *ids = pointers->target_data; + uint32_t target_idx = callback(ids); + + std::cout << "Finished callback\n"; + + auto shard = pointers->shards[0]; + + std::cout << "Shard access is ok\n"; + + // .data base + char *data_base = (char *)shard.gvar_base; // .text base - char *text_base; - jl_dlsym(hdl, "jl_sysimg_fvars_base", (void**)&text_base, 1); + const char *text_base = shard.fvar_base; - const int32_t *offsets; - jl_dlsym(hdl, "jl_sysimg_fvars_offsets", (void**)&offsets, 1); + const int32_t *offsets = shard.fvar_offsets; uint32_t nfunc = offsets[0]; offsets++; - const void *ids; - jl_dlsym(hdl, "jl_dispatch_target_ids", (void**)&ids, 1); - uint32_t target_idx = callback(ids); + std::cout << "Initial offsets\n"; - const int32_t *reloc_slots; - jl_dlsym(hdl, "jl_dispatch_reloc_slots", (void **)&reloc_slots, 1); + const int32_t *reloc_slots = shard.clone_slots; + std::cout << reloc_slots << "\n"; const uint32_t nreloc = reloc_slots[0]; reloc_slots += 1; - const uint32_t *clone_idxs; - const int32_t *clone_offsets; - jl_dlsym(hdl, "jl_dispatch_fvars_idxs", (void**)&clone_idxs, 1); - jl_dlsym(hdl, "jl_dispatch_fvars_offsets", (void**)&clone_offsets, 1); + std::cout << "Set reloc_slots\n"; + const uint32_t *clone_idxs = shard.clone_idxs; + const int32_t *clone_offsets = shard.clone_offsets; uint32_t tag_len = clone_idxs[0]; clone_idxs += 1; + std::cout << "Set clone_idxs\n"; + assert(tag_len & jl_sysimg_tag_mask); std::vector base_offsets = {offsets}; // Find target @@ -688,6 +685,8 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) base_offsets.push_back(tag_len & jl_sysimg_tag_mask ? clone_offsets : nullptr); } + std::cout << "Set offsets\n"; + bool clone_all = (tag_len & jl_sysimg_tag_mask) != 0; // Fill in return value if (clone_all) { @@ -741,17 +740,19 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) (void)found; } + std::cout << "Finished relocation\n"; + res.fptrs.base = text_base; res.fptrs.offsets = offsets; res.gvars_base = (uintptr_t *)data_base; - jl_dlsym(hdl, "jl_sysimg_gvars_offsets", (void **)&res.gvars_offsets, 1); + res.gvars_offsets = shard.gvar_offsets; res.gvars_offsets += 1; #ifdef _OS_WINDOWS_ res.base = (intptr_t)hdl; #else Dl_info dlinfo; - if (dladdr((void*)res.gvars_base, &dlinfo) != 0) { + if (dladdr((void*)pointers, &dlinfo) != 0) { res.base = (intptr_t)dlinfo.dli_fbase; } else { @@ -759,6 +760,17 @@ static inline jl_image_t parse_sysimg(void *hdl, F &&callback) } #endif + std::cout << "Starting ptls\n"; + + { + void *pgcstack_func_slot = pointers->ptls->pgcstack_func_slot; + void *pgcstack_key_slot = pointers->ptls->pgcstack_key_slot; + jl_pgcstack_getkey((jl_get_pgcstack_func**)pgcstack_func_slot, (jl_pgcstack_key_t*)pgcstack_key_slot); + + size_t *tls_offset_idx = pointers->ptls->tls_offset; + *tls_offset_idx = (uintptr_t)(jl_tls_offset == -1 ? 0 : jl_tls_offset); + } + return res; } diff --git a/src/processor.h b/src/processor.h index f76722e885a1d..73271290eff76 100644 --- a/src/processor.h +++ b/src/processor.h @@ -162,6 +162,38 @@ typedef struct { jl_image_fptrs_t fptrs; } jl_image_t; +typedef struct { + uint32_t version; + uint32_t nshards; + uint32_t nfvars; + uint32_t ngvars; +} jl_image_header_t; + +typedef struct { + const char *fvar_base; + const int32_t *fvar_offsets; + const uint32_t *fvar_idxs; + uintptr_t *gvar_base; + const int32_t *gvar_offsets; + const uint32_t *gvar_idxs; + const int32_t *clone_slots; + const int32_t *clone_offsets; + const uint32_t *clone_idxs; +} jl_image_shard_t; + +typedef struct { + void *pgcstack_func_slot; + void *pgcstack_key_slot; + size_t *tls_offset; +} jl_image_ptls_t; + +typedef struct { + const jl_image_header_t *header; + const jl_image_shard_t *shards; // nshards-length array + const jl_image_ptls_t *ptls; + const void *target_data; +} jl_image_pointers_t; + /** * Initialize the processor dispatch system with sysimg `hdl` (also initialize the sysimg itself). * The dispatch system will find the best implementation to be used in this session.