From 6b8ec27dbc582ba67f717e400b1bcff8f886c6d3 Mon Sep 17 00:00:00 2001 From: Prem Chintalapudi Date: Sun, 5 Mar 2023 23:57:51 -0500 Subject: [PATCH] Add some documentation --- src/aotcompile.cpp | 52 +++++++++-- src/llvm-multiversioning.cpp | 2 + src/processor.h | 164 +++++++++++++++++++---------------- 3 files changed, 133 insertions(+), 85 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index d512ad586a680..0337602cde27e 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -505,6 +505,7 @@ static void injectCRTAlias(Module &M, StringRef name, StringRef alias, FunctionT void multiversioning_preannotate(Module &M); +// See src/processor.h for documentation about this table. Corresponds to jl_image_shard_t. static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, unsigned threads) { SmallVector tables(sizeof(jl_image_shard_t) / sizeof(void *) * threads); for (unsigned i = 0; i < threads; i++) { @@ -533,6 +534,7 @@ static GlobalVariable *emit_shard_table(Module &M, Type *T_size, Type *T_psize, return tables_gv; } +// See src/processor.h for documentation about this table. Corresponds to jl_image_ptls_t. static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) { std::array ptls_table{ new GlobalVariable(M, T_size, false, GlobalValue::ExternalLinkage, Constant::getNullValue(T_size), "jl_pgcstack_func_slot"), @@ -548,6 +550,7 @@ static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_psize) { return ptls_table_gv; } +// See src/processor.h for documentation about this table. Corresponds to jl_image_header_t. 
static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) { constexpr uint32_t version = 1; std::array header{ @@ -562,13 +565,7 @@ static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned n return header_gv; } -struct Partition { - StringSet<> globals; - StringMap fvars; - StringMap gvars; - size_t weight; -}; - +// Grab fvars and gvars data from the module static void get_fvars_gvars(Module &M, DenseMap &fvars, DenseMap &gvars) { auto fvars_gv = M.getGlobalVariable("jl_fvars"); auto gvars_gv = M.getGlobalVariable("jl_gvars"); @@ -600,6 +597,11 @@ static void get_fvars_gvars(Module &M, DenseMap &fvars, gvars_idxs->eraseFromParent(); } +// Weight computation +// It is important for multithreaded image building to be able to split work up +// among the threads equally. The weight calculated here is an estimation of +// how expensive a particular function is going to be to compile. + struct FunctionInfo { size_t weight; size_t bbs; @@ -667,6 +669,13 @@ ModuleInfo compute_module_info(Module &M) { return info; } +struct Partition { + StringSet<> globals; + StringMap fvars; + StringMap gvars; + size_t weight; +}; + static inline bool verify_partitioning(const SmallVectorImpl &partitions, const Module &M, size_t fvars_size, size_t gvars_size) { bool bad = false; #ifndef JL_NDEBUG @@ -729,7 +738,7 @@ static inline bool verify_partitioning(const SmallVectorImpl &partiti return !bad; } -// Chop a module up as equally as possible into threads partitions +// Chop a module up as equally as possible by weight into threads partitions static SmallVector partitionModule(Module &M, unsigned threads) { //Start by stripping fvars and gvars, which helpfully removes their uses as well DenseMap fvars, gvars; @@ -926,6 +935,7 @@ struct ShardTimers { } }; +// Perform the actual optimization and emission of the output files static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, ArrayRef 
names, NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_, ShardTimers &timers, unsigned shardidx) { @@ -1048,6 +1058,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out } } +// serialize module to bitcode static auto serializeModule(const Module &M) { assert(!verifyModule(M, &errs()) && "Serializing invalid module!"); SmallVector ClonedModuleBuffer; @@ -1058,6 +1069,12 @@ static auto serializeModule(const Module &M) { return ClonedModuleBuffer; } +// Modules are deserialized lazily by LLVM, to avoid deserializing +// unnecessary functions. We take advantage of this by serializing +// the entire module once, then deleting the bodies of functions +// that are not in this partition. Once unnecessary functions are +// deleted, we then materialize the entire module to make use-lists +// consistent. static void materializePreserved(Module &M, Partition &partition) { DenseSet Preserve; for (auto &GV : M.global_values()) { @@ -1083,6 +1100,12 @@ static void materializePreserved(Module &M, Partition &partition) { } } } + // Global aliases are a pain to deal with. It is illegal to have an alias to a declaration, + // so we need to replace them with either a function or a global variable declaration. However, + // we can't just delete the alias, because that would break the users of the alias. Therefore, + // we do a dance where we point each global alias to a dummy function or global variable, + // then materialize the module to access use-lists, then replace all the uses, and finally commit + // to deleting the old alias. 
SmallVector> DeletedAliases; for (auto &GA : M.aliases()) { if (!GA.isDeclaration()) { @@ -1116,6 +1139,7 @@ static void materializePreserved(Module &M, Partition &partition) { } } +// Reconstruct jl_fvars, jl_gvars, jl_fvars_idxs, and jl_gvars_idxs from the partition static void construct_vars(Module &M, Partition &partition) { std::vector> fvar_pairs; fvar_pairs.reserve(partition.fvars.size()); @@ -1168,6 +1192,8 @@ static void construct_vars(Module &M, Partition &partition) { gidxs_var->setVisibility(GlobalValue::HiddenVisibility); } +// Materialization will leave many unused declarations, which multiversioning would otherwise clone. +// This function removes them to avoid unnecessary cloning of declarations. static void dropUnusedDeclarations(Module &M) { SmallVector unused; for (auto &G : M.global_values()) { @@ -1184,6 +1210,8 @@ static void dropUnusedDeclarations(Module &M) { G->eraseFromParent(); } +// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, +// as well as partitioning, serialization, and deserialization. 
static void add_output(Module &M, TargetMachine &TM, std::vector &outputs, ArrayRef names, std::vector &unopt, std::vector &opt, std::vector &obj, std::vector &asm_, @@ -1198,6 +1226,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector &o asm_.resize(asm_.size() + asm_out * threads); auto name = names[2]; name.consume_back(".o"); + // Timers for timing purposes TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); SmallVector timers(threads); for (unsigned i = 0; i < threads; ++i) { @@ -1232,6 +1261,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector &o errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n"; } } + // Single-threaded case if (threads == 1) { output_timer.startTimer(); add_output_impl(M, TM, outputs.data() + outputs.size() - outcount, names, @@ -1255,6 +1285,8 @@ static void add_output(Module &M, TargetMachine &TM, std::vector &o partition_timer.startTimer(); uint64_t counter = 0; + // Partitioning requires all globals to have names. + // We use a prefix to avoid name conflicts with user code. for (auto &G : M.global_values()) { if (!G.isDeclaration() && !G.hasName()) { G.setName("jl_ext_" + Twine(counter++)); @@ -1262,6 +1294,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector &o } auto partitions = partitionModule(M, threads); partition_timer.stopTimer(); + serialize_timer.startTimer(); auto serialized = serializeModule(M); serialize_timer.stopTimer(); @@ -1274,10 +1307,12 @@ static void add_output(Module &M, TargetMachine &TM, std::vector &o auto objstart = obj_out ? obj.data() + obj.size() - threads : nullptr; auto asmstart = asm_out ? 
asm_.data() + asm_.size() - threads : nullptr; + // Start all of the worker threads std::vector workers(threads); for (unsigned i = 0; i < threads; i++) { workers[i] = std::thread([&, i](){ LLVMContext ctx; + // Lazily deserialize the entire module timers[i].deserialize.startTimer(); auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module"); timers[i].deserialize.stopTimer(); @@ -1304,6 +1339,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector &o }); } + // Wait for all of the worker threads to finish for (auto &w : workers) w.join(); diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index cbce76d702119..0474cb0c7add7 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -3,6 +3,8 @@ // Function multi-versioning // LLVM pass to clone function for different archs +//see src/processor.h for documentation of the relevant globals inserted here + #include "llvm-version.h" #include "passes.h" diff --git a/src/processor.h b/src/processor.h index 6445f221882ba..497a93d40e11f 100644 --- a/src/processor.h +++ b/src/processor.h @@ -14,82 +14,9 @@ extern "C" { #endif -/** - * Related sysimg exported symbols - * - * In the following text, function refers to an abstract entity. - * It corresponds to a `Function` that we emit in the codegen, and there might be multiple copies - * of it in the system image. Only one of those copies will be used in a given session. - * Function pointers refer to a real piece of code in the system image. - * Each function might have multiple function pointers in the system image - * and each function pointer will correspond to only one function. 
- * - * # Global function and base pointers - * `jl_sysimg_gvars_base`: - * The address of this symbol is the base data pointer - * (all other data pointers are stored as offsets to this address) - * `jl_sysimg_fvars_base`: - * The address of this symbol is the base function pointer - * (all other function pointers are stored as offsets to this address) - * `jl_sysimg_fvars_offsets`: [static data] - * The array of function pointer offsets (`int32_t`) from the base pointer. - * This includes all julia functions in sysimg as well as all other functions that are cloned. - * The default function pointer is used if the function is cloned. - * The first element is the size of the array, which should **NOT** be used as the number - * of julia functions in the sysimg. - * Each entry in this array uniquely identifies a function we are interested in - * (the function may have multiple function pointers corresponding to different versions). - * In other sysimg info, all references to functions are stored as their `uint32_t` index - * in this array. - * - * # Target data and dispatch slots (Only needed by runtime during loading) - * `jl_dispatch_target_ids`: [static data] serialize target data. - * This contains the number of targets which is needed to decode `jl_dispatch_fvars_idxs` - * in addition to the name and feature set of each target. - * `jl_dispatch_reloc_slots`: [static data] location and index of relocation slots. - * Stored as pairs of function indices and `int32_t` offsets from `jl_sysimg_gvars_base`. - * The first element is an `uint32_t` giving the number of relocations. - * This is needed for functions whose address is used in a way that requires dispatch. - * We currently only support one type of relocation (i.e. absolute pointer) which is enough - * for all use in functions as well as GOT slot (for "PLT" callback). - * Note that not all functions being cloned are assigned a slot. - * This array is sorted by the function indices. 
- * There can be more than one slot per-function, - * i.e. there can be duplicated function indices. - * - * # Target functions - * `jl_dispatch_fvars_idxs`: [static data] Target-specific function indices. - * For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index - * of the base target followed by an array of tagged function indices. - * The base target index is required to be smaller than the index of the current target - * and must be the default (`0`) or a `clone_all` target. - * If it's not `0`, the function pointer array for the `clone_all` target will be used as - * the base function pointer offsets instead. - * The tag bits for both the length and the indices are the top bit. - * A tagged length indicates that all of the functions are cloned and the indices follows - * are the ones that requires relocation. The base target index is omitted in this case. - * Otherwise, the length is the total number of functions that we are interested in - * for this target, which includes all cloned julia functions and - * all other cloned functions that requires relocation. - * A tagged index means that the function pointer should be filled into the GOT slots - * identified by `jl_dispatch_reloc_slots`. There could be more than one slot per function. - * (Note that a tagged index could corresponds to a functions pointer that's the same as - * the base one since this is the only way we currently represent relocations.) - * A tagged length implicitly tags all the indices and the indices will not have the tag bit - * set. The lengths in this variable is needed to decode `jl_dispatch_fvars_offsets`. - * `jl_dispatch_fvars_offsets`: [static data] Target-specific function pointer offsets. - * This contains all the cloned functions that we are interested in and it needs to be decoded - * and used along with `jl_dispatch_fvars_idxs`. 
- * For the default target, there's no entries in this variable, if there's any relocations - * needed for the default target, the function pointers are taken from the global offset - * arrays directly. - * For a `clone_all` target (i.e. with the length in `jl_dispatch_fvars_idxs` tagged), this - * variable contains an offset array of the same length as the global one. Only the indices - * appearing in `jl_dispatch_fvars_idxs` need relocation and the dispatch code should return - * this array as the original/base function offsets. - * For other targets, this variable contains an offset array with the length defined in - * `jl_dispatch_fvars_idxs`. Tagged indices need relocations. - */ +// Image metadata +// Every image exports a `jl_image_pointers_t` as a global symbol `jl_image_pointers`. +// This symbol acts as a root for all other code-related symbols in the image. enum { JL_TARGET_VEC_CALL = 1 << 0, @@ -163,35 +90,118 @@ typedef struct { jl_image_fptrs_t fptrs; } jl_image_t; +// The header for each image +// Details important counts about the image typedef struct { + // The version of the image format + // Most up-to-date version is 1 uint32_t version; + // The number of shards in this image uint32_t nshards; + // The total number of fvars in this image among all shards uint32_t nfvars; + // The total number of gvars in this image among all shards uint32_t ngvars; } jl_image_header_t; +// Per-shard data for image shards. Each image contains header->nshards of these. typedef struct { + + // This is the base function pointer + // (all other function pointers are stored as offsets to this address) const char *fvar_base; + + // The array of function pointer offsets (`int32_t`) from the base pointer. + // This includes all julia functions in sysimg as well as all other functions that are cloned. + // The default function pointer is used if the function is cloned. 
+ // The first element is the size of the array, which should **NOT** be used as the number + // of julia functions in the sysimg. + // Each entry in this array uniquely identifies a function we are interested in + // (the function may have multiple function pointers corresponding to different versions). + // In other sysimg info, all references to functions are stored as their `uint32_t` index + // in this array. const int32_t *fvar_offsets; + // This is the mapping of shard function index -> global function index + // staticdata.c relies on the order of functions in the global function array being + // the same as what it saw when serializing the global function array. However, partitioning + // into multiple shards will cause functions to be reordered. This array is used to map + // back to the original function array for loading. const uint32_t *fvar_idxs; + // This is the base data pointer + // (all other data pointers in this shard are stored as offsets to this address) uintptr_t *gvar_base; + // This is the array of global variable offsets (`int32_t`) from the base pointer. + // Similar to fvar_offsets, but for gvars const int32_t *gvar_offsets; + // This is the mapping of shard global variable index -> global global variable index + // Similar to fvar_idxs, but for gvars const uint32_t *gvar_idxs; + + // location and index of relocation slots. + // Stored as pairs of function indices and `int32_t` offsets from `gvar_base`. + // The first element is an `uint32_t` giving the number of relocations. + // This is needed for functions whose address is used in a way that requires dispatch. + // We currently only support one type of relocation (i.e. absolute pointer) which is enough + // for all use in functions as well as GOT slot (for "PLT" callback). + // Note that not all functions being cloned are assigned a slot. + // This array is sorted by the function indices. + // There can be more than one slot per-function, + // i.e. 
there can be duplicated function indices. const int32_t *clone_slots; + // Target-specific function pointer offsets. + // This contains all the cloned functions that we are interested in and it needs to be decoded + // and used along with `clone_idxs`. + // For the default target, there are no entries in this variable; if any relocations are + // needed for the default target, the function pointers are taken from the global offset + // arrays directly. + // For a `clone_all` target (i.e. with the length in `clone_idxs` tagged), this + // variable contains an offset array of the same length as the global one. Only the indices + // appearing in `clone_idxs` need relocation and the dispatch code should return + // this array as the original/base function offsets. + // For other targets, this variable contains an offset array with the length defined in + // `clone_idxs`. Tagged indices need relocations. const int32_t *clone_offsets; + // Target-specific function indices. + // For each target, this includes a tagged `uint32_t` length, an optional `uint32_t` index + // of the base target followed by an array of tagged function indices. + // The base target index is required to be smaller than the index of the current target + // and must be the default (`0`) or a `clone_all` target. + // If it's not `0`, the function pointer array for the `clone_all` target will be used as + // the base function pointer offsets instead. + // The tag bit for both the length and the indices is the top bit. + // A tagged length indicates that all of the functions are cloned and the indices that follow + // are the ones that require relocation. The base target index is omitted in this case. + // Otherwise, the length is the total number of functions that we are interested in + // for this target, which includes all cloned julia functions and + // all other cloned functions that require relocation. 
+ // A tagged index means that the function pointer should be filled into the GOT slots + // identified by `clone_slots`. There could be more than one slot per function. + // (Note that a tagged index could correspond to a function pointer that's the same as + // the base one since this is the only way we currently represent relocations.) + // A tagged length implicitly tags all the indices and the indices will not have the tag bit + // set. The lengths in this variable are needed to decode `clone_offsets`. const uint32_t *clone_idxs; } jl_image_shard_t; +// The TLS data for each image typedef struct { void *pgcstack_func_slot; void *pgcstack_key_slot; size_t *tls_offset; } jl_image_ptls_t; +// The root struct for images, points to all the other globals typedef struct { + // The image header, contains numerical global data const jl_image_header_t *header; - const jl_image_shard_t *shards; // nshards-length array + // The shard table, contains per-shard data + const jl_image_shard_t *shards; // points to header->nshards length array + // The TLS data const jl_image_ptls_t *ptls; + + // serialized target data + // This contains the number of targets + // in addition to the name and feature set of each target. const void *target_data; } jl_image_pointers_t;