Formalize printing more, correct module weight estimation with multiversioning
pchintalapudi committed Mar 3, 2023
1 parent bdf65f4 commit 4fc5bed
Showing 2 changed files with 55 additions and 26 deletions.
75 changes: 53 additions & 22 deletions src/aotcompile.cpp
@@ -601,7 +601,10 @@ static size_t getFunctionWeight(const Function &F)
// add some weight to it
weight += F.size();
if (F.hasFnAttribute("julia.mv.clones")) {
weight *= F.getFnAttribute("julia.mv.clones").getValueAsString().count(',') + 1;
auto val = F.getFnAttribute("julia.mv.clones").getValueAsString();
// base16, so must be at most 4 * length bits long
// popcount gives number of clones
weight *= APInt(val.size() * 4, val, 16).countPopulation() + 1;
}
return weight;
}
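
The new weight factor reads the "julia.mv.clones" attribute as a hexadecimal bitmask in which each set bit marks one extra multiversioned copy of the function. A minimal editor's sketch of that idea, not part of the diff; the helper name and the assumption of a non-empty mask are mine:

#include <llvm/ADT/APInt.h>
#include <llvm/ADT/StringRef.h>

// Assumes `hexMask` is the non-empty base-16 value of the "julia.mv.clones" attribute.
static size_t cloneWeightFactor(llvm::StringRef hexMask) {
    // Each hex digit carries 4 bits, so the mask fits in 4 * length bits.
    llvm::APInt mask(hexMask.size() * 4, hexMask, 16);
    // One clone per set bit, plus the original function itself.
    return mask.countPopulation() + 1;
}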
@@ -761,7 +764,8 @@ static SmallVector<Partition, 32> partitionModule(Module &M, unsigned threads) {
}

static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *outputs, StringRef name,
NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_) {
NewArchiveMember *unopt, NewArchiveMember *opt, NewArchiveMember *obj, NewArchiveMember *asm_,
std::stringstream &stream, unsigned i) {
auto TM = std::unique_ptr<TargetMachine>(
SourceTM.getTarget().createTargetMachine(
SourceTM.getTargetTriple().str(),
@@ -814,7 +818,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out

end = jl_hrtime();

dbgs() << "optimize time: " << (end - start) / 1e9 << "s\n";
stream << "optimize time for shard " << i << ": " << (end - start) / 1e9 << "s\n";

if (opt) {
raw_string_ostream OS(*outputs);
@@ -847,7 +851,7 @@ static void add_output_impl(Module &M, TargetMachine &SourceTM, std::string *out

end = jl_hrtime();

dbgs() << "codegen time: " << (end - start) / 1e9 << "s\n";
stream << "codegen time for shard " << i << ": " << (end - start) / 1e9 << "s\n";

if (asm_) {
SmallVector<char, 0> Buffer;
@@ -1002,11 +1006,14 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
asm_.resize(asm_.size() + asm_out * threads);
if (threads == 1) {
start = jl_hrtime();
std::stringstream stream;
add_output_impl(M, TM, outputs.data() + outputs.size() - outcount * 2, name,
unopt_out ? unopt.data() + unopt.size() - 1 : nullptr,
opt_out ? opt.data() + opt.size() - 1 : nullptr,
obj_out ? obj.data() + obj.size() - 1 : nullptr,
asm_out ? asm_.data() + asm_.size() - 1 : nullptr);
asm_out ? asm_.data() + asm_.size() - 1 : nullptr,
stream, 0);
dbgs() << stream.str();
end = jl_hrtime();
dbgs() << "Time to add output: " << (end - start) / 1e9 << "s\n";
return;
@@ -1034,6 +1041,7 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
auto asmstart = asm_out ? asm_.data() + asm_.size() - threads : nullptr;

std::vector<std::thread> workers(threads);
std::vector<std::stringstream> stderrs(threads);
for (unsigned i = 0; i < threads; i++) {
workers[i] = std::thread([&, i](){
LLVMContext ctx;
@@ -1042,43 +1050,46 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
start = jl_hrtime();
auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
end = jl_hrtime();
dbgs() << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
stderrs[i] << "Deserialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";

dbgs() << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";
stderrs[i] << "Starting shard " << i << " with weight=" << partitions[i].weight << "\n";

start = jl_hrtime();
materializePreserved(*M, partitions[i]);
end = jl_hrtime();
dbgs() << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
stderrs[i] << "Materialization time for shard " << i << ": " << (end - start) / 1e9 << "s\n";

start = jl_hrtime();
construct_vars(*M, partitions[i]);
M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
end = jl_hrtime();

dbgs() << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
stderrs[i] << "Construction time for shard " << i << ": " << (end - start) / 1e9 << "s\n";

start = jl_hrtime();
dropUnusedDeclarations(*M);
end = jl_hrtime();

dbgs() << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
stderrs[i] << "Declaration deletion time for shard " << i << ": " << (end - start) / 1e9 << "s\n";

start = jl_hrtime();
add_output_impl(*M, TM, outstart + i * outcount * 2, name,
unoptstart ? unoptstart + i : nullptr,
optstart ? optstart + i : nullptr,
objstart ? objstart + i : nullptr,
asmstart ? asmstart + i : nullptr);
asmstart ? asmstart + i : nullptr,
stderrs[i], i);
end = jl_hrtime();

dbgs() << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
stderrs[i] << "Output time for shard " << i << ": " << (end - start) / 1e9 << "s\n";
});
}

start = jl_hrtime();
for (auto &w : workers)
w.join();
for (auto &str : stderrs)
dbgs() << str.str();
end = jl_hrtime();

dbgs() << "Total time for parallel output: " << (end - start) / 1e9 << "s\n";
@@ -1087,32 +1098,46 @@ static void add_output(Module &M, TargetMachine &TM, std::vector<std::string> &o
unsigned compute_image_thread_count(Module &M) {
// 32-bit systems are very memory-constrained
#ifdef _P32
dbgs() << "Threads: 1\n";
return 1;
#endif
unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u);

// memory limit check
// many threads use a lot of memory, so limit on constrained memory systems
size_t available = uv_get_available_memory();
size_t weight = 0;
size_t globals = 0;
for (auto &GV : M.global_values()) {
if (GV.isDeclaration())
continue;
globals++;
if (isa<Function>(GV)) {
weight += getFunctionWeight(cast<Function>(GV));
} else {
weight += 1;
}
}
if (weight == 0) {
dbgs() << "No globals in module, using 1 thread\n";
dbgs() << "Module weight: " << weight << "\n";
if (weight < 1000) {
dbgs() << "Low module complexity bailout\n";
dbgs() << "Threads: 1\n";
return 1;
}

unsigned threads = std::max(llvm::hardware_concurrency().compute_thread_count() / 2, 1u);

// memory limit check
// many threads use a lot of memory, so limit on constrained memory systems
size_t available = uv_get_available_memory();
// crude estimate, available / (weight * fudge factor) = max threads
size_t fudge = 10;
unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
dbgs() << "Weight: " << weight << ", available: " << available << ", wanted: " << threads << ", max threads: " << max_threads << "\n";
threads = std::min(threads, max_threads);
if (max_threads < threads) {
dbgs() << "Memory limiting threads to " << max_threads << "\n";
threads = max_threads;
}

max_threads = globals / 100;
if (max_threads < threads) {
dbgs() << "Low global count limiting threads to " << max_threads << " (" << globals << "globals)\n";
threads = max_threads;
}

// environment variable override
const char *env_threads = getenv("JULIA_IMAGE_THREADS");
@@ -1122,10 +1147,15 @@ unsigned compute_image_thread_count(Module &M) {
if (*endptr || !requested) {
jl_safe_printf("WARNING: invalid value '%s' for JULIA_IMAGE_THREADS\n", env_threads);
} else {
dbgs() << "Overriding threads to " << requested << "\n";
threads = requested;
}
}

threads = std::max(threads, 1u);

dbgs() << "Threads: " << threads << "\n";

return threads;
}
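
The heuristic in compute_image_thread_count caps the requested thread count in three ways: available memory (available / (weight * fudge)), the number of defined globals (roughly one thread per 100 globals), and an optional JULIA_IMAGE_THREADS override. A worked example with made-up numbers, as an editor's sketch: only the fudge factor of 10 and the /100 cap come from the diff, the 4 GiB, weight, and global figures are illustrative, and the environment override is omitted.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    size_t weight = 50000;          // estimated module weight
    size_t available = 4ull << 30;  // 4 GiB of available memory
    size_t globals = 1200;          // defined global values
    unsigned wanted = 8;            // hardware_concurrency() / 2
    size_t fudge = 10;
    unsigned threads = wanted;
    unsigned max_threads = std::max(available / (weight * fudge), (size_t)1);
    if (max_threads < threads)
        threads = max_threads;      // memory-limited
    max_threads = globals / 100;
    if (max_threads < threads)
        threads = max_threads;      // too few globals to split further
    threads = std::max(threads, 1u);
    printf("threads = %u\n", threads); // prints 8 with these numbers
    return 0;
}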

@@ -1208,7 +1238,7 @@ void jl_dump_native_impl(void *native_code,

start = jl_hrtime();

unsigned threads = compute_image_thread_count(*dataM);
unsigned threads = 1;
unsigned nfvars = 0;
unsigned ngvars = 0;

@@ -1225,6 +1255,7 @@ void jl_dump_native_impl(void *native_code,
}
}
}
threads = compute_image_thread_count(*dataM);
nfvars = data->jl_sysimg_fvars.size();
ngvars = data->jl_sysimg_gvars.size();
emit_offset_table(*dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize);
6 changes: 2 additions & 4 deletions src/llvm-multiversioning.cpp
@@ -313,8 +313,6 @@ struct CloneCtx {
// Map from original function to one based index in `fvars`
std::map<const Function*,uint32_t> func_ids{};
std::vector<Function*> orig_funcs{};
std::vector<uint32_t> func_infos{};
std::set<Function*> cloned{};
// GV addresses and their corresponding function id (i.e. 0-based index in `fvars`)
std::vector<std::pair<Constant*,uint32_t>> gv_relocs{};
// Mapping from function id (i.e. 0-based index in `fvars`) to GVs to be initialized.
Expand Down Expand Up @@ -650,7 +648,7 @@ void CloneCtx::fix_gv_uses()
return changed;
};
for (auto orig_f: orig_funcs) {
if (groups.size() == 1 && !cloned.count(orig_f))
if (!orig_f->hasFnAttribute("julia.mv.clones"))
continue;
while (single_pass(orig_f)) {
}
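
In fix_gv_uses, the presence of the "julia.mv.clones" attribute now stands in for the old `cloned` bookkeeping set removed from CloneCtx above, so the check also holds when more than one target group exists. A one-line sketch of the predicate, an editor's illustration with a hypothetical helper name, not part of the diff:

#include <llvm/IR/Function.h>

// True iff the multiversioning pass marked this function as having target clones.
static bool isMultiversioned(const llvm::Function &F) {
    return F.hasFnAttribute("julia.mv.clones");
}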
@@ -813,7 +811,7 @@ void CloneCtx::emit_metadata()
std::set<uint32_t> shared_relocs;
{
auto T_int32 = Type::getInt32Ty(M.getContext());
std::stable_sort(gv_relocs.begin(), gv_relocs.end(),
std::sort(gv_relocs.begin(), gv_relocs.end(),
[] (const std::pair<Constant*,uint32_t> &lhs,
const std::pair<Constant*,uint32_t> &rhs) {
return lhs.second < rhs.second;
