Skip to content

Commit

Permalink
(Binary Analysis) New instruction frequency tools
Browse files Browse the repository at this point in the history
RPM-15
  • Loading branch information
matzke1 committed Sep 10, 2020
1 parent ec5f68b commit 3a3423c
Show file tree
Hide file tree
Showing 8 changed files with 351 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/frontend/Partitioner2/Engine.C
Original file line number Diff line number Diff line change
Expand Up @@ -1868,7 +1868,7 @@ Engine::savePartitioner(const Partitioner &partitioner, const boost::filesystem:
Partitioner
Engine::loadPartitioner(const boost::filesystem::path &name, SerialIo::Format fmt) {
Sawyer::Message::Stream info(mlog[INFO]);
info <<"reading RBA state file";
info <<"reading RBA state from " <<name;
Sawyer::Stopwatch timer;
SerialInput::Ptr archive = SerialInput::instance();
archive->format(fmt);
Expand Down
6 changes: 6 additions & 0 deletions tools/BinaryAnalysis/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ if(enable-binary-analysis)
add_executable(bat-dis bat-dis.C)
target_link_libraries(bat-dis bat ROSE_DLL)

add_executable(bat-insnfreq bat-insnfreq.C)
target_link_libraries(bat-insnfreq bat ROSE_DLL)

add_executable(bat-insnfreq-cmp bat-insnfreq-cmp.C)
target_link_libraries(bat-insnfreq-cmp bat ROSE_DLL)

add_executable(bat-entropy bat-entropy.C)
target_link_libraries(bat-entropy bat ROSE_DLL)

Expand Down
14 changes: 14 additions & 0 deletions tools/BinaryAnalysis/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ bat_dis_LDFLAGS = $(ROSE_RPATHS)
bat_dis_LDADD = libbatSupport.a $(ROSE_LIBS)
tests += bat-dis.passed

bin_PROGRAMS += bat-insnfreq
bat_insnfreq_SOURCES = bat-insnfreq.C
bat_insnfreq_CPPFLAGS = $(ROSE_INCLUDES)
bat_insnfreq_LDFLAGS = $(ROSE_RPATHS)
bat_insnfreq_LDADD = libbatSupport.a $(ROSE_LIBS)
tests += bat-insnfreq.passed

bin_PROGRAMS += bat-insnfreq-cmp
bat_insnfreq_cmp_SOURCES = bat-insnfreq-cmp.C
bat_insnfreq_cmp_CPPFLAGS = $(ROSE_INCLUDES)
bat_insnfreq_cmp_LDFLAGS = $(ROSE_RPATHS)
bat_insnfreq_cmp_LDADD = libbatSupport.a $(ROSE_LIBS)
tests += bat-insnfreq-cmp.passed

bin_PROGRAMS += bat-entropy
bat_entropy_SOURCES = bat-entropy.C
bat_entropy_CPPFLAGS = $(ROSE_INCLUDES)
Expand Down
4 changes: 4 additions & 0 deletions tools/BinaryAnalysis/Tupfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ run $(tool_compile_linkexe) --install -I. bat-cg.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-container.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-dis.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-entropy.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-insnfreq.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-insnfreq-cmp.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-linear.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-lsb.C libbatSupport
run $(tool_compile_linkexe) --install -I. bat-lsd.C libbatSupport
Expand All @@ -30,6 +32,8 @@ run $(test) bat-cfg ./bat-cfg --self-test
run $(test) bat-container ./bat-container --self-test
run $(test) bat-dis ./bat-dis --self-test
run $(test) bat-entropy ./bat-entropy --self-test
run $(test) bat-insnfreq ./bat-insnfreq --self-test
#run $(test) bat-insnfreq-cmp ./bat-insnfreq-cmp --self-test
run $(test) bat-linear ./bat-linear --self-test
run $(test) bat-lsb ./bat-lsb --self-test
run $(test) bat-lsf ./bat-lsf --self-test
Expand Down
64 changes: 64 additions & 0 deletions tools/BinaryAnalysis/bat-insnfreq-cmp.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// One-line summary of this tool; handed to Rose::CommandLine::createEmptyParser() in parseCommandLine().
static const char *purpose = "compare instruction frequencies";
// Longer description of this tool; handed to the same parser factory together with `purpose`.
static const char *description =
"Reads instruction frequencies from files specified on the command-line, then compares the first file with "
"all the remaining files. The files must have been created with @sa{bat-insnfreq}(1).";

#include <rose.h>
#include <CommandLine.h> // rose
#include <Diagnostics.h> // rose

#include <batSupport.h>
#include <boost/filesystem.hpp>
#include <Sawyer/CommandLine.h>

using namespace Bat;
using namespace Rose;
using namespace Rose::BinaryAnalysis;
using namespace Sawyer::Message::Common;

// Diagnostic message facility for this tool; registered under the name "tool" at the start of main().
static Sawyer::Message::Facility mlog;

static std::vector<boost::filesystem::path>
parseCommandLine(int argc, char *argv[]) {
using namespace Sawyer::CommandLine;

SwitchGroup generic = Rose::CommandLine::genericSwitches();

Parser parser = Rose::CommandLine::createEmptyParser(purpose, description);
parser.errorStream(mlog[FATAL]);
parser.doc("Synopsis", "@prop{programName} [@v{switches}] @v{freq_files}...");

std::vector<std::string> input = parser.parse(argc, argv).apply().unreachedArgs();
if (input.size() < 2) {
mlog[FATAL] <<"at least two frequency files must be specified\n";
exit(1);
}
return std::vector<boost::filesystem::path>(input.begin(), input.end());
}

int
main(int argc, char *argv[]) {
ROSE_INITIALIZE;
Diagnostics::initAndRegister(&mlog, "tool");
mlog.comment("instruction frequencies");
Bat::checkRoseVersionNumber(MINIMUM_ROSE_LIBRARY_VERSION, mlog[FATAL]);
Bat::registerSelfTests();
std::vector<boost::filesystem::path> fileNames = parseCommandLine(argc, argv);

// Read the first histogram
InsnHistogram a = loadInsnHistogram(fileNames[0]);
std::vector<InsnHistogram> aQuartiles = splitInsnHistogram(a, 4);

// Compare the first histogram with the rest
std::vector<std::pair<boost::filesystem::path, double>> output;
for (size_t i = 1; i < fileNames.size(); ++i) {
InsnHistogram b = loadInsnHistogram(fileNames[i]);
double diff = compareInsnHistograms(aQuartiles, b);
output.push_back(std::make_pair(fileNames[i], diff));
}

// Produce output
std::sort(output.begin(), output.end(), [](auto &a, auto &b) { return a.second < b.second; });
for (const auto &pair: output)
std::cout <<(boost::format("%7.3f%%") % (100.0*pair.second)) <<"\t" <<pair.first <<"\n";
}
96 changes: 96 additions & 0 deletions tools/BinaryAnalysis/bat-insnfreq.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// One-line summary of this tool; handed to Rose::CommandLine::createEmptyParser() in parseCommandLine().
static const char *purpose = "count instructions";
// Longer description of this tool; handed to the same parser factory together with `purpose`.
static const char *description =
"Scans the memory regions marked as executable and does a linear disassembly. The instruction mnemonics are listed along "
"with the number of times they occur.";

#include <rose.h>
#include <CommandLine.h> // rose
#include <Diagnostics.h> // rose
#include <Partitioner2/Engine.h> // rose
#include <Partitioner2/Partitioner.h> // rose

#include <batSupport.h>
#include <boost/filesystem.hpp>
#include <boost/format.hpp>
#include <Sawyer/CommandLine.h>

using namespace Bat;
using namespace Rose;
using namespace Rose::BinaryAnalysis;
using namespace Sawyer::Message::Common;
namespace P2 = Rose::BinaryAnalysis::Partitioner2;

// Diagnostic message facility for this tool; registered under the name "tool" at the start of main().
static Sawyer::Message::Facility mlog;
// Serialization format used when loading RBA files in main(). It is handed to Bat::stateFileFormatSwitch()
// when the parser is built (presumably so a command-line switch can override the default -- confirm in batSupport).
static SerialIo::Format stateFormat = SerialIo::BINARY;
// Targets of the --output/-o and --input/-i switches, respectively. main() treats an empty path as "not specified".
static boost::filesystem::path saveAs, augmentFrom;

static std::vector<boost::filesystem::path>
parseCommandLine(int argc, char *argv[]) {
using namespace Sawyer::CommandLine;

SwitchGroup generic = Rose::CommandLine::genericSwitches();
generic.insert(Bat::stateFileFormatSwitch(stateFormat));
generic.insert(Switch("output", 'o')
.argument("file", anyParser(saveAs))
.doc("Save output in the specified file, overwriting the file if it already existed. The default "
"is to print the output as a textual table to standard output."));

generic.insert(Switch("input", 'i')
.argument("file", anyParser(augmentFrom))
.doc("Initialize the histogram with data from the specified file. This can be used to accumulate "
"instruction frequencies across multiple specimens."));

Parser parser = Rose::CommandLine::createEmptyParser(purpose, description);
parser.errorStream(mlog[FATAL]);
parser.with(generic);
parser.doc("Synopsis", "@prop{programName} [@v{switches}] [@v{rba_files}...]");

std::vector<std::string> input = parser.parse(argc, argv).apply().unreachedArgs();
return std::vector<boost::filesystem::path>(input.begin(), input.end());
}

int
main(int argc, char *argv[]) {
ROSE_INITIALIZE;
Diagnostics::initAndRegister(&mlog, "tool");
mlog.comment("instruction frequencies");
Bat::checkRoseVersionNumber(MINIMUM_ROSE_LIBRARY_VERSION, mlog[FATAL]);
Bat::registerSelfTests();
std::vector<boost::filesystem::path> rbaFiles = parseCommandLine(argc, argv);

// Initialize the histogram.
InsnHistogram histogram;
if (!augmentFrom.empty()) {
if (!boost::filesystem::exists(augmentFrom) && augmentFrom == saveAs) {
// As a special case, it's not an error if the augment-from file does not exist but would be created as the output
// of this tool. This is so that this tool can be used in a shell "for" loop like:
// rm result.dat
// for f in *.rba; do
// bat-insnfreq -a result.dat -o result.dat "$f"
// done
} else {
try {
histogram = loadInsnHistogram(augmentFrom);
} catch (const SerialIo::Exception &e) {
mlog[FATAL] <<e.what() <<"\n";
exit(1);
}
}
}

// Compute the histogram
for (const boost::filesystem::path &rbaFile: rbaFiles) {
P2::Engine engine;
P2::Partitioner partitioner = engine.loadPartitioner(rbaFile, stateFormat);
MemoryMap::Ptr map = partitioner.memoryMap();
ASSERT_not_null(map);
mergeInsnHistogram(histogram, computeInsnHistogram(partitioner.instructionProvider(), map));
}

// Emit results
if (!saveAs.empty()) {
saveInsnHistogram(histogram, saveAs);
} else {
printInsnHistogram(histogram, std::cout);
}
}
103 changes: 103 additions & 0 deletions tools/BinaryAnalysis/batSupport.C
Original file line number Diff line number Diff line change
Expand Up @@ -432,4 +432,107 @@ PathSelector::maybeTerminate() const {
_exit(0);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Instruction histograms
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#if __cplusplus >= 201402L

// Counts instruction mnemonics found by a linear sweep over the executable addresses of `map`.
//
// Starting at address zero, the sweep repeatedly asks the map for the next executable address at
// or after the current one. An address that is not a multiple of the provider's instruction
// alignment is rounded up; otherwise, if the provider yields an instruction there, its mnemonic
// is counted and the sweep advances by the instruction's size; otherwise it advances by one byte.
//
// The sweep stops when the map has no further executable address, or when the next address would
// not be strictly greater than the current one (unsigned wrap-around at the top of the address
// space, or an instruction reporting a size of zero). Without that check the loop could restart
// from a lower address or spin on the same address forever.
InsnHistogram
computeInsnHistogram(const InstructionProvider &insns, const MemoryMap::Ptr &map) {
    InsnHistogram histogram;
    rose_addr_t va = 0;
    while (map->atOrAfter(va).require(MemoryMap::EXECUTABLE).next().assignTo(va)) {
        rose_addr_t next = va + 1;                      // default: skip a single byte
        const rose_addr_t aligned = alignUp(va, insns.instructionAlignment());
        if (va != aligned) {
            next = aligned;
        } else if (SgAsmInstruction *insn = insns[va]) {
            ++histogram[insn->get_mnemonic()];
            next = va + insn->get_size();
        }

        if (next <= va)
            break;                                      // no forward progress is possible
        va = next;
    }
    return histogram;
}

// Serializes `histogram` into the file `fileName` using the XML serialization format. The object
// is tagged SerialIo::USER_DEFINED, which is the tag loadInsnHistogram() asks for when reading.
void
saveInsnHistogram(const InsnHistogram &histogram, const boost::filesystem::path &fileName) {
    SerialOutput::Ptr writer = SerialOutput::instance();
    writer->format(SerialIo::XML);
    writer->open(fileName);
    writer->saveObject(SerialIo::USER_DEFINED, histogram);
}

// Reads a histogram back from `fileName`, which is expected to be in the XML serialization format
// and to hold an object tagged SerialIo::USER_DEFINED, as written by saveInsnHistogram().
InsnHistogram
loadInsnHistogram(const boost::filesystem::path &fileName) {
    SerialInput::Ptr reader = SerialInput::instance();
    reader->format(SerialIo::XML);
    reader->open(fileName);
    InsnHistogram loaded = reader->loadObject<InsnHistogram>(SerialIo::USER_DEFINED);
    return loaded;
}

// Splits a histogram into `nParts` histograms by frequency rank.
//
// The records are sorted by descending count and dealt out in consecutive runs of
// ceil(size/nParts) records, so part 0 holds the most frequent mnemonics. The returned vector
// always has exactly `nParts` elements; trailing parts are empty when there are too few records.
// `nParts` must be positive.
std::vector<InsnHistogram>
splitInsnHistogram(const InsnHistogram &histogram, size_t nParts) {
    ASSERT_require(nParts > 0);

    typedef std::pair<std::string, size_t> Record;
    std::vector<Record> ranked(histogram.begin(), histogram.end());
    std::sort(ranked.begin(), ranked.end(),
              [](const Record &lhs, const Record &rhs) { return lhs.second > rhs.second; });

    const size_t perPart = (ranked.size() + nParts - 1) / nParts; // rounded up
    std::vector<InsnHistogram> result(nParts);
    size_t rank = 0;
    for (const Record &record: ranked) {
        result[rank / perPart].insert(record);
        ++rank;
    }
    return result;
}

// Adds every count in `other` to the count stored under the same mnemonic in `histogram`.
// Mnemonics that `histogram` does not have yet are created by the subscript operator.
void
mergeInsnHistogram(InsnHistogram &histogram, const InsnHistogram &other) {
    for (const auto &entry: other) {
        const auto &mnemonic = entry.first;
        histogram[mnemonic] += entry.second;
    }
}

double
compareInsnHistograms(const std::vector<InsnHistogram> &aParts, const InsnHistogram &b) {
std::vector<InsnHistogram> bParts = splitInsnHistogram(b, aParts.size());
size_t totalDiff = 0, maxDiff = 0;
for (size_t i = 0; i < aParts.size(); ++i) {
maxDiff += std::max(i - 0, (bParts.size()-1) - i) * aParts[i].size();
for (const auto &record: aParts[i]) {
const std::string &label = record.first;
size_t foundAt = bParts.size() - 1;
for (size_t j = 0; j < bParts.size(); ++j) {
if (bParts[j].find(label) != bParts[j].end()) {
foundAt = j;
break;
}
}
size_t diff = std::max(i, foundAt) - std::min(i, foundAt);
totalDiff += diff;
}
}
return 1.0 * totalDiff / maxDiff;
}

// Convenience overload: splits `a` into `nParts` parts with splitInsnHistogram() and forwards to
// the overload that takes the pre-split first histogram.
double
compareInsnHistograms(const InsnHistogram &a, const InsnHistogram &b, size_t nParts) {
    return compareInsnHistograms(splitInsnHistogram(a, nParts), b);
}

// Writes the histogram to `out` as a text table, most frequent mnemonic first.
//
// After a header line, each row holds the mnemonic, its count, its count as a percentage of all
// counted instructions, the running total of counts so far, and the running total as a percentage.
// The header goes to `out` like the rows do, so the whole table ends up in the same stream.
void
printInsnHistogram(const InsnHistogram &histogram, std::ostream &out) {
    size_t grandTotal = 0;
    for (const auto &pair: histogram)
        grandTotal += pair.second;

    std::vector<std::pair<std::string, size_t>> results(histogram.begin(), histogram.end());
    std::sort(results.begin(), results.end(),
              [](const auto &a, const auto &b) { return a.second > b.second; });

    out <<"Instruction N N% Total Total%\n";
    size_t runningTotal = 0;
    for (const auto &pair: results) {
        runningTotal += pair.second;
        out <<(boost::format("%-15s\t%7d\t%7.3f\t%7d\t%7.3f\n")
               %pair.first %pair.second %(100.0 * pair.second / grandTotal)
               %runningTotal %(100.0 * runningTotal / grandTotal));
    }
}

#endif

} // namespace
Loading

0 comments on commit 3a3423c

Please sign in to comment.