// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <fstream>
#include <memory>
#include <cstdio>
#include <algorithm>
#include <string>
#include <cstring>
#include <unistd.h>
bool eigen_use_specific_block_size;
int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
#define STORMEIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
#define STORMEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
#define STORMEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
#define STORMEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
#include <StormEigen/Core>
#include <bench/BenchTimer.h>
using namespace StormEigen;
using namespace std;
static BenchTimer timer;
// how many times we repeat each measurement.
// measurements are randomly shuffled - we're not doing
// all N identical measurements in a row.
const int measurement_repetitions = 3;
// Timings below this value are too short to be accurate,
// we'll repeat measurements with more iterations until
// we get a timing above that threshold.
const float min_accurate_time = 1e-2f;
// See --min-working-set-size command line parameter.
size_t min_working_set_size = 0;
float max_clock_speed = 0.0f;
// range of sizes that we will benchmark (in all 3 K,M,N dimensions)
const size_t maxsize = 2048;
const size_t minsize = 16;

typedef MatrixXf MatrixType;
typedef MatrixType::Scalar Scalar;
typedef internal::packet_traits<Scalar>::type Packet;

static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
static_assert(maxsize > minsize, "maxsize must be larger than minsize");
static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
// just a helper to store a triple of K,M,N sizes for matrix product
struct size_triple_t
{
  size_t k, m, n;
  size_triple_t() : k(0), m(0), n(0) {}
  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
  size_triple_t(uint16_t compact)
  {
    k = 1 << ((compact & 0xf00) >> 8);
    m = 1 << ((compact & 0x0f0) >> 4);
    n = 1 << ((compact & 0x00f) >> 0);
  }
};
uint8_t log2_pot(size_t x) { size_t l = 0; while (x >>= 1) l++; return l; }
// Convert between size triples and a compact form fitting in 12 bits
// where each size, which must be a POT, is encoded as its log2, on 4 bits
// so the largest representable size is 2^15 == 32k ... big enough.
uint16_t compact_size_triple(size_t k, size_t m, size_t n) { return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n); }
uint16_t compact_size_triple(const size_triple_t& t) { return compact_size_triple(t.k, t.m, t.n); }
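// For illustration only (these particular values are not used below):
// compact_size_triple(256, 1024, 512) == (8 << 8) | (10 << 4) | 9 == 0x8a9,
// and size_triple_t(0x8a9) decodes back to k=256, m=1024, n=512.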
// A single benchmark. Initially only contains benchmark params.
// Then call run(), which stores the result in the gflops field.
struct benchmark_t
{
  uint16_t compact_product_size;
  uint16_t compact_block_size;
  bool use_default_block_size;
  float gflops;
  benchmark_t()
    : compact_product_size(0)
    , compact_block_size(0)
    , use_default_block_size(false)
    , gflops(0)
  {}
  benchmark_t(size_t pk, size_t pm, size_t pn,
              size_t bk, size_t bm, size_t bn)
    : compact_product_size(compact_size_triple(pk, pm, pn))
    , compact_block_size(compact_size_triple(bk, bm, bn))
    , use_default_block_size(false)
    , gflops(0)
  {}
  benchmark_t(size_t pk, size_t pm, size_t pn)
    : compact_product_size(compact_size_triple(pk, pm, pn))
    , compact_block_size(0)
    , use_default_block_size(true)
    , gflops(0)
  {}

  void run();
};

ostream& operator<<(ostream& s, const benchmark_t& b)
{
  s << hex << b.compact_product_size << dec;
  if (b.use_default_block_size) {
    size_triple_t t(b.compact_product_size);
    Index k = t.k, m = t.m, n = t.n;
    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
    s << " default(" << k << ", " << m << ", " << n << ")";
  } else {
    s << " " << hex << b.compact_block_size << dec;
  }
  s << " " << b.gflops;
  return s;
}
// We sort first by increasing benchmark parameters,
// then by decreasing performance.
bool operator<(const benchmark_t& b1, const benchmark_t& b2)
{
  return b1.compact_product_size < b2.compact_product_size ||
           (b1.compact_product_size == b2.compact_product_size && (
             (b1.compact_block_size < b2.compact_block_size || (
               b1.compact_block_size == b2.compact_block_size &&
                 b1.gflops > b2.gflops))));
}
void benchmark_t::run()
{
  size_triple_t productsizes(compact_product_size);

  if (use_default_block_size) {
    eigen_use_specific_block_size = false;
  } else {
    // feed eigen with our custom blocking params
    eigen_use_specific_block_size = true;
    size_triple_t blocksizes(compact_block_size);
    eigen_block_size_k = blocksizes.k;
    eigen_block_size_m = blocksizes.m;
    eigen_block_size_n = blocksizes.n;
  }
  // set up the matrix pool
  const size_t combined_three_matrices_sizes =
    sizeof(Scalar) *
      (productsizes.k * productsizes.m +
       productsizes.k * productsizes.n +
       productsizes.m * productsizes.n);

  // 64 M is large enough that nobody has a cache bigger than that,
  // while still being small enough that everybody has this much RAM,
  // so conveniently we don't need to special-case platforms here.
  const size_t unlikely_large_cache_size = 64 << 20;

  const size_t working_set_size =
    min_working_set_size ? min_working_set_size : unlikely_large_cache_size;

  const size_t matrix_pool_size =
    1 + working_set_size / combined_three_matrices_sizes;
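  // For illustration only: for a 1024x1024x1024 float product,
  // combined_three_matrices_sizes is 3 * 1024 * 1024 * 4 bytes = 12 MiB,
  // so with the default 64 MiB working set the pool holds
  // 1 + 67108864 / 12582912 = 6 copies of each matrix to cycle through.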
  MatrixType *lhs = new MatrixType[matrix_pool_size];
  MatrixType *rhs = new MatrixType[matrix_pool_size];
  MatrixType *dst = new MatrixType[matrix_pool_size];

  for (size_t i = 0; i < matrix_pool_size; i++) {
    lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
    rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
    dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
  }
  // main benchmark loop
  int iters_at_a_time = 1;
  float time_per_iter = 0.0f;
  size_t matrix_index = 0;
  while (true) {

    double starttime = timer.getCpuTime();
    for (int i = 0; i < iters_at_a_time; i++) {
      dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
      matrix_index++;
      if (matrix_index == matrix_pool_size) {
        matrix_index = 0;
      }
    }
    double endtime = timer.getCpuTime();

    const float timing = float(endtime - starttime);

    if (timing >= min_accurate_time) {
      time_per_iter = timing / iters_at_a_time;
      break;
    }

    iters_at_a_time *= 2;
  }

  delete[] lhs;
  delete[] rhs;
  delete[] dst;

  // each of the m*n result entries takes k multiply-adds, i.e. 2*k flops,
  // so one product is 2*k*m*n flops.
  gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
}
void print_cpuinfo()
{
#ifdef __linux__
  cout << "contents of /proc/cpuinfo:" << endl;
  string line;
  ifstream cpuinfo("/proc/cpuinfo");
  if (cpuinfo.is_open()) {
    while (getline(cpuinfo, line)) {
      cout << line << endl;
    }
    cpuinfo.close();
  }
  cout << endl;
#elif defined __APPLE__
  cout << "output of sysctl hw:" << endl;
  system("sysctl hw");
  cout << endl;
#endif
}
template <typename T> string type_name() { return "unknown"; }
template<> string type_name<float>() { return "float"; }
template<> string type_name<double>() { return "double"; }
struct action_t
{
  virtual const char* invokation_name() const { abort(); return nullptr; }
  virtual void run() const { abort(); }
  virtual ~action_t() {}
};
void show_usage_and_exit(int /*argc*/, char* argv[],
                         const vector<unique_ptr<action_t>>& available_actions)
{
  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
  cerr << "available actions:" << endl << endl;
  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    cerr << " " << (*it)->invokation_name() << endl;
  }
  cerr << endl;
  cerr << "options:" << endl << endl;
  cerr << " --min-working-set-size=N:" << endl;
  cerr << " Set the minimum working set size to N bytes." << endl;
  cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
  cerr << " A larger working set lowers the chance of a warm cache." << endl;
  cerr << " The default value 0 means use a large enough working" << endl;
  cerr << " set to likely outsize caches." << endl;
  cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
  cerr << " avoid warm caches." << endl;
  exit(1);
}

float measure_clock_speed()
{
  cerr << "Measuring clock speed... \r" << flush;

  vector<float> all_gflops;
  for (int i = 0; i < 8; i++) {
    benchmark_t b(1024, 1024, 1024);
    b.run();
    all_gflops.push_back(b.gflops);
  }

  sort(all_gflops.begin(), all_gflops.end());
  float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];

  // multiply by an arbitrary constant to discourage trying to do anything with the
  // returned values besides just comparing them with each other.
  float result = stable_estimate * 123.456f;

  return result;
}
struct human_duration_t
{
  int seconds;
  human_duration_t(int s) : seconds(s) {}
};

ostream& operator<<(ostream& s, const human_duration_t& d)
{
  int remainder = d.seconds;
  if (remainder > 3600) {
    int hours = remainder / 3600;
    s << hours << " h ";
    remainder -= hours * 3600;
  }
  if (remainder > 60) {
    int minutes = remainder / 60;
    s << minutes << " min ";
    remainder -= minutes * 60;
  }
  if (d.seconds < 600) {
    s << remainder << " s";
  }
  return s;
}
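// For illustration only: human_duration_t(3725) prints "1 h 2 min " -- seconds
// are only printed for durations shorter than 600 s, e.g. human_duration_t(45)
// prints "45 s".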
const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
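// The session file is a plain binary dump: max_clock_speed (float), then the
// number of benchmarks and the index of the first benchmark left to run (both
// size_t), then the raw array of benchmark_t. deserialize_benchmarks() reads it
// back in the same order and deletes the file on success.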
void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
{
  FILE* file = fopen(filename, "w");
  if (!file) {
    cerr << "Could not open file " << filename << " for writing." << endl;
    cerr << "Do you have write permissions on the current working directory?" << endl;
    exit(1);
  }
  size_t benchmarks_vector_size = benchmarks.size();
  fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
  fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
  fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
  fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
  fclose(file);
}
bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
{
  FILE* file = fopen(filename, "r");
  if (!file) {
    return false;
  }
  if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
    return false;
  }
  size_t benchmarks_vector_size = 0;
  if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
    return false;
  }
  if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
    return false;
  }
  benchmarks.resize(benchmarks_vector_size);
  if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
    return false;
  }
  unlink(filename);
  return true;
}
void try_run_some_benchmarks(
  vector<benchmark_t>& benchmarks,
  double time_start,
  size_t& first_benchmark_to_run)
{
  if (first_benchmark_to_run == benchmarks.size()) {
    return;
  }

  double time_last_progress_update = 0;
  double time_last_clock_speed_measurement = 0;
  double time_now = 0;

  size_t benchmark_index = first_benchmark_to_run;

  while (true) {
    float ratio_done = float(benchmark_index) / benchmarks.size();
    time_now = timer.getRealTime();
    // We check clock speed every minute and at the end.
    if (benchmark_index == benchmarks.size() ||
        time_now > time_last_clock_speed_measurement + 60.0f)
    {
      time_last_clock_speed_measurement = time_now;

      // Ensure that clock speed is as expected
      float current_clock_speed = measure_clock_speed();

      // The tolerance needs to be smaller than the relative difference between
      // clock speeds that a device could operate under.
      // It seems unlikely that a device would be throttling clock speeds by
      // amounts smaller than 2%.
      // With a value of 1%, I was getting within noise on a Sandy Bridge.
      const float clock_speed_tolerance = 0.02f;

      if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
        // Clock speed is now higher than we previously measured.
        // Either our initial measurement was inaccurate, which won't happen
        // too many times as we are keeping the best clock speed value and
        // allowing some tolerance; or something really weird happened,
        // which invalidates all benchmark results collected so far.
        // Either way, we'd better restart all over again now.
        if (benchmark_index) {
          cerr << "Restarting at " << 100.0f * ratio_done
               << " % because clock speed increased. " << endl;
        }
        max_clock_speed = current_clock_speed;
        first_benchmark_to_run = 0;
        return;
      }
      bool rerun_last_tests = false;

      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
        cerr << "Measurements completed so far: "
             << 100.0f * ratio_done << " % " << endl;
        cerr << "Clock speed seems to be only "
             << current_clock_speed/max_clock_speed
             << " times what it used to be." << endl;

        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;

        while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
          if (seconds_to_sleep_if_lower_clock_speed > 32) {
            cerr << "Sleeping longer probably won't make a difference." << endl;
            cerr << "Serializing benchmarks to " << session_filename << endl;
            serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
            cerr << "Now restart this benchmark, and it should pick up where it left off." << endl;
            exit(2);
          }
          rerun_last_tests = true;
          cerr << "Sleeping "
               << seconds_to_sleep_if_lower_clock_speed
               << " s... \r" << endl;
          sleep(seconds_to_sleep_if_lower_clock_speed);
          current_clock_speed = measure_clock_speed();
          seconds_to_sleep_if_lower_clock_speed *= 2;
        }
      }

      if (rerun_last_tests) {
        cerr << "Redoing the last "
             << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
             << " % because clock speed had been low. " << endl;
        return;
      }

      // nothing wrong with the clock speed so far, so there won't be a need to rerun
      // benchmarks run so far in case we later encounter a lower clock speed.
      first_benchmark_to_run = benchmark_index;
    }
    if (benchmark_index == benchmarks.size()) {
      // We're done!
      first_benchmark_to_run = benchmarks.size();
      // Erase progress info
      cerr << " " << endl;
      return;
    }

    // Display progress info on stderr
    if (time_now > time_last_progress_update + 1.0f) {
      time_last_progress_update = time_now;
      cerr << "Measurements... " << 100.0f * ratio_done
           << " %, ETA "
           << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
           << " \r" << flush;
    }

    // This is where we actually run a benchmark!
    benchmarks[benchmark_index].run();
    benchmark_index++;
  }
}
void run_benchmarks(vector<benchmark_t>& benchmarks)
{
  size_t first_benchmark_to_run;
  vector<benchmark_t> deserialized_benchmarks;
  bool use_deserialized_benchmarks = false;
  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
    cerr << "Found serialized session with "
         << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
         << " % already done" << endl;
    if (deserialized_benchmarks.size() == benchmarks.size() &&
        first_benchmark_to_run > 0 &&
        first_benchmark_to_run < benchmarks.size())
    {
      use_deserialized_benchmarks = true;
    }
  }

  if (use_deserialized_benchmarks) {
    benchmarks = deserialized_benchmarks;
  } else {
    // not using deserialized benchmarks, starting from scratch
    first_benchmark_to_run = 0;

    // Randomly shuffling benchmarks allows us to get accurate enough progress info,
    // as now the cheap/expensive benchmarks are randomly mixed so they average out.
    // It also means that if data is corrupted for some time span, the odds are that
    // not all repetitions of a given benchmark will be corrupted.
    random_shuffle(benchmarks.begin(), benchmarks.end());
  }

  for (int i = 0; i < 4; i++) {
    max_clock_speed = max(max_clock_speed, measure_clock_speed());
  }

  double time_start = 0.0;
  while (first_benchmark_to_run < benchmarks.size()) {
    if (first_benchmark_to_run == 0) {
      time_start = timer.getRealTime();
    }
    try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run);
  }
  // Sort timings by increasing benchmark parameters, and decreasing gflops.
  // The latter is very important. It means that we can ignore all but the first
  // benchmark with given parameters.
  sort(benchmarks.begin(), benchmarks.end());

  // Collect best (i.e. now first) results for each set of parameter values.
  vector<benchmark_t> best_benchmarks;
  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
    if (best_benchmarks.empty() ||
        best_benchmarks.back().compact_product_size != it->compact_product_size ||
        best_benchmarks.back().compact_block_size != it->compact_block_size)
    {
      best_benchmarks.push_back(*it);
    }
  }

  // keep and return only the best benchmarks
  benchmarks = best_benchmarks;
}
struct measure_all_pot_sizes_action_t : action_t
{
  virtual const char* invokation_name() const { return "all-pot-sizes"; }
  virtual void run() const
  {
    vector<benchmark_t> benchmarks;
    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
            for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
              for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
                for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
                  benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
                }
              }
            }
          }
        }
      }
    }

    run_benchmarks(benchmarks);

    cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
      cout << *it << endl;
    }
  }
};
struct measure_default_sizes_action_t : action_t
{
  virtual const char* invokation_name() const { return "default-sizes"; }
  virtual void run() const
  {
    vector<benchmark_t> benchmarks;
    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
        for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
          for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
            benchmarks.emplace_back(ksize, msize, nsize);
          }
        }
      }
    }

    run_benchmarks(benchmarks);

    cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
    for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
      cout << *it << endl;
    }
  }
};
int main(int argc, char* argv[])
{
  double time_start = timer.getRealTime();
  cout.precision(4);
  cerr.precision(4);

  vector<unique_ptr<action_t>> available_actions;
  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
  available_actions.emplace_back(new measure_default_sizes_action_t);

  auto action = available_actions.end();

  if (argc <= 1) {
    show_usage_and_exit(argc, argv, available_actions);
  }
  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
    if (!strcmp(argv[1], (*it)->invokation_name())) {
      action = it;
      break;
    }
  }

  if (action == available_actions.end()) {
    show_usage_and_exit(argc, argv, available_actions);
  }

  for (int i = 2; i < argc; i++) {
    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
      const char* equals_sign = strchr(argv[i], '=');
      min_working_set_size = strtoul(equals_sign + 1, nullptr, 10);
    } else {
      cerr << "unrecognized option: " << argv[i] << endl << endl;
      show_usage_and_exit(argc, argv, available_actions);
    }
  }

  print_cpuinfo();

  cout << "benchmark parameters:" << endl;
  cout << "pointer size: " << 8 * sizeof(void*) << " bits" << endl;
  cout << "scalar type: " << type_name<Scalar>() << endl;
  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
  cout << "minsize = " << minsize << endl;
  cout << "maxsize = " << maxsize << endl;
  cout << "measurement_repetitions = " << measurement_repetitions << endl;
  cout << "min_accurate_time = " << min_accurate_time << endl;
  cout << "min_working_set_size = " << min_working_set_size;
  if (min_working_set_size == 0) {
    cout << " (try to outsize caches)";
  }
  cout << endl << endl;

  (*action)->run();

  double time_end = timer.getRealTime();
  cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
}