You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

677 lines
22 KiB

  1. // This file is part of Eigen, a lightweight C++ template library
  2. // for linear algebra.
  3. //
  4. // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
  5. //
  6. // This Source Code Form is subject to the terms of the Mozilla
  7. // Public License v. 2.0. If a copy of the MPL was not distributed
  8. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
  16. bool eigen_use_specific_block_size;
  17. int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
  18. #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
  19. #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
  20. #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
  21. #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
  22. #include <Eigen/Core>
  23. #include <bench/BenchTimer.h>
  24. using namespace StormEigen;
  25. using namespace std;
  26. static BenchTimer timer;
  27. // how many times we repeat each measurement.
  28. // measurements are randomly shuffled - we're not doing
  29. // all N identical measurements in a row.
  30. const int measurement_repetitions = 3;
  31. // Timings below this value are too short to be accurate,
  32. // we'll repeat measurements with more iterations until
  33. // we get a timing above that threshold.
  34. const float min_accurate_time = 1e-2f;
  35. // See --min-working-set-size command line parameter.
  36. size_t min_working_set_size = 0;
  37. float max_clock_speed = 0.0f;
  38. // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
  39. const size_t maxsize = 2048;
  40. const size_t minsize = 16;
  41. typedef MatrixXf MatrixType;
  42. typedef MatrixType::Scalar Scalar;
  43. typedef internal::packet_traits<Scalar>::type Packet;
  44. static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
  45. static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
  46. static_assert(maxsize > minsize, "maxsize must be larger than minsize");
  47. static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
  48. // just a helper to store a triple of K,M,N sizes for matrix product
  49. struct size_triple_t
  50. {
  51. size_t k, m, n;
  52. size_triple_t() : k(0), m(0), n(0) {}
  53. size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
  54. size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
  55. size_triple_t(uint16_t compact)
  56. {
  57. k = 1 << ((compact & 0xf00) >> 8);
  58. m = 1 << ((compact & 0x0f0) >> 4);
  59. n = 1 << ((compact & 0x00f) >> 0);
  60. }
  61. };
  62. uint8_t log2_pot(size_t x) {
  63. size_t l = 0;
  64. while (x >>= 1) l++;
  65. return l;
  66. }
  67. // Convert between size tripes and a compact form fitting in 12 bits
  68. // where each size, which must be a POT, is encoded as its log2, on 4 bits
  69. // so the largest representable size is 2^15 == 32k ... big enough.
  70. uint16_t compact_size_triple(size_t k, size_t m, size_t n)
  71. {
  72. return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
  73. }
  74. uint16_t compact_size_triple(const size_triple_t& t)
  75. {
  76. return compact_size_triple(t.k, t.m, t.n);
  77. }
  78. // A single benchmark. Initially only contains benchmark params.
  79. // Then call run(), which stores the result in the gflops field.
  80. struct benchmark_t
  81. {
  82. uint16_t compact_product_size;
  83. uint16_t compact_block_size;
  84. bool use_default_block_size;
  85. float gflops;
  86. benchmark_t()
  87. : compact_product_size(0)
  88. , compact_block_size(0)
  89. , use_default_block_size(false)
  90. , gflops(0)
  91. {
  92. }
  93. benchmark_t(size_t pk, size_t pm, size_t pn,
  94. size_t bk, size_t bm, size_t bn)
  95. : compact_product_size(compact_size_triple(pk, pm, pn))
  96. , compact_block_size(compact_size_triple(bk, bm, bn))
  97. , use_default_block_size(false)
  98. , gflops(0)
  99. {}
  100. benchmark_t(size_t pk, size_t pm, size_t pn)
  101. : compact_product_size(compact_size_triple(pk, pm, pn))
  102. , compact_block_size(0)
  103. , use_default_block_size(true)
  104. , gflops(0)
  105. {}
  106. void run();
  107. };
  108. ostream& operator<<(ostream& s, const benchmark_t& b)
  109. {
  110. s << hex << b.compact_product_size << dec;
  111. if (b.use_default_block_size) {
  112. size_triple_t t(b.compact_product_size);
  113. Index k = t.k, m = t.m, n = t.n;
  114. internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
  115. s << " default(" << k << ", " << m << ", " << n << ")";
  116. } else {
  117. s << " " << hex << b.compact_block_size << dec;
  118. }
  119. s << " " << b.gflops;
  120. return s;
  121. }
  122. // We sort first by increasing benchmark parameters,
  123. // then by decreasing performance.
  124. bool operator<(const benchmark_t& b1, const benchmark_t& b2)
  125. {
  126. return b1.compact_product_size < b2.compact_product_size ||
  127. (b1.compact_product_size == b2.compact_product_size && (
  128. (b1.compact_block_size < b2.compact_block_size || (
  129. b1.compact_block_size == b2.compact_block_size &&
  130. b1.gflops > b2.gflops))));
  131. }
  132. void benchmark_t::run()
  133. {
  134. size_triple_t productsizes(compact_product_size);
  135. if (use_default_block_size) {
  136. eigen_use_specific_block_size = false;
  137. } else {
  138. // feed eigen with our custom blocking params
  139. eigen_use_specific_block_size = true;
  140. size_triple_t blocksizes(compact_block_size);
  141. eigen_block_size_k = blocksizes.k;
  142. eigen_block_size_m = blocksizes.m;
  143. eigen_block_size_n = blocksizes.n;
  144. }
  145. // set up the matrix pool
  146. const size_t combined_three_matrices_sizes =
  147. sizeof(Scalar) *
  148. (productsizes.k * productsizes.m +
  149. productsizes.k * productsizes.n +
  150. productsizes.m * productsizes.n);
  151. // 64 M is large enough that nobody has a cache bigger than that,
  152. // while still being small enough that everybody has this much RAM,
  153. // so conveniently we don't need to special-case platforms here.
  154. const size_t unlikely_large_cache_size = 64 << 20;
  155. const size_t working_set_size =
  156. min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
  157. const size_t matrix_pool_size =
  158. 1 + working_set_size / combined_three_matrices_sizes;
  159. MatrixType *lhs = new MatrixType[matrix_pool_size];
  160. MatrixType *rhs = new MatrixType[matrix_pool_size];
  161. MatrixType *dst = new MatrixType[matrix_pool_size];
  162. for (size_t i = 0; i < matrix_pool_size; i++) {
  163. lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
  164. rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
  165. dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
  166. }
  167. // main benchmark loop
  168. int iters_at_a_time = 1;
  169. float time_per_iter = 0.0f;
  170. size_t matrix_index = 0;
  171. while (true) {
  172. double starttime = timer.getCpuTime();
  173. for (int i = 0; i < iters_at_a_time; i++) {
  174. dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
  175. matrix_index++;
  176. if (matrix_index == matrix_pool_size) {
  177. matrix_index = 0;
  178. }
  179. }
  180. double endtime = timer.getCpuTime();
  181. const float timing = float(endtime - starttime);
  182. if (timing >= min_accurate_time) {
  183. time_per_iter = timing / iters_at_a_time;
  184. break;
  185. }
  186. iters_at_a_time *= 2;
  187. }
  188. delete[] lhs;
  189. delete[] rhs;
  190. delete[] dst;
  191. gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
  192. }
  193. void print_cpuinfo()
  194. {
  195. #ifdef __linux__
  196. cout << "contents of /proc/cpuinfo:" << endl;
  197. string line;
  198. ifstream cpuinfo("/proc/cpuinfo");
  199. if (cpuinfo.is_open()) {
  200. while (getline(cpuinfo, line)) {
  201. cout << line << endl;
  202. }
  203. cpuinfo.close();
  204. }
  205. cout << endl;
  206. #elif defined __APPLE__
  207. cout << "output of sysctl hw:" << endl;
  208. system("sysctl hw");
  209. cout << endl;
  210. #endif
  211. }
  212. template <typename T>
  213. string type_name()
  214. {
  215. return "unknown";
  216. }
  217. template<>
  218. string type_name<float>()
  219. {
  220. return "float";
  221. }
  222. template<>
  223. string type_name<double>()
  224. {
  225. return "double";
  226. }
  227. struct action_t
  228. {
  229. virtual const char* invokation_name() const { abort(); return nullptr; }
  230. virtual void run() const { abort(); }
  231. virtual ~action_t() {}
  232. };
  233. void show_usage_and_exit(int /*argc*/, char* argv[],
  234. const vector<unique_ptr<action_t>>& available_actions)
  235. {
  236. cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
  237. cerr << "available actions:" << endl << endl;
  238. for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
  239. cerr << " " << (*it)->invokation_name() << endl;
  240. }
  241. cerr << endl;
  242. cerr << "options:" << endl << endl;
  243. cerr << " --min-working-set-size=N:" << endl;
  244. cerr << " Set the minimum working set size to N bytes." << endl;
  245. cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
  246. cerr << " A larger working set lowers the chance of a warm cache." << endl;
  247. cerr << " The default value 0 means use a large enough working" << endl;
  248. cerr << " set to likely outsize caches." << endl;
  249. cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
  250. cerr << " avoid warm caches." << endl;
  251. exit(1);
  252. }
  253. float measure_clock_speed()
  254. {
  255. cerr << "Measuring clock speed... \r" << flush;
  256. vector<float> all_gflops;
  257. for (int i = 0; i < 8; i++) {
  258. benchmark_t b(1024, 1024, 1024);
  259. b.run();
  260. all_gflops.push_back(b.gflops);
  261. }
  262. sort(all_gflops.begin(), all_gflops.end());
  263. float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
  264. // multiply by an arbitrary constant to discourage trying doing anything with the
  265. // returned values besides just comparing them with each other.
  266. float result = stable_estimate * 123.456f;
  267. return result;
  268. }
  269. struct human_duration_t
  270. {
  271. int seconds;
  272. human_duration_t(int s) : seconds(s) {}
  273. };
  274. ostream& operator<<(ostream& s, const human_duration_t& d)
  275. {
  276. int remainder = d.seconds;
  277. if (remainder > 3600) {
  278. int hours = remainder / 3600;
  279. s << hours << " h ";
  280. remainder -= hours * 3600;
  281. }
  282. if (remainder > 60) {
  283. int minutes = remainder / 60;
  284. s << minutes << " min ";
  285. remainder -= minutes * 60;
  286. }
  287. if (d.seconds < 600) {
  288. s << remainder << " s";
  289. }
  290. return s;
  291. }
  292. const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
  293. void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
  294. {
  295. FILE* file = fopen(filename, "w");
  296. if (!file) {
  297. cerr << "Could not open file " << filename << " for writing." << endl;
  298. cerr << "Do you have write permissions on the current working directory?" << endl;
  299. exit(1);
  300. }
  301. size_t benchmarks_vector_size = benchmarks.size();
  302. fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
  303. fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
  304. fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
  305. fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
  306. fclose(file);
  307. }
  308. bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
  309. {
  310. FILE* file = fopen(filename, "r");
  311. if (!file) {
  312. return false;
  313. }
  314. if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
  315. return false;
  316. }
  317. size_t benchmarks_vector_size = 0;
  318. if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
  319. return false;
  320. }
  321. if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
  322. return false;
  323. }
  324. benchmarks.resize(benchmarks_vector_size);
  325. if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
  326. return false;
  327. }
  328. unlink(filename);
  329. return true;
  330. }
  331. void try_run_some_benchmarks(
  332. vector<benchmark_t>& benchmarks,
  333. double time_start,
  334. size_t& first_benchmark_to_run)
  335. {
  336. if (first_benchmark_to_run == benchmarks.size()) {
  337. return;
  338. }
  339. double time_last_progress_update = 0;
  340. double time_last_clock_speed_measurement = 0;
  341. double time_now = 0;
  342. size_t benchmark_index = first_benchmark_to_run;
  343. while (true) {
  344. float ratio_done = float(benchmark_index) / benchmarks.size();
  345. time_now = timer.getRealTime();
  346. // We check clock speed every minute and at the end.
  347. if (benchmark_index == benchmarks.size() ||
  348. time_now > time_last_clock_speed_measurement + 60.0f)
  349. {
  350. time_last_clock_speed_measurement = time_now;
  351. // Ensure that clock speed is as expected
  352. float current_clock_speed = measure_clock_speed();
  353. // The tolerance needs to be smaller than the relative difference between
  354. // clock speeds that a device could operate under.
  355. // It seems unlikely that a device would be throttling clock speeds by
  356. // amounts smaller than 2%.
  357. // With a value of 1%, I was getting within noise on a Sandy Bridge.
  358. const float clock_speed_tolerance = 0.02f;
  359. if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
  360. // Clock speed is now higher than we previously measured.
  361. // Either our initial measurement was inaccurate, which won't happen
  362. // too many times as we are keeping the best clock speed value and
  363. // and allowing some tolerance; or something really weird happened,
  364. // which invalidates all benchmark results collected so far.
  365. // Either way, we better restart all over again now.
  366. if (benchmark_index) {
  367. cerr << "Restarting at " << 100.0f * ratio_done
  368. << " % because clock speed increased. " << endl;
  369. }
  370. max_clock_speed = current_clock_speed;
  371. first_benchmark_to_run = 0;
  372. return;
  373. }
  374. bool rerun_last_tests = false;
  375. if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
  376. cerr << "Measurements completed so far: "
  377. << 100.0f * ratio_done
  378. << " % " << endl;
  379. cerr << "Clock speed seems to be only "
  380. << current_clock_speed/max_clock_speed
  381. << " times what it used to be." << endl;
  382. unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
  383. while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
  384. if (seconds_to_sleep_if_lower_clock_speed > 32) {
  385. cerr << "Sleeping longer probably won't make a difference." << endl;
  386. cerr << "Serializing benchmarks to " << session_filename << endl;
  387. serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
  388. cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
  389. exit(2);
  390. }
  391. rerun_last_tests = true;
  392. cerr << "Sleeping "
  393. << seconds_to_sleep_if_lower_clock_speed
  394. << " s... \r" << endl;
  395. sleep(seconds_to_sleep_if_lower_clock_speed);
  396. current_clock_speed = measure_clock_speed();
  397. seconds_to_sleep_if_lower_clock_speed *= 2;
  398. }
  399. }
  400. if (rerun_last_tests) {
  401. cerr << "Redoing the last "
  402. << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
  403. << " % because clock speed had been low. " << endl;
  404. return;
  405. }
  406. // nothing wrong with the clock speed so far, so there won't be a need to rerun
  407. // benchmarks run so far in case we later encounter a lower clock speed.
  408. first_benchmark_to_run = benchmark_index;
  409. }
  410. if (benchmark_index == benchmarks.size()) {
  411. // We're done!
  412. first_benchmark_to_run = benchmarks.size();
  413. // Erase progress info
  414. cerr << " " << endl;
  415. return;
  416. }
  417. // Display progress info on stderr
  418. if (time_now > time_last_progress_update + 1.0f) {
  419. time_last_progress_update = time_now;
  420. cerr << "Measurements... " << 100.0f * ratio_done
  421. << " %, ETA "
  422. << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
  423. << " \r" << flush;
  424. }
  425. // This is where we actually run a benchmark!
  426. benchmarks[benchmark_index].run();
  427. benchmark_index++;
  428. }
  429. }
  430. void run_benchmarks(vector<benchmark_t>& benchmarks)
  431. {
  432. size_t first_benchmark_to_run;
  433. vector<benchmark_t> deserialized_benchmarks;
  434. bool use_deserialized_benchmarks = false;
  435. if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
  436. cerr << "Found serialized session with "
  437. << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
  438. << " % already done" << endl;
  439. if (deserialized_benchmarks.size() == benchmarks.size() &&
  440. first_benchmark_to_run > 0 &&
  441. first_benchmark_to_run < benchmarks.size())
  442. {
  443. use_deserialized_benchmarks = true;
  444. }
  445. }
  446. if (use_deserialized_benchmarks) {
  447. benchmarks = deserialized_benchmarks;
  448. } else {
  449. // not using deserialized benchmarks, starting from scratch
  450. first_benchmark_to_run = 0;
  451. // Randomly shuffling benchmarks allows us to get accurate enough progress info,
  452. // as now the cheap/expensive benchmarks are randomly mixed so they average out.
  453. // It also means that if data is corrupted for some time span, the odds are that
  454. // not all repetitions of a given benchmark will be corrupted.
  455. random_shuffle(benchmarks.begin(), benchmarks.end());
  456. }
  457. for (int i = 0; i < 4; i++) {
  458. max_clock_speed = max(max_clock_speed, measure_clock_speed());
  459. }
  460. double time_start = 0.0;
  461. while (first_benchmark_to_run < benchmarks.size()) {
  462. if (first_benchmark_to_run == 0) {
  463. time_start = timer.getRealTime();
  464. }
  465. try_run_some_benchmarks(benchmarks,
  466. time_start,
  467. first_benchmark_to_run);
  468. }
  469. // Sort timings by increasing benchmark parameters, and decreasing gflops.
  470. // The latter is very important. It means that we can ignore all but the first
  471. // benchmark with given parameters.
  472. sort(benchmarks.begin(), benchmarks.end());
  473. // Collect best (i.e. now first) results for each parameter values.
  474. vector<benchmark_t> best_benchmarks;
  475. for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
  476. if (best_benchmarks.empty() ||
  477. best_benchmarks.back().compact_product_size != it->compact_product_size ||
  478. best_benchmarks.back().compact_block_size != it->compact_block_size)
  479. {
  480. best_benchmarks.push_back(*it);
  481. }
  482. }
  483. // keep and return only the best benchmarks
  484. benchmarks = best_benchmarks;
  485. }
  486. struct measure_all_pot_sizes_action_t : action_t
  487. {
  488. virtual const char* invokation_name() const { return "all-pot-sizes"; }
  489. virtual void run() const
  490. {
  491. vector<benchmark_t> benchmarks;
  492. for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
  493. for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
  494. for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
  495. for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
  496. for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
  497. for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
  498. for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
  499. benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
  500. }
  501. }
  502. }
  503. }
  504. }
  505. }
  506. }
  507. run_benchmarks(benchmarks);
  508. cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
  509. for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
  510. cout << *it << endl;
  511. }
  512. }
  513. };
  514. struct measure_default_sizes_action_t : action_t
  515. {
  516. virtual const char* invokation_name() const { return "default-sizes"; }
  517. virtual void run() const
  518. {
  519. vector<benchmark_t> benchmarks;
  520. for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
  521. for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
  522. for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
  523. for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
  524. benchmarks.emplace_back(ksize, msize, nsize);
  525. }
  526. }
  527. }
  528. }
  529. run_benchmarks(benchmarks);
  530. cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
  531. for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
  532. cout << *it << endl;
  533. }
  534. }
  535. };
  536. int main(int argc, char* argv[])
  537. {
  538. double time_start = timer.getRealTime();
  539. cout.precision(4);
  540. cerr.precision(4);
  541. vector<unique_ptr<action_t>> available_actions;
  542. available_actions.emplace_back(new measure_all_pot_sizes_action_t);
  543. available_actions.emplace_back(new measure_default_sizes_action_t);
  544. auto action = available_actions.end();
  545. if (argc <= 1) {
  546. show_usage_and_exit(argc, argv, available_actions);
  547. }
  548. for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
  549. if (!strcmp(argv[1], (*it)->invokation_name())) {
  550. action = it;
  551. break;
  552. }
  553. }
  554. if (action == available_actions.end()) {
  555. show_usage_and_exit(argc, argv, available_actions);
  556. }
  557. for (int i = 2; i < argc; i++) {
  558. if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
  559. const char* equals_sign = strchr(argv[i], '=');
  560. min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
  561. } else {
  562. cerr << "unrecognized option: " << argv[i] << endl << endl;
  563. show_usage_and_exit(argc, argv, available_actions);
  564. }
  565. }
  566. print_cpuinfo();
  567. cout << "benchmark parameters:" << endl;
  568. cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
  569. cout << "scalar type: " << type_name<Scalar>() << endl;
  570. cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
  571. cout << "minsize = " << minsize << endl;
  572. cout << "maxsize = " << maxsize << endl;
  573. cout << "measurement_repetitions = " << measurement_repetitions << endl;
  574. cout << "min_accurate_time = " << min_accurate_time << endl;
  575. cout << "min_working_set_size = " << min_working_set_size;
  576. if (min_working_set_size == 0) {
  577. cout << " (try to outsize caches)";
  578. }
  579. cout << endl << endl;
  580. (*action)->run();
  581. double time_end = timer.getRealTime();
  582. cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
  583. }