618 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			618 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| // This file is part of Eigen, a lightweight C++ template library
 | |
| // for linear algebra.
 | |
| //
 | |
| // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
 | |
| //
 | |
| // This Source Code Form is subject to the terms of the Mozilla
 | |
| // Public License v. 2.0. If a copy of the MPL was not distributed
 | |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 | |
| 
 | |
| #include <iostream>
 | |
| #include <cstdint>
 | |
| #include <cstdlib>
 | |
| #include <vector>
 | |
| #include <fstream>
 | |
| #include <memory>
 | |
| #include <cstdio>
 | |
| 
 | |
| bool eigen_use_specific_block_size;
 | |
| int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
 | |
| #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
 | |
| #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
 | |
| #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
 | |
| #define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
 | |
| #include <Eigen/Core>
 | |
| 
 | |
| #include <bench/BenchTimer.h>
 | |
| 
 | |
| using namespace Eigen;
 | |
| using namespace std;
 | |
| 
 | |
| static BenchTimer timer;
 | |
| 
 | |
| // how many times we repeat each measurement.
 | |
| // measurements are randomly shuffled - we're not doing
 | |
| // all N identical measurements in a row.
 | |
| const int measurement_repetitions = 3;
 | |
| 
 | |
| // Timings below this value are too short to be accurate,
 | |
| // we'll repeat measurements with more iterations until
 | |
| // we get a timing above that threshold.
 | |
| const float min_accurate_time = 1e-2f;
 | |
| 
 | |
| // See --min-working-set-size command line parameter.
 | |
| size_t min_working_set_size = 0;
 | |
| 
 | |
| float max_clock_speed = 0.0f;
 | |
| 
 | |
| // range of sizes that we will benchmark (in all 3 K,M,N dimensions)
 | |
| const size_t maxsize = 2048;
 | |
| const size_t minsize = 16;
 | |
| 
 | |
| typedef MatrixXf MatrixType;
 | |
| typedef MatrixType::Scalar Scalar;
 | |
| typedef internal::packet_traits<Scalar>::type Packet;
 | |
| 
 | |
| static_assert((maxsize & (maxsize - 1)) == 0, "maxsize must be a power of two");
 | |
| static_assert((minsize & (minsize - 1)) == 0, "minsize must be a power of two");
 | |
| static_assert(maxsize > minsize, "maxsize must be larger than minsize");
 | |
| static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");
 | |
| 
 | |
| // just a helper to store a triple of K,M,N sizes for matrix product
 | |
| struct size_triple_t {
 | |
|   size_t k, m, n;
 | |
|   size_triple_t() : k(0), m(0), n(0) {}
 | |
|   size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
 | |
|   size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
 | |
|   size_triple_t(uint16_t compact) {
 | |
|     k = 1 << ((compact & 0xf00) >> 8);
 | |
|     m = 1 << ((compact & 0x0f0) >> 4);
 | |
|     n = 1 << ((compact & 0x00f) >> 0);
 | |
|   }
 | |
| };
 | |
| 
 | |
| uint8_t log2_pot(size_t x) {
 | |
|   size_t l = 0;
 | |
|   while (x >>= 1) l++;
 | |
|   return l;
 | |
| }
 | |
| 
 | |
| // Convert between size tripes and a compact form fitting in 12 bits
 | |
| // where each size, which must be a POT, is encoded as its log2, on 4 bits
 | |
| // so the largest representable size is 2^15 == 32k  ... big enough.
 | |
| uint16_t compact_size_triple(size_t k, size_t m, size_t n) {
 | |
|   return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
 | |
| }
 | |
| 
 | |
| uint16_t compact_size_triple(const size_triple_t& t) { return compact_size_triple(t.k, t.m, t.n); }
 | |
| 
 | |
| // A single benchmark. Initially only contains benchmark params.
 | |
| // Then call run(), which stores the result in the gflops field.
 | |
| struct benchmark_t {
 | |
|   uint16_t compact_product_size;
 | |
|   uint16_t compact_block_size;
 | |
|   bool use_default_block_size;
 | |
|   float gflops;
 | |
|   benchmark_t() : compact_product_size(0), compact_block_size(0), use_default_block_size(false), gflops(0) {}
 | |
|   benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn)
 | |
|       : compact_product_size(compact_size_triple(pk, pm, pn)),
 | |
|         compact_block_size(compact_size_triple(bk, bm, bn)),
 | |
|         use_default_block_size(false),
 | |
|         gflops(0) {}
 | |
|   benchmark_t(size_t pk, size_t pm, size_t pn)
 | |
|       : compact_product_size(compact_size_triple(pk, pm, pn)),
 | |
|         compact_block_size(0),
 | |
|         use_default_block_size(true),
 | |
|         gflops(0) {}
 | |
| 
 | |
|   void run();
 | |
| };
 | |
| 
 | |
| ostream& operator<<(ostream& s, const benchmark_t& b) {
 | |
|   s << hex << b.compact_product_size << dec;
 | |
|   if (b.use_default_block_size) {
 | |
|     size_triple_t t(b.compact_product_size);
 | |
|     Index k = t.k, m = t.m, n = t.n;
 | |
|     internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
 | |
|     s << " default(" << k << ", " << m << ", " << n << ")";
 | |
|   } else {
 | |
|     s << " " << hex << b.compact_block_size << dec;
 | |
|   }
 | |
|   s << " " << b.gflops;
 | |
|   return s;
 | |
| }
 | |
| 
 | |
| // We sort first by increasing benchmark parameters,
 | |
| // then by decreasing performance.
 | |
| bool operator<(const benchmark_t& b1, const benchmark_t& b2) {
 | |
|   return b1.compact_product_size < b2.compact_product_size ||
 | |
|          (b1.compact_product_size == b2.compact_product_size &&
 | |
|           ((b1.compact_block_size < b2.compact_block_size ||
 | |
|             (b1.compact_block_size == b2.compact_block_size && b1.gflops > b2.gflops))));
 | |
| }
 | |
| 
 | |
| void benchmark_t::run() {
 | |
|   size_triple_t productsizes(compact_product_size);
 | |
| 
 | |
|   if (use_default_block_size) {
 | |
|     eigen_use_specific_block_size = false;
 | |
|   } else {
 | |
|     // feed eigen with our custom blocking params
 | |
|     eigen_use_specific_block_size = true;
 | |
|     size_triple_t blocksizes(compact_block_size);
 | |
|     eigen_block_size_k = blocksizes.k;
 | |
|     eigen_block_size_m = blocksizes.m;
 | |
|     eigen_block_size_n = blocksizes.n;
 | |
|   }
 | |
| 
 | |
|   // set up the matrix pool
 | |
| 
 | |
|   const size_t combined_three_matrices_sizes =
 | |
|       sizeof(Scalar) *
 | |
|       (productsizes.k * productsizes.m + productsizes.k * productsizes.n + productsizes.m * productsizes.n);
 | |
| 
 | |
|   // 64 M is large enough that nobody has a cache bigger than that,
 | |
|   // while still being small enough that everybody has this much RAM,
 | |
|   // so conveniently we don't need to special-case platforms here.
 | |
|   const size_t unlikely_large_cache_size = 64 << 20;
 | |
| 
 | |
|   const size_t working_set_size = min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
 | |
| 
 | |
|   const size_t matrix_pool_size = 1 + working_set_size / combined_three_matrices_sizes;
 | |
| 
 | |
|   MatrixType* lhs = new MatrixType[matrix_pool_size];
 | |
|   MatrixType* rhs = new MatrixType[matrix_pool_size];
 | |
|   MatrixType* dst = new MatrixType[matrix_pool_size];
 | |
| 
 | |
|   for (size_t i = 0; i < matrix_pool_size; i++) {
 | |
|     lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
 | |
|     rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
 | |
|     dst[i] = MatrixType::Zero(productsizes.m, productsizes.n);
 | |
|   }
 | |
| 
 | |
|   // main benchmark loop
 | |
| 
 | |
|   int iters_at_a_time = 1;
 | |
|   float time_per_iter = 0.0f;
 | |
|   size_t matrix_index = 0;
 | |
|   while (true) {
 | |
|     double starttime = timer.getCpuTime();
 | |
|     for (int i = 0; i < iters_at_a_time; i++) {
 | |
|       dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
 | |
|       matrix_index++;
 | |
|       if (matrix_index == matrix_pool_size) {
 | |
|         matrix_index = 0;
 | |
|       }
 | |
|     }
 | |
|     double endtime = timer.getCpuTime();
 | |
| 
 | |
|     const float timing = float(endtime - starttime);
 | |
| 
 | |
|     if (timing >= min_accurate_time) {
 | |
|       time_per_iter = timing / iters_at_a_time;
 | |
|       break;
 | |
|     }
 | |
| 
 | |
|     iters_at_a_time *= 2;
 | |
|   }
 | |
| 
 | |
|   delete[] lhs;
 | |
|   delete[] rhs;
 | |
|   delete[] dst;
 | |
| 
 | |
|   gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
 | |
| }
 | |
| 
 | |
| void print_cpuinfo() {
 | |
| #ifdef __linux__
 | |
|   cout << "contents of /proc/cpuinfo:" << endl;
 | |
|   string line;
 | |
|   ifstream cpuinfo("/proc/cpuinfo");
 | |
|   if (cpuinfo.is_open()) {
 | |
|     while (getline(cpuinfo, line)) {
 | |
|       cout << line << endl;
 | |
|     }
 | |
|     cpuinfo.close();
 | |
|   }
 | |
|   cout << endl;
 | |
| #elif defined __APPLE__
 | |
|   cout << "output of sysctl hw:" << endl;
 | |
|   system("sysctl hw");
 | |
|   cout << endl;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| template <typename T>
 | |
| string type_name() {
 | |
|   return "unknown";
 | |
| }
 | |
| 
 | |
| template <>
 | |
| string type_name<float>() {
 | |
|   return "float";
 | |
| }
 | |
| 
 | |
| template <>
 | |
| string type_name<double>() {
 | |
|   return "double";
 | |
| }
 | |
| 
 | |
| struct action_t {
 | |
|   virtual const char* invokation_name() const {
 | |
|     abort();
 | |
|     return nullptr;
 | |
|   }
 | |
|   virtual void run() const { abort(); }
 | |
|   virtual ~action_t() {}
 | |
| };
 | |
| 
 | |
| void show_usage_and_exit(int /*argc*/, char* argv[], const vector<unique_ptr<action_t>>& available_actions) {
 | |
|   cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
 | |
|   cerr << "available actions:" << endl << endl;
 | |
|   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
 | |
|     cerr << "  " << (*it)->invokation_name() << endl;
 | |
|   }
 | |
|   cerr << endl;
 | |
|   cerr << "options:" << endl << endl;
 | |
|   cerr << "  --min-working-set-size=N:" << endl;
 | |
|   cerr << "       Set the minimum working set size to N bytes." << endl;
 | |
|   cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
 | |
|   cerr << "       A larger working set lowers the chance of a warm cache." << endl;
 | |
|   cerr << "       The default value 0 means use a large enough working" << endl;
 | |
|   cerr << "       set to likely outsize caches." << endl;
 | |
|   cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
 | |
|   cerr << "       avoid warm caches." << endl;
 | |
|   exit(1);
 | |
| }
 | |
| 
 | |
| float measure_clock_speed() {
 | |
|   cerr << "Measuring clock speed...                              \r" << flush;
 | |
| 
 | |
|   vector<float> all_gflops;
 | |
|   for (int i = 0; i < 8; i++) {
 | |
|     benchmark_t b(1024, 1024, 1024);
 | |
|     b.run();
 | |
|     all_gflops.push_back(b.gflops);
 | |
|   }
 | |
| 
 | |
|   sort(all_gflops.begin(), all_gflops.end());
 | |
|   float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
 | |
| 
 | |
|   // multiply by an arbitrary constant to discourage trying doing anything with the
 | |
|   // returned values besides just comparing them with each other.
 | |
|   float result = stable_estimate * 123.456f;
 | |
| 
 | |
|   return result;
 | |
| }
 | |
| 
 | |
| struct human_duration_t {
 | |
|   int seconds;
 | |
|   human_duration_t(int s) : seconds(s) {}
 | |
| };
 | |
| 
 | |
| ostream& operator<<(ostream& s, const human_duration_t& d) {
 | |
|   int remainder = d.seconds;
 | |
|   if (remainder > 3600) {
 | |
|     int hours = remainder / 3600;
 | |
|     s << hours << " h ";
 | |
|     remainder -= hours * 3600;
 | |
|   }
 | |
|   if (remainder > 60) {
 | |
|     int minutes = remainder / 60;
 | |
|     s << minutes << " min ";
 | |
|     remainder -= minutes * 60;
 | |
|   }
 | |
|   if (d.seconds < 600) {
 | |
|     s << remainder << " s";
 | |
|   }
 | |
|   return s;
 | |
| }
 | |
| 
 | |
| const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";
 | |
| 
 | |
| void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run) {
 | |
|   FILE* file = fopen(filename, "w");
 | |
|   if (!file) {
 | |
|     cerr << "Could not open file " << filename << " for writing." << endl;
 | |
|     cerr << "Do you have write permissions on the current working directory?" << endl;
 | |
|     exit(1);
 | |
|   }
 | |
|   size_t benchmarks_vector_size = benchmarks.size();
 | |
|   fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
 | |
|   fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
 | |
|   fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
 | |
|   fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
 | |
|   fclose(file);
 | |
| }
 | |
| 
 | |
| bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run) {
 | |
|   FILE* file = fopen(filename, "r");
 | |
|   if (!file) {
 | |
|     return false;
 | |
|   }
 | |
|   if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
 | |
|     return false;
 | |
|   }
 | |
|   size_t benchmarks_vector_size = 0;
 | |
|   if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
 | |
|     return false;
 | |
|   }
 | |
|   if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
 | |
|     return false;
 | |
|   }
 | |
|   benchmarks.resize(benchmarks_vector_size);
 | |
|   if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
 | |
|     return false;
 | |
|   }
 | |
|   unlink(filename);
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| void try_run_some_benchmarks(vector<benchmark_t>& benchmarks, double time_start, size_t& first_benchmark_to_run) {
 | |
|   if (first_benchmark_to_run == benchmarks.size()) {
 | |
|     return;
 | |
|   }
 | |
| 
 | |
|   double time_last_progress_update = 0;
 | |
|   double time_last_clock_speed_measurement = 0;
 | |
|   double time_now = 0;
 | |
| 
 | |
|   size_t benchmark_index = first_benchmark_to_run;
 | |
| 
 | |
|   while (true) {
 | |
|     float ratio_done = float(benchmark_index) / benchmarks.size();
 | |
|     time_now = timer.getRealTime();
 | |
| 
 | |
|     // We check clock speed every minute and at the end.
 | |
|     if (benchmark_index == benchmarks.size() || time_now > time_last_clock_speed_measurement + 60.0f) {
 | |
|       time_last_clock_speed_measurement = time_now;
 | |
| 
 | |
|       // Ensure that clock speed is as expected
 | |
|       float current_clock_speed = measure_clock_speed();
 | |
| 
 | |
|       // The tolerance needs to be smaller than the relative difference between
 | |
|       // clock speeds that a device could operate under.
 | |
|       // It seems unlikely that a device would be throttling clock speeds by
 | |
|       // amounts smaller than 2%.
 | |
|       // With a value of 1%, I was getting within noise on a Sandy Bridge.
 | |
|       const float clock_speed_tolerance = 0.02f;
 | |
| 
 | |
|       if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
 | |
|         // Clock speed is now higher than we previously measured.
 | |
|         // Either our initial measurement was inaccurate, which won't happen
 | |
|         // too many times as we are keeping the best clock speed value and
 | |
|         // and allowing some tolerance; or something really weird happened,
 | |
|         // which invalidates all benchmark results collected so far.
 | |
|         // Either way, we better restart all over again now.
 | |
|         if (benchmark_index) {
 | |
|           cerr << "Restarting at " << 100.0f * ratio_done << " % because clock speed increased.          " << endl;
 | |
|         }
 | |
|         max_clock_speed = current_clock_speed;
 | |
|         first_benchmark_to_run = 0;
 | |
|         return;
 | |
|       }
 | |
| 
 | |
|       bool rerun_last_tests = false;
 | |
| 
 | |
|       if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
 | |
|         cerr << "Measurements completed so far: " << 100.0f * ratio_done << " %                             " << endl;
 | |
|         cerr << "Clock speed seems to be only " << current_clock_speed / max_clock_speed << " times what it used to be."
 | |
|              << endl;
 | |
| 
 | |
|         unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
 | |
| 
 | |
|         while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
 | |
|           if (seconds_to_sleep_if_lower_clock_speed > 32) {
 | |
|             cerr << "Sleeping longer probably won't make a difference." << endl;
 | |
|             cerr << "Serializing benchmarks to " << session_filename << endl;
 | |
|             serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
 | |
|             cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
 | |
|             exit(2);
 | |
|           }
 | |
|           rerun_last_tests = true;
 | |
|           cerr << "Sleeping " << seconds_to_sleep_if_lower_clock_speed << " s...                                   \r"
 | |
|                << endl;
 | |
|           sleep(seconds_to_sleep_if_lower_clock_speed);
 | |
|           current_clock_speed = measure_clock_speed();
 | |
|           seconds_to_sleep_if_lower_clock_speed *= 2;
 | |
|         }
 | |
|       }
 | |
| 
 | |
|       if (rerun_last_tests) {
 | |
|         cerr << "Redoing the last " << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
 | |
|              << " % because clock speed had been low.   " << endl;
 | |
|         return;
 | |
|       }
 | |
| 
 | |
|       // nothing wrong with the clock speed so far, so there won't be a need to rerun
 | |
|       // benchmarks run so far in case we later encounter a lower clock speed.
 | |
|       first_benchmark_to_run = benchmark_index;
 | |
|     }
 | |
| 
 | |
|     if (benchmark_index == benchmarks.size()) {
 | |
|       // We're done!
 | |
|       first_benchmark_to_run = benchmarks.size();
 | |
|       // Erase progress info
 | |
|       cerr << "                                                            " << endl;
 | |
|       return;
 | |
|     }
 | |
| 
 | |
|     // Display progress info on stderr
 | |
|     if (time_now > time_last_progress_update + 1.0f) {
 | |
|       time_last_progress_update = time_now;
 | |
|       cerr << "Measurements... " << 100.0f * ratio_done << " %, ETA "
 | |
|            << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
 | |
|            << "                          \r" << flush;
 | |
|     }
 | |
| 
 | |
|     // This is where we actually run a benchmark!
 | |
|     benchmarks[benchmark_index].run();
 | |
|     benchmark_index++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| void run_benchmarks(vector<benchmark_t>& benchmarks) {
 | |
|   size_t first_benchmark_to_run;
 | |
|   vector<benchmark_t> deserialized_benchmarks;
 | |
|   bool use_deserialized_benchmarks = false;
 | |
|   if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
 | |
|     cerr << "Found serialized session with " << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
 | |
|          << " % already done" << endl;
 | |
|     if (deserialized_benchmarks.size() == benchmarks.size() && first_benchmark_to_run > 0 &&
 | |
|         first_benchmark_to_run < benchmarks.size()) {
 | |
|       use_deserialized_benchmarks = true;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   if (use_deserialized_benchmarks) {
 | |
|     benchmarks = deserialized_benchmarks;
 | |
|   } else {
 | |
|     // not using deserialized benchmarks, starting from scratch
 | |
|     first_benchmark_to_run = 0;
 | |
| 
 | |
|     // Randomly shuffling benchmarks allows us to get accurate enough progress info,
 | |
|     // as now the cheap/expensive benchmarks are randomly mixed so they average out.
 | |
|     // It also means that if data is corrupted for some time span, the odds are that
 | |
|     // not all repetitions of a given benchmark will be corrupted.
 | |
|     random_shuffle(benchmarks.begin(), benchmarks.end());
 | |
|   }
 | |
| 
 | |
|   for (int i = 0; i < 4; i++) {
 | |
|     max_clock_speed = max(max_clock_speed, measure_clock_speed());
 | |
|   }
 | |
| 
 | |
|   double time_start = 0.0;
 | |
|   while (first_benchmark_to_run < benchmarks.size()) {
 | |
|     if (first_benchmark_to_run == 0) {
 | |
|       time_start = timer.getRealTime();
 | |
|     }
 | |
|     try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run);
 | |
|   }
 | |
| 
 | |
|   // Sort timings by increasing benchmark parameters, and decreasing gflops.
 | |
|   // The latter is very important. It means that we can ignore all but the first
 | |
|   // benchmark with given parameters.
 | |
|   sort(benchmarks.begin(), benchmarks.end());
 | |
| 
 | |
|   // Collect best (i.e. now first) results for each parameter values.
 | |
|   vector<benchmark_t> best_benchmarks;
 | |
|   for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
 | |
|     if (best_benchmarks.empty() || best_benchmarks.back().compact_product_size != it->compact_product_size ||
 | |
|         best_benchmarks.back().compact_block_size != it->compact_block_size) {
 | |
|       best_benchmarks.push_back(*it);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // keep and return only the best benchmarks
 | |
|   benchmarks = best_benchmarks;
 | |
| }
 | |
| 
 | |
| struct measure_all_pot_sizes_action_t : action_t {
 | |
|   virtual const char* invokation_name() const { return "all-pot-sizes"; }
 | |
|   virtual void run() const {
 | |
|     vector<benchmark_t> benchmarks;
 | |
|     for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
 | |
|       for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
 | |
|         for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
 | |
|           for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
 | |
|             for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
 | |
|               for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
 | |
|                 for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
 | |
|                   benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
 | |
|                 }
 | |
|               }
 | |
|             }
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     run_benchmarks(benchmarks);
 | |
| 
 | |
|     cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
 | |
|     for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
 | |
|       cout << *it << endl;
 | |
|     }
 | |
|   }
 | |
| };
 | |
| 
 | |
| struct measure_default_sizes_action_t : action_t {
 | |
|   virtual const char* invokation_name() const { return "default-sizes"; }
 | |
|   virtual void run() const {
 | |
|     vector<benchmark_t> benchmarks;
 | |
|     for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
 | |
|       for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
 | |
|         for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
 | |
|           for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
 | |
|             benchmarks.emplace_back(ksize, msize, nsize);
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     run_benchmarks(benchmarks);
 | |
| 
 | |
|     cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
 | |
|     for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
 | |
|       cout << *it << endl;
 | |
|     }
 | |
|   }
 | |
| };
 | |
| 
 | |
| int main(int argc, char* argv[]) {
 | |
|   double time_start = timer.getRealTime();
 | |
|   cout.precision(4);
 | |
|   cerr.precision(4);
 | |
| 
 | |
|   vector<unique_ptr<action_t>> available_actions;
 | |
|   available_actions.emplace_back(new measure_all_pot_sizes_action_t);
 | |
|   available_actions.emplace_back(new measure_default_sizes_action_t);
 | |
| 
 | |
|   auto action = available_actions.end();
 | |
| 
 | |
|   if (argc <= 1) {
 | |
|     show_usage_and_exit(argc, argv, available_actions);
 | |
|   }
 | |
|   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
 | |
|     if (!strcmp(argv[1], (*it)->invokation_name())) {
 | |
|       action = it;
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   if (action == available_actions.end()) {
 | |
|     show_usage_and_exit(argc, argv, available_actions);
 | |
|   }
 | |
| 
 | |
|   for (int i = 2; i < argc; i++) {
 | |
|     if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
 | |
|       const char* equals_sign = strchr(argv[i], '=');
 | |
|       min_working_set_size = strtoul(equals_sign + 1, nullptr, 10);
 | |
|     } else {
 | |
|       cerr << "unrecognized option: " << argv[i] << endl << endl;
 | |
|       show_usage_and_exit(argc, argv, available_actions);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   print_cpuinfo();
 | |
| 
 | |
|   cout << "benchmark parameters:" << endl;
 | |
|   cout << "pointer size: " << 8 * sizeof(void*) << " bits" << endl;
 | |
|   cout << "scalar type: " << type_name<Scalar>() << endl;
 | |
|   cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
 | |
|   cout << "minsize = " << minsize << endl;
 | |
|   cout << "maxsize = " << maxsize << endl;
 | |
|   cout << "measurement_repetitions = " << measurement_repetitions << endl;
 | |
|   cout << "min_accurate_time = " << min_accurate_time << endl;
 | |
|   cout << "min_working_set_size = " << min_working_set_size;
 | |
|   if (min_working_set_size == 0) {
 | |
|     cout << " (try to outsize caches)";
 | |
|   }
 | |
|   cout << endl << endl;
 | |
| 
 | |
|   (*action)->run();
 | |
| 
 | |
|   double time_end = timer.getRealTime();
 | |
|   cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
 | |
| }
 | 
