135 lines
5.1 KiB

  1. #include <iostream>
  2. #include <Eigen/Core>
  3. #include <bench/BenchTimer.h>
  4. using namespace Eigen;
  // Build-time knobs; each may be overridden on the compiler command line
  // (e.g. -DSIZE=100 -DREPEAT=1000).

  // Base dimension; main() multiplies it by 8 to get the edge length.
  #if !defined(SIZE)
  #  define SIZE 50
  #endif

  // Number of times each kernel repeats its addition per timed call.
  #if !defined(REPEAT)
  #  define REPEAT 10000
  #endif

  // Element type used by the raw-pointer kernel and its buffers.
  typedef float Scalar;
  12. __attribute__ ((noinline)) void benchVec(Scalar* a, Scalar* b, Scalar* c, int size);
  13. __attribute__ ((noinline)) void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c);
  14. __attribute__ ((noinline)) void benchVec(VectorXf& a, VectorXf& b, VectorXf& c);
  15. int main(int argc, char* argv[])
  16. {
  17. int size = SIZE * 8;
  18. int size2 = size * size;
  19. Scalar* a = internal::aligned_new<Scalar>(size2);
  20. Scalar* b = internal::aligned_new<Scalar>(size2+4)+1;
  21. Scalar* c = internal::aligned_new<Scalar>(size2);
  22. for (int i=0; i<size; ++i)
  23. {
  24. a[i] = b[i] = c[i] = 0;
  25. }
  26. BenchTimer timer;
  27. timer.reset();
  28. for (int k=0; k<10; ++k)
  29. {
  30. timer.start();
  31. benchVec(a, b, c, size2);
  32. timer.stop();
  33. }
  34. std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
  35. return 0;
  36. for (int innersize = size; innersize>2 ; --innersize)
  37. {
  38. if (size2%innersize==0)
  39. {
  40. int outersize = size2/innersize;
  41. MatrixXf ma = Map<MatrixXf>(a, innersize, outersize );
  42. MatrixXf mb = Map<MatrixXf>(b, innersize, outersize );
  43. MatrixXf mc = Map<MatrixXf>(c, innersize, outersize );
  44. timer.reset();
  45. for (int k=0; k<3; ++k)
  46. {
  47. timer.start();
  48. benchVec(ma, mb, mc);
  49. timer.stop();
  50. }
  51. std::cout << innersize << " x " << outersize << " " << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
  52. }
  53. }
  54. VectorXf va = Map<VectorXf>(a, size2);
  55. VectorXf vb = Map<VectorXf>(b, size2);
  56. VectorXf vc = Map<VectorXf>(c, size2);
  57. timer.reset();
  58. for (int k=0; k<3; ++k)
  59. {
  60. timer.start();
  61. benchVec(va, vb, vc);
  62. timer.stop();
  63. }
  64. std::cout << timer.value() << "s " << (double(size2*REPEAT)/timer.value())/(1024.*1024.*1024.) << " GFlops\n";
  65. return 0;
  66. }
  67. void benchVec(MatrixXf& a, MatrixXf& b, MatrixXf& c)
  68. {
  69. for (int k=0; k<REPEAT; ++k)
  70. a = a + b;
  71. }
  72. void benchVec(VectorXf& a, VectorXf& b, VectorXf& c)
  73. {
  74. for (int k=0; k<REPEAT; ++k)
  75. a = a + b;
  76. }
  77. void benchVec(Scalar* a, Scalar* b, Scalar* c, int size)
  78. {
  79. typedef internal::packet_traits<Scalar>::type PacketScalar;
  80. const int PacketSize = internal::packet_traits<Scalar>::size;
  81. PacketScalar a0, a1, a2, a3, b0, b1, b2, b3;
  82. for (int k=0; k<REPEAT; ++k)
  83. for (int i=0; i<size; i+=PacketSize*8)
  84. {
  85. // a0 = internal::pload(&a[i]);
  86. // b0 = internal::pload(&b[i]);
  87. // a1 = internal::pload(&a[i+1*PacketSize]);
  88. // b1 = internal::pload(&b[i+1*PacketSize]);
  89. // a2 = internal::pload(&a[i+2*PacketSize]);
  90. // b2 = internal::pload(&b[i+2*PacketSize]);
  91. // a3 = internal::pload(&a[i+3*PacketSize]);
  92. // b3 = internal::pload(&b[i+3*PacketSize]);
  93. // internal::pstore(&a[i], internal::padd(a0, b0));
  94. // a0 = internal::pload(&a[i+4*PacketSize]);
  95. // b0 = internal::pload(&b[i+4*PacketSize]);
  96. //
  97. // internal::pstore(&a[i+1*PacketSize], internal::padd(a1, b1));
  98. // a1 = internal::pload(&a[i+5*PacketSize]);
  99. // b1 = internal::pload(&b[i+5*PacketSize]);
  100. //
  101. // internal::pstore(&a[i+2*PacketSize], internal::padd(a2, b2));
  102. // a2 = internal::pload(&a[i+6*PacketSize]);
  103. // b2 = internal::pload(&b[i+6*PacketSize]);
  104. //
  105. // internal::pstore(&a[i+3*PacketSize], internal::padd(a3, b3));
  106. // a3 = internal::pload(&a[i+7*PacketSize]);
  107. // b3 = internal::pload(&b[i+7*PacketSize]);
  108. //
  109. // internal::pstore(&a[i+4*PacketSize], internal::padd(a0, b0));
  110. // internal::pstore(&a[i+5*PacketSize], internal::padd(a1, b1));
  111. // internal::pstore(&a[i+6*PacketSize], internal::padd(a2, b2));
  112. // internal::pstore(&a[i+7*PacketSize], internal::padd(a3, b3));
  113. internal::pstore(&a[i+2*PacketSize], internal::padd(internal::ploadu(&a[i+2*PacketSize]), internal::ploadu(&b[i+2*PacketSize])));
  114. internal::pstore(&a[i+3*PacketSize], internal::padd(internal::ploadu(&a[i+3*PacketSize]), internal::ploadu(&b[i+3*PacketSize])));
  115. internal::pstore(&a[i+4*PacketSize], internal::padd(internal::ploadu(&a[i+4*PacketSize]), internal::ploadu(&b[i+4*PacketSize])));
  116. internal::pstore(&a[i+5*PacketSize], internal::padd(internal::ploadu(&a[i+5*PacketSize]), internal::ploadu(&b[i+5*PacketSize])));
  117. internal::pstore(&a[i+6*PacketSize], internal::padd(internal::ploadu(&a[i+6*PacketSize]), internal::ploadu(&b[i+6*PacketSize])));
  118. internal::pstore(&a[i+7*PacketSize], internal::padd(internal::ploadu(&a[i+7*PacketSize]), internal::ploadu(&b[i+7*PacketSize])));
  119. }
  120. }