You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

360 lines
11 KiB

  1. #include <typeinfo>
  2. #include <iostream>
  3. #include <Eigen/Core>
  4. #include "BenchTimer.h"
  5. using namespace StormEigen;
  6. using namespace std;
  7. template<typename T>
  8. EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(T& v)
  9. {
  10. return v.norm();
  11. }
  12. template<typename T>
  13. EIGEN_DONT_INLINE typename T::Scalar stableNorm(T& v)
  14. {
  15. return v.stableNorm();
  16. }
  17. template<typename T>
  18. EIGEN_DONT_INLINE typename T::Scalar hypotNorm(T& v)
  19. {
  20. return v.hypotNorm();
  21. }
  22. template<typename T>
  23. EIGEN_DONT_INLINE typename T::Scalar blueNorm(T& v)
  24. {
  25. return v.blueNorm();
  26. }
  27. template<typename T>
  28. EIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v)
  29. {
  30. typedef typename T::Scalar Scalar;
  31. int n = v.size();
  32. Scalar scale = 0;
  33. Scalar ssq = 1;
  34. for (int i=0;i<n;++i)
  35. {
  36. Scalar ax = std::abs(v.coeff(i));
  37. if (scale >= ax)
  38. {
  39. ssq += numext::abs2(ax/scale);
  40. }
  41. else
  42. {
  43. ssq = Scalar(1) + ssq * numext::abs2(scale/ax);
  44. scale = ax;
  45. }
  46. }
  47. return scale * std::sqrt(ssq);
  48. }
  49. template<typename T>
  50. EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v)
  51. {
  52. typedef typename T::Scalar Scalar;
  53. Scalar s = v.array().abs().maxCoeff();
  54. return s*(v/s).norm();
  55. }
  56. template<typename T>
  57. EIGEN_DONT_INLINE typename T::Scalar bl2passNorm(T& v)
  58. {
  59. return v.stableNorm();
  60. }
  61. template<typename T>
  62. EIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v)
  63. {
  64. int n =v.size() / 2;
  65. for (int i=0;i<n;++i)
  66. v(i) = v(2*i)*v(2*i) + v(2*i+1)*v(2*i+1);
  67. n = n/2;
  68. while (n>0)
  69. {
  70. for (int i=0;i<n;++i)
  71. v(i) = v(2*i) + v(2*i+1);
  72. n = n/2;
  73. }
  74. return std::sqrt(v(0));
  75. }
  76. namespace StormEigen {
  77. namespace internal {
  78. #ifdef EIGEN_VECTORIZE
  79. Packet4f plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); }
  80. Packet2d plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); }
  81. Packet4f pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); }
  82. Packet2d pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); }
  83. #endif
  84. }
  85. }
  86. template<typename T>
  87. EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
  88. {
  89. #ifndef EIGEN_VECTORIZE
  90. return v.blueNorm();
  91. #else
  92. typedef typename T::Scalar Scalar;
  93. static int nmax = 0;
  94. static Scalar b1, b2, s1m, s2m, overfl, rbig, relerr;
  95. int n;
  96. if(nmax <= 0)
  97. {
  98. int nbig, ibeta, it, iemin, iemax, iexp;
  99. Scalar abig, eps;
  100. nbig = std::numeric_limits<int>::max(); // largest integer
  101. ibeta = std::numeric_limits<Scalar>::radix; //NumTraits<Scalar>::Base; // base for floating-point numbers
  102. it = std::numeric_limits<Scalar>::digits; //NumTraits<Scalar>::Mantissa; // number of base-beta digits in mantissa
  103. iemin = std::numeric_limits<Scalar>::min_exponent; // minimum exponent
  104. iemax = std::numeric_limits<Scalar>::max_exponent; // maximum exponent
  105. rbig = std::numeric_limits<Scalar>::max(); // largest floating-point number
  106. // Check the basic machine-dependent constants.
  107. if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
  108. || (it<=4 && ibeta <= 3 ) || it<2)
  109. {
  110. eigen_assert(false && "the algorithm cannot be guaranteed on this computer");
  111. }
  112. iexp = -((1-iemin)/2);
  113. b1 = std::pow(ibeta, iexp); // lower boundary of midrange
  114. iexp = (iemax + 1 - it)/2;
  115. b2 = std::pow(ibeta,iexp); // upper boundary of midrange
  116. iexp = (2-iemin)/2;
  117. s1m = std::pow(ibeta,iexp); // scaling factor for lower range
  118. iexp = - ((iemax+it)/2);
  119. s2m = std::pow(ibeta,iexp); // scaling factor for upper range
  120. overfl = rbig*s2m; // overfow boundary for abig
  121. eps = std::pow(ibeta, 1-it);
  122. relerr = std::sqrt(eps); // tolerance for neglecting asml
  123. abig = 1.0/eps - 1.0;
  124. if (Scalar(nbig)>abig) nmax = abig; // largest safe n
  125. else nmax = nbig;
  126. }
  127. typedef typename internal::packet_traits<Scalar>::type Packet;
  128. const int ps = internal::packet_traits<Scalar>::size;
  129. Packet pasml = internal::pset1<Packet>(Scalar(0));
  130. Packet pamed = internal::pset1<Packet>(Scalar(0));
  131. Packet pabig = internal::pset1<Packet>(Scalar(0));
  132. Packet ps2m = internal::pset1<Packet>(s2m);
  133. Packet ps1m = internal::pset1<Packet>(s1m);
  134. Packet pb2 = internal::pset1<Packet>(b2);
  135. Packet pb1 = internal::pset1<Packet>(b1);
  136. for(int j=0; j<v.size(); j+=ps)
  137. {
  138. Packet ax = internal::pabs(v.template packet<Aligned>(j));
  139. Packet ax_s2m = internal::pmul(ax,ps2m);
  140. Packet ax_s1m = internal::pmul(ax,ps1m);
  141. Packet maskBig = internal::plt(pb2,ax);
  142. Packet maskSml = internal::plt(ax,pb1);
  143. // Packet maskMed = internal::pand(maskSml,maskBig);
  144. // Packet scale = internal::pset1(Scalar(0));
  145. // scale = internal::por(scale, internal::pand(maskBig,ps2m));
  146. // scale = internal::por(scale, internal::pand(maskSml,ps1m));
  147. // scale = internal::por(scale, internal::pandnot(internal::pset1(Scalar(1)),maskMed));
  148. // ax = internal::pmul(ax,scale);
  149. // ax = internal::pmul(ax,ax);
  150. // pabig = internal::padd(pabig, internal::pand(maskBig, ax));
  151. // pasml = internal::padd(pasml, internal::pand(maskSml, ax));
  152. // pamed = internal::padd(pamed, internal::pandnot(ax,maskMed));
  153. pabig = internal::padd(pabig, internal::pand(maskBig, internal::pmul(ax_s2m,ax_s2m)));
  154. pasml = internal::padd(pasml, internal::pand(maskSml, internal::pmul(ax_s1m,ax_s1m)));
  155. pamed = internal::padd(pamed, internal::pandnot(internal::pmul(ax,ax),internal::pand(maskSml,maskBig)));
  156. }
  157. Scalar abig = internal::predux(pabig);
  158. Scalar asml = internal::predux(pasml);
  159. Scalar amed = internal::predux(pamed);
  160. if(abig > Scalar(0))
  161. {
  162. abig = std::sqrt(abig);
  163. if(abig > overfl)
  164. {
  165. eigen_assert(false && "overflow");
  166. return rbig;
  167. }
  168. if(amed > Scalar(0))
  169. {
  170. abig = abig/s2m;
  171. amed = std::sqrt(amed);
  172. }
  173. else
  174. {
  175. return abig/s2m;
  176. }
  177. }
  178. else if(asml > Scalar(0))
  179. {
  180. if (amed > Scalar(0))
  181. {
  182. abig = std::sqrt(amed);
  183. amed = std::sqrt(asml) / s1m;
  184. }
  185. else
  186. {
  187. return std::sqrt(asml)/s1m;
  188. }
  189. }
  190. else
  191. {
  192. return std::sqrt(amed);
  193. }
  194. asml = std::min(abig, amed);
  195. abig = std::max(abig, amed);
  196. if(asml <= abig*relerr)
  197. return abig;
  198. else
  199. return abig * std::sqrt(Scalar(1) + numext::abs2(asml/abig));
  200. #endif
  201. }
  202. #define BENCH_PERF(NRM) { \
  203. float af = 0; double ad = 0; std::complex<float> ac = 0; \
  204. StormEigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
  205. for (int k=0; k<tries; ++k) { \
  206. tf.start(); \
  207. for (int i=0; i<iters; ++i) { af += NRM(vf); } \
  208. tf.stop(); \
  209. } \
  210. for (int k=0; k<tries; ++k) { \
  211. td.start(); \
  212. for (int i=0; i<iters; ++i) { ad += NRM(vd); } \
  213. td.stop(); \
  214. } \
  215. /*for (int k=0; k<std::max(1,tries/3); ++k) { \
  216. tcf.start(); \
  217. for (int i=0; i<iters; ++i) { ac += NRM(vcf); } \
  218. tcf.stop(); \
  219. } */\
  220. std::cout << #NRM << "\t" << tf.value() << " " << td.value() << " " << tcf.value() << "\n"; \
  221. }
  222. void check_accuracy(double basef, double based, int s)
  223. {
  224. double yf = basef * std::abs(internal::random<double>());
  225. double yd = based * std::abs(internal::random<double>());
  226. VectorXf vf = VectorXf::Ones(s) * yf;
  227. VectorXd vd = VectorXd::Ones(s) * yd;
  228. std::cout << "reference\t" << std::sqrt(double(s))*yf << "\t" << std::sqrt(double(s))*yd << "\n";
  229. std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n";
  230. std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n";
  231. std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n";
  232. std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\n";
  233. std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\n";
  234. std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\n";
  235. std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\n";
  236. }
  237. void check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s)
  238. {
  239. VectorXf vf(s);
  240. VectorXd vd(s);
  241. for (int i=0; i<s; ++i)
  242. {
  243. vf[i] = std::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1));
  244. vd[i] = std::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1));
  245. }
  246. //std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
  247. std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\t" << sqsumNorm(vf.cast<long double>()) << "\t" << sqsumNorm(vd.cast<long double>()) << "\n";
  248. std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\t" << hypotNorm(vf.cast<long double>()) << "\t" << hypotNorm(vd.cast<long double>()) << "\n";
  249. std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
  250. std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
  251. std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\t" << lapackNorm(vf.cast<long double>()) << "\t" << lapackNorm(vd.cast<long double>()) << "\n";
  252. std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\t" << twopassNorm(vf.cast<long double>()) << "\t" << twopassNorm(vd.cast<long double>()) << "\n";
  253. // std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\t" << bl2passNorm(vf.cast<long double>()) << "\t" << bl2passNorm(vd.cast<long double>()) << "\n";
  254. }
  255. int main(int argc, char** argv)
  256. {
  257. int tries = 10;
  258. int iters = 100000;
  259. double y = 1.1345743233455785456788e12 * internal::random<double>();
  260. VectorXf v = VectorXf::Ones(1024) * y;
  261. // return 0;
  262. int s = 10000;
  263. double basef_ok = 1.1345743233455785456788e15;
  264. double based_ok = 1.1345743233455785456788e95;
  265. double basef_under = 1.1345743233455785456788e-27;
  266. double based_under = 1.1345743233455785456788e-303;
  267. double basef_over = 1.1345743233455785456788e+27;
  268. double based_over = 1.1345743233455785456788e+302;
  269. std::cout.precision(20);
  270. std::cerr << "\nNo under/overflow:\n";
  271. check_accuracy(basef_ok, based_ok, s);
  272. std::cerr << "\nUnderflow:\n";
  273. check_accuracy(basef_under, based_under, s);
  274. std::cerr << "\nOverflow:\n";
  275. check_accuracy(basef_over, based_over, s);
  276. std::cerr << "\nVarying (over):\n";
  277. for (int k=0; k<1; ++k)
  278. {
  279. check_accuracy_var(20,27,190,302,s);
  280. std::cout << "\n";
  281. }
  282. std::cerr << "\nVarying (under):\n";
  283. for (int k=0; k<1; ++k)
  284. {
  285. check_accuracy_var(-27,20,-302,-190,s);
  286. std::cout << "\n";
  287. }
  288. y = 1;
  289. std::cout.precision(4);
  290. int s1 = 1024*1024*32;
  291. std::cerr << "Performance (out of cache, " << s1 << "):\n";
  292. {
  293. int iters = 1;
  294. VectorXf vf = VectorXf::Random(s1) * y;
  295. VectorXd vd = VectorXd::Random(s1) * y;
  296. VectorXcf vcf = VectorXcf::Random(s1) * y;
  297. BENCH_PERF(sqsumNorm);
  298. BENCH_PERF(stableNorm);
  299. BENCH_PERF(blueNorm);
  300. BENCH_PERF(pblueNorm);
  301. BENCH_PERF(lapackNorm);
  302. BENCH_PERF(hypotNorm);
  303. BENCH_PERF(twopassNorm);
  304. BENCH_PERF(bl2passNorm);
  305. }
  306. std::cerr << "\nPerformance (in cache, " << 512 << "):\n";
  307. {
  308. int iters = 100000;
  309. VectorXf vf = VectorXf::Random(512) * y;
  310. VectorXd vd = VectorXd::Random(512) * y;
  311. VectorXcf vcf = VectorXcf::Random(512) * y;
  312. BENCH_PERF(sqsumNorm);
  313. BENCH_PERF(stableNorm);
  314. BENCH_PERF(blueNorm);
  315. BENCH_PERF(pblueNorm);
  316. BENCH_PERF(lapackNorm);
  317. BENCH_PERF(hypotNorm);
  318. BENCH_PERF(twopassNorm);
  319. BENCH_PERF(bl2passNorm);
  320. }
  321. }