You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

345 lines
11 KiB

  1. #include <typeinfo>
  2. #include <iostream>
  3. #include <Eigen/Core>
  4. #include "BenchTimer.h"
  5. using namespace Eigen;
  6. using namespace std;
  7. template<typename T>
  8. EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v)
  9. {
  10. return v.norm();
  11. }
  12. template<typename T>
  13. EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v)
  14. {
  15. return v.hypotNorm();
  16. }
  17. template<typename T>
  18. EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v)
  19. {
  20. return v.blueNorm();
  21. }
  22. template<typename T>
  23. EIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v)
  24. {
  25. typedef typename T::Scalar Scalar;
  26. int n = v.size();
  27. Scalar scale = 0;
  28. Scalar ssq = 1;
  29. for (int i=0;i<n;++i)
  30. {
  31. Scalar ax = internal::abs(v.coeff(i));
  32. if (scale >= ax)
  33. {
  34. ssq += internal::abs2(ax/scale);
  35. }
  36. else
  37. {
  38. ssq = Scalar(1) + ssq * internal::abs2(scale/ax);
  39. scale = ax;
  40. }
  41. }
  42. return scale * internal::sqrt(ssq);
  43. }
  44. template<typename T>
  45. EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v)
  46. {
  47. typedef typename T::Scalar Scalar;
  48. Scalar s = v.cwise().abs().maxCoeff();
  49. return s*(v/s).norm();
  50. }
  51. template<typename T>
  52. EIGEN_DONT_INLINE typename T::Scalar bl2passNorm(T& v)
  53. {
  54. return v.stableNorm();
  55. }
  56. template<typename T>
  57. EIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v)
  58. {
  59. int n =v.size() / 2;
  60. for (int i=0;i<n;++i)
  61. v(i) = v(2*i)*v(2*i) + v(2*i+1)*v(2*i+1);
  62. n = n/2;
  63. while (n>0)
  64. {
  65. for (int i=0;i<n;++i)
  66. v(i) = v(2*i) + v(2*i+1);
  67. n = n/2;
  68. }
  69. return internal::sqrt(v(0));
  70. }
  71. #ifdef EIGEN_VECTORIZE
  72. Packet4f internal::plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); }
  73. Packet2d internal::plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); }
  74. Packet4f internal::pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); }
  75. Packet2d internal::pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); }
  76. #endif
  77. template<typename T>
  78. EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v)
  79. {
  80. #ifndef EIGEN_VECTORIZE
  81. return v.blueNorm();
  82. #else
  83. typedef typename T::Scalar Scalar;
  84. static int nmax = 0;
  85. static Scalar b1, b2, s1m, s2m, overfl, rbig, relerr;
  86. int n;
  87. if(nmax <= 0)
  88. {
  89. int nbig, ibeta, it, iemin, iemax, iexp;
  90. Scalar abig, eps;
  91. nbig = std::numeric_limits<int>::max(); // largest integer
  92. ibeta = std::numeric_limits<Scalar>::radix; //NumTraits<Scalar>::Base; // base for floating-point numbers
  93. it = std::numeric_limits<Scalar>::digits; //NumTraits<Scalar>::Mantissa; // number of base-beta digits in mantissa
  94. iemin = std::numeric_limits<Scalar>::min_exponent; // minimum exponent
  95. iemax = std::numeric_limits<Scalar>::max_exponent; // maximum exponent
  96. rbig = std::numeric_limits<Scalar>::max(); // largest floating-point number
  97. // Check the basic machine-dependent constants.
  98. if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
  99. || (it<=4 && ibeta <= 3 ) || it<2)
  100. {
  101. eigen_assert(false && "the algorithm cannot be guaranteed on this computer");
  102. }
  103. iexp = -((1-iemin)/2);
  104. b1 = std::pow(ibeta, iexp); // lower boundary of midrange
  105. iexp = (iemax + 1 - it)/2;
  106. b2 = std::pow(ibeta,iexp); // upper boundary of midrange
  107. iexp = (2-iemin)/2;
  108. s1m = std::pow(ibeta,iexp); // scaling factor for lower range
  109. iexp = - ((iemax+it)/2);
  110. s2m = std::pow(ibeta,iexp); // scaling factor for upper range
  111. overfl = rbig*s2m; // overfow boundary for abig
  112. eps = std::pow(ibeta, 1-it);
  113. relerr = internal::sqrt(eps); // tolerance for neglecting asml
  114. abig = 1.0/eps - 1.0;
  115. if (Scalar(nbig)>abig) nmax = abig; // largest safe n
  116. else nmax = nbig;
  117. }
  118. typedef typename internal::packet_traits<Scalar>::type Packet;
  119. const int ps = internal::packet_traits<Scalar>::size;
  120. Packet pasml = internal::pset1(Scalar(0));
  121. Packet pamed = internal::pset1(Scalar(0));
  122. Packet pabig = internal::pset1(Scalar(0));
  123. Packet ps2m = internal::pset1(s2m);
  124. Packet ps1m = internal::pset1(s1m);
  125. Packet pb2 = internal::pset1(b2);
  126. Packet pb1 = internal::pset1(b1);
  127. for(int j=0; j<v.size(); j+=ps)
  128. {
  129. Packet ax = internal::pabs(v.template packet<Aligned>(j));
  130. Packet ax_s2m = internal::pmul(ax,ps2m);
  131. Packet ax_s1m = internal::pmul(ax,ps1m);
  132. Packet maskBig = internal::plt(pb2,ax);
  133. Packet maskSml = internal::plt(ax,pb1);
  134. // Packet maskMed = internal::pand(maskSml,maskBig);
  135. // Packet scale = internal::pset1(Scalar(0));
  136. // scale = internal::por(scale, internal::pand(maskBig,ps2m));
  137. // scale = internal::por(scale, internal::pand(maskSml,ps1m));
  138. // scale = internal::por(scale, internal::pandnot(internal::pset1(Scalar(1)),maskMed));
  139. // ax = internal::pmul(ax,scale);
  140. // ax = internal::pmul(ax,ax);
  141. // pabig = internal::padd(pabig, internal::pand(maskBig, ax));
  142. // pasml = internal::padd(pasml, internal::pand(maskSml, ax));
  143. // pamed = internal::padd(pamed, internal::pandnot(ax,maskMed));
  144. pabig = internal::padd(pabig, internal::pand(maskBig, internal::pmul(ax_s2m,ax_s2m)));
  145. pasml = internal::padd(pasml, internal::pand(maskSml, internal::pmul(ax_s1m,ax_s1m)));
  146. pamed = internal::padd(pamed, internal::pandnot(internal::pmul(ax,ax),internal::pand(maskSml,maskBig)));
  147. }
  148. Scalar abig = internal::predux(pabig);
  149. Scalar asml = internal::predux(pasml);
  150. Scalar amed = internal::predux(pamed);
  151. if(abig > Scalar(0))
  152. {
  153. abig = internal::sqrt(abig);
  154. if(abig > overfl)
  155. {
  156. eigen_assert(false && "overflow");
  157. return rbig;
  158. }
  159. if(amed > Scalar(0))
  160. {
  161. abig = abig/s2m;
  162. amed = internal::sqrt(amed);
  163. }
  164. else
  165. {
  166. return abig/s2m;
  167. }
  168. }
  169. else if(asml > Scalar(0))
  170. {
  171. if (amed > Scalar(0))
  172. {
  173. abig = internal::sqrt(amed);
  174. amed = internal::sqrt(asml) / s1m;
  175. }
  176. else
  177. {
  178. return internal::sqrt(asml)/s1m;
  179. }
  180. }
  181. else
  182. {
  183. return internal::sqrt(amed);
  184. }
  185. asml = std::min(abig, amed);
  186. abig = std::max(abig, amed);
  187. if(asml <= abig*relerr)
  188. return abig;
  189. else
  190. return abig * internal::sqrt(Scalar(1) + internal::abs2(asml/abig));
  191. #endif
  192. }
  193. #define BENCH_PERF(NRM) { \
  194. Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\
  195. for (int k=0; k<tries; ++k) { \
  196. tf.start(); \
  197. for (int i=0; i<iters; ++i) NRM(vf); \
  198. tf.stop(); \
  199. } \
  200. for (int k=0; k<tries; ++k) { \
  201. td.start(); \
  202. for (int i=0; i<iters; ++i) NRM(vd); \
  203. td.stop(); \
  204. } \
  205. for (int k=0; k<std::max(1,tries/3); ++k) { \
  206. tcf.start(); \
  207. for (int i=0; i<iters; ++i) NRM(vcf); \
  208. tcf.stop(); \
  209. } \
  210. std::cout << #NRM << "\t" << tf.value() << " " << td.value() << " " << tcf.value() << "\n"; \
  211. }
  212. void check_accuracy(double basef, double based, int s)
  213. {
  214. double yf = basef * internal::abs(internal::random<double>());
  215. double yd = based * internal::abs(internal::random<double>());
  216. VectorXf vf = VectorXf::Ones(s) * yf;
  217. VectorXd vd = VectorXd::Ones(s) * yd;
  218. std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
  219. std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n";
  220. std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n";
  221. std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n";
  222. std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\n";
  223. std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\n";
  224. std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\n";
  225. std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\n";
  226. }
  227. void check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s)
  228. {
  229. VectorXf vf(s);
  230. VectorXd vd(s);
  231. for (int i=0; i<s; ++i)
  232. {
  233. vf[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1));
  234. vd[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1));
  235. }
  236. //std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n";
  237. std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\t" << sqsumNorm(vf.cast<long double>()) << "\t" << sqsumNorm(vd.cast<long double>()) << "\n";
  238. std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\t" << hypotNorm(vf.cast<long double>()) << "\t" << hypotNorm(vd.cast<long double>()) << "\n";
  239. std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
  240. std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n";
  241. std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\t" << lapackNorm(vf.cast<long double>()) << "\t" << lapackNorm(vd.cast<long double>()) << "\n";
  242. std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\t" << twopassNorm(vf.cast<long double>()) << "\t" << twopassNorm(vd.cast<long double>()) << "\n";
  243. // std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\t" << bl2passNorm(vf.cast<long double>()) << "\t" << bl2passNorm(vd.cast<long double>()) << "\n";
  244. }
  245. int main(int argc, char** argv)
  246. {
  247. int tries = 10;
  248. int iters = 100000;
  249. double y = 1.1345743233455785456788e12 * internal::random<double>();
  250. VectorXf v = VectorXf::Ones(1024) * y;
  251. // return 0;
  252. int s = 10000;
  253. double basef_ok = 1.1345743233455785456788e15;
  254. double based_ok = 1.1345743233455785456788e95;
  255. double basef_under = 1.1345743233455785456788e-27;
  256. double based_under = 1.1345743233455785456788e-303;
  257. double basef_over = 1.1345743233455785456788e+27;
  258. double based_over = 1.1345743233455785456788e+302;
  259. std::cout.precision(20);
  260. std::cerr << "\nNo under/overflow:\n";
  261. check_accuracy(basef_ok, based_ok, s);
  262. std::cerr << "\nUnderflow:\n";
  263. check_accuracy(basef_under, based_under, s);
  264. std::cerr << "\nOverflow:\n";
  265. check_accuracy(basef_over, based_over, s);
  266. std::cerr << "\nVarying (over):\n";
  267. for (int k=0; k<1; ++k)
  268. {
  269. check_accuracy_var(20,27,190,302,s);
  270. std::cout << "\n";
  271. }
  272. std::cerr << "\nVarying (under):\n";
  273. for (int k=0; k<1; ++k)
  274. {
  275. check_accuracy_var(-27,20,-302,-190,s);
  276. std::cout << "\n";
  277. }
  278. std::cout.precision(4);
  279. std::cerr << "Performance (out of cache):\n";
  280. {
  281. int iters = 1;
  282. VectorXf vf = VectorXf::Random(1024*1024*32) * y;
  283. VectorXd vd = VectorXd::Random(1024*1024*32) * y;
  284. VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y;
  285. BENCH_PERF(sqsumNorm);
  286. BENCH_PERF(blueNorm);
  287. // BENCH_PERF(pblueNorm);
  288. // BENCH_PERF(lapackNorm);
  289. // BENCH_PERF(hypotNorm);
  290. // BENCH_PERF(twopassNorm);
  291. BENCH_PERF(bl2passNorm);
  292. }
  293. std::cerr << "\nPerformance (in cache):\n";
  294. {
  295. int iters = 100000;
  296. VectorXf vf = VectorXf::Random(512) * y;
  297. VectorXd vd = VectorXd::Random(512) * y;
  298. VectorXcf vcf = VectorXcf::Random(512) * y;
  299. BENCH_PERF(sqsumNorm);
  300. BENCH_PERF(blueNorm);
  301. // BENCH_PERF(pblueNorm);
  302. // BENCH_PERF(lapackNorm);
  303. // BENCH_PERF(hypotNorm);
  304. // BENCH_PERF(twopassNorm);
  305. BENCH_PERF(bl2passNorm);
  306. }
  307. }