Browse Source

More memory efficient constants:

* src/float/transcendental/cl_LF_pi.cc (compute_pi_ramanujan_163_fast):
        Compute series coefficients on demand, using a series stream object.
        * src/float/transcendental/cl_LF_zeta3.cc (zeta3): Likewise.
        * src/float/transcendental/cl_LF_catalanconst.cc
        (compute_catalanconst_ramanujan_fast): Likewise.
        (compute_catalanconst_lupas): New function.
        (compute_catalanconst): Simplify, based on new benchmark.
master
Richard Kreckel 18 years ago
parent
commit
38f5f03b3f
  1. 11
      ChangeLog
  2. 120
      src/float/transcendental/cl_LF_catalanconst.cc
  3. 50
      src/float/transcendental/cl_LF_pi.cc
  4. 42
      src/float/transcendental/cl_LF_zeta3.cc

11
ChangeLog

@ -1,3 +1,14 @@
2007-04-09 Richard B. Kreckel <kreckel@ginac.de>
More memory efficient constants:
* src/float/transcendental/cl_LF_pi.cc (compute_pi_ramanujan_163_fast):
Compute series coefficients on demand, using a series stream object.
* src/float/transcendental/cl_LF_zeta3.cc (zeta3): Likewise.
* src/float/transcendental/cl_LF_catalanconst.cc
(compute_catalanconst_ramanujan_fast): Likewise.
(compute_catalanconst_lupas): New function.
(compute_catalanconst): Simplify, based on new benchmark.
2007-04-02 Alexei Sheplyakov <varg@theor.jinr.ru>
Debian Bug#412103:

120
src/float/transcendental/cl_LF_catalanconst.cc

@ -50,35 +50,37 @@ const cl_LF compute_catalanconst_ramanujan_fast (uintC len)
{
// Same formula as above, using a binary splitting evaluation.
// See [Borwein, Borwein, section 10.2.3].
var uintC actuallen = len + 2; // 2 Schutz-Digits
struct rational_series_stream : cl_pqb_series_stream {
cl_I n;
static cl_pqb_series_term computenext (cl_pqb_series_stream& thisss)
{
var rational_series_stream& thiss = (rational_series_stream&)thisss;
var cl_I n = thiss.n;
var cl_pqb_series_term result;
if (n==0) {
result.p = 1;
result.q = 1;
result.b = 1;
} else {
result.p = n;
result.b = 2*n+1;
result.q = result.b << 1; // 2*(2*n+1)
}
thiss.n = n+1;
return result;
}
rational_series_stream ()
: cl_pqb_series_stream (rational_series_stream::computenext),
n (0) {}
} series;
var uintC actuallen = len + 2; // 2 guard digits
// Evaluate a sum(0 <= n < N, a(n)/b(n) * (p(0)...p(n))/(q(0)...q(n)))
// with appropriate N, and
// a(n) = 1, b(n) = 2*n+1,
// p(n) = n for n>0, q(n) = 2*(2*n+1) for n>0.
var uintC N = (intDsize/2)*actuallen;
// 4^-N <= 2^(-intDsize*actuallen).
CL_ALLOCA_STACK;
var cl_I* bv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var cl_I* pv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var cl_I* qv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var uintC n;
init1(cl_I, bv[0]) (1);
init1(cl_I, pv[0]) (1);
init1(cl_I, qv[0]) (1);
for (n = 1; n < N; n++) {
init1(cl_I, bv[n]) (2*n+1);
init1(cl_I, pv[n]) (n);
init1(cl_I, qv[n]) (2*(2*n+1));
}
var cl_pqb_series series;
series.bv = bv;
series.pv = pv; series.qv = qv; series.qsv = NULL;
var cl_LF fsum = eval_rational_series(N,series,actuallen);
for (n = 0; n < N; n++) {
bv[n].~cl_I();
pv[n].~cl_I();
qv[n].~cl_I();
}
var cl_LF g =
scale_float(The(cl_LF)(3*fsum)
+ The(cl_LF)(pi(actuallen))
@ -229,30 +231,66 @@ const cl_LF compute_catalanconst_cvz2 (uintC len)
}
// Bit complexity (N = len): O(log(N)^2*M(N)).
// Timings of the above algorithms, on an i486 33 MHz, running Linux.
// N ram ramfast exp1 exp2 cvz1 cvz2
// 10 0.055 0.068 0.32 0.91 0.076 0.11
// 25 0.17 0.26 0.95 3.78 0.23 0.43
// 50 0.43 0.73 2.81 11.5 0.63 1.36
// 100 1.32 2.24 8.82 34.1 1.90 4.48
// 250 6.60 10.4 48.7 127.5 10.3 20.8
// 500 24.0 30.9 186 329 38.4 58.6
// 1000 83.0 89.0 944 860 149 163
// 2500 446 352 6964 3096 1032 545
// 5000 1547 899
// asymp. N^2 FAST N^2 FAST N^2 FAST
const cl_LF compute_catalanconst_lupas (uintC len)
{
// G = 19/18 * sum(n=0..infty,
// mul(m=1..n, -32*((80*m^3+72*m^2-18*m-19)*m^3)/
// (10240*m^6+14336*m^5+2560*m^4-3072*m^3-888*m^2+72*m+27))).
struct rational_series_stream : cl_pq_series_stream {
cl_I n;
static cl_pq_series_term computenext (cl_pq_series_stream& thisss)
{
var rational_series_stream& thiss = (rational_series_stream&)thisss;
var cl_I n = thiss.n;
var cl_pq_series_term result;
if (zerop(n)) {
result.p = 1;
result.q = 1;
} else {
// Compute -32*((80*n^3+72*n^2-18*n-19)*n^3) (using Horner scheme):
result.p = (19+(18+(-72-80*n)*n)*n)*n*n*n << 5;
// Compute 10240*n^6+14336*n^5+2560*n^4-3072*n^3-888*n^2+72*n+27:
result.q = 27+(72+(-888+(-3072+(2560+(14336+10240*n)*n)*n)*n)*n)*n;
}
thiss.n = plus1(n);
return result;
}
rational_series_stream ()
: cl_pq_series_stream (rational_series_stream::computenext),
n (0) {}
} series;
var uintC actuallen = len + 2; // 2 guard digits
var uintC N = (intDsize/2)*actuallen;
var cl_LF fsum = eval_rational_series(N,series,actuallen);
var cl_LF g = fsum*cl_I_to_LF(19,actuallen)/cl_I_to_LF(18,actuallen);
return shorten(g,len);
}
// Bit complexity (N = len): O(log(N)^2*M(N)).
// Timings of the above algorithms, on an AMD Opteron 1.7 GHz, running Linux/x86.
// N ram ramfast exp1 exp2 cvz1 cvz2 lupas
// 25 0.0011 0.0010 0.0094 0.0107 0.0021 0.0016 0.0034
// 50 0.0030 0.0025 0.0280 0.0317 0.0058 0.0045 0.0095
// 100 0.0087 0.0067 0.0854 0.0941 0.0176 0.0121 0.0260
// 250 0.043 0.029 0.462 0.393 0.088 0.055 0.109
// 500 0.15 0.086 1.7 1.156 0.323 0.162 0.315
// 1000 0.57 0.25 7.5 3.23 1.27 0.487 0.864
// 2500 3.24 1.10 52.2 12.4 8.04 2.02 3.33
// 5000 13.1 3.06 218 32.7 42.1 5.59 8.80
// 10000 52.7 8.2 910 85.3 216.7 15.3 22.7
// 25000 342 29.7 6403 295 1643 54.5 77.3
// 50000 89.9 139 195
//100000 227 363 483
// asymp. N^2 FAST N^2 FAST N^2 FAST FAST
// (FAST means O(log(N)^2*M(N)))
//
// The "exp1" and "exp2" algorithms are always about 10 to 15 times slower
// than the "ram" and "ramfast" algorithms.
// The break-even point between "ram" and "ramfast" is at about N = 1410.
// The "ramfast" algorithm is always fastest.
const cl_LF compute_catalanconst (uintC len)
{
if (len >= 1410)
return compute_catalanconst_ramanujan_fast(len);
else
return compute_catalanconst_ramanujan(len);
return compute_catalanconst_ramanujan_fast(len);
}
// Bit complexity (N := len): O(log(N)^2*M(N)).

50
src/float/transcendental/cl_LF_pi.cc

@ -191,6 +191,31 @@ const cl_LF compute_pi_ramanujan_163_fast (uintC len)
{
// Same formula as above, using a binary splitting evaluation.
// See [Borwein, Borwein, section 10.2.3].
struct rational_series_stream : cl_pqa_series_stream {
uintC n;
static cl_pqa_series_term computenext (cl_pqa_series_stream& thisss)
{
static const cl_I A = "163096908";
static const cl_I B = "6541681608";
static const cl_I J1 = "10939058860032000"; // 72*abs(J)
var rational_series_stream& thiss = (rational_series_stream&)thisss;
var uintC n = thiss.n;
var cl_pqa_series_term result;
if (n==0) {
result.p = 1;
result.q = 1;
} else {
result.p = -((cl_I)(6*n-5)*(cl_I)(2*n-1)*(cl_I)(6*n-1));
result.q = (cl_I)n*(cl_I)n*(cl_I)n*J1;
}
result.a = A+n*B;
thiss.n = n+1;
return result;
}
rational_series_stream ()
: cl_pqa_series_stream (rational_series_stream::computenext),
n (0) {}
} series;
var uintC actuallen = len + 4; // 4 Schutz-Digits
static const cl_I A = "163096908";
static const cl_I B = "6541681608";
@ -205,32 +230,7 @@ const cl_LF compute_pi_ramanujan_163_fast (uintC len)
var uintC N = (n_slope*actuallen)/32 + 1;
// N > intDsize*log(2)/log(|J|) * actuallen, hence
// |J|^-N < 2^(-intDsize*actuallen).
CL_ALLOCA_STACK;
var cl_I* av = (cl_I*) cl_alloca(N*sizeof(cl_I));
var cl_I* pv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var cl_I* qv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var uintC* qsv = (uintC*) cl_alloca(N*sizeof(uintC));
var uintC n;
for (n = 0; n < N; n++) {
init1(cl_I, av[n]) (A+n*B);
if (n==0) {
init1(cl_I, pv[n]) (1);
init1(cl_I, qv[n]) (1);
} else {
init1(cl_I, pv[n]) (-((cl_I)(6*n-5)*(cl_I)(2*n-1)*(cl_I)(6*n-1)));
init1(cl_I, qv[n]) ((cl_I)n*(cl_I)n*(cl_I)n*J1);
}
}
var cl_pqa_series series;
series.av = av;
series.pv = pv; series.qv = qv;
series.qsv = (len >= 35 ? qsv : 0); // 5% speedup for large len's
var cl_LF fsum = eval_rational_series(N,series,actuallen);
for (n = 0; n < N; n++) {
av[n].~cl_I();
pv[n].~cl_I();
qv[n].~cl_I();
}
static const cl_I J3 = "262537412640768000"; // -1728*J
var cl_LF pires = sqrt(cl_I_to_LF(J3,actuallen)) / fsum;
return shorten(pires,len); // verkürzen und fertig

42
src/float/transcendental/cl_LF_zeta3.cc

@ -19,6 +19,27 @@ namespace cln {
const cl_LF zeta3 (uintC len)
{
struct rational_series_stream : cl_pqa_series_stream {
uintC n;
static cl_pqa_series_term computenext (cl_pqa_series_stream& thisss)
{
var rational_series_stream& thiss = (rational_series_stream&)thisss;
var uintC n = thiss.n;
var cl_pqa_series_term result;
if (n==0) {
result.p = 1;
} else {
result.p = -expt_pos(n,5);
}
result.q = expt_pos(2*n+1,5)<<5;
result.a = 205*square((cl_I)n) + 250*(cl_I)n + 77;
thiss.n = n+1;
return result;
}
rational_series_stream ()
: cl_pqa_series_stream (rational_series_stream::computenext),
n (0) {}
} series;
// Method:
// /infinity \
// | ----- (n + 1) 2 |
@ -41,28 +62,7 @@ const cl_LF zeta3 (uintC len)
var uintC actuallen = len+2; // 2 Schutz-Digits
var uintC N = ceiling(actuallen*intDsize,10);
// 1024^-N <= 2^(-intDsize*actuallen).
CL_ALLOCA_STACK;
var cl_I* av = (cl_I*) cl_alloca(N*sizeof(cl_I));
var cl_I* pv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var cl_I* qv = (cl_I*) cl_alloca(N*sizeof(cl_I));
var uintC n;
for (n = 0; n < N; n++) {
init1(cl_I, av[n]) (205*square((cl_I)n) + 250*(cl_I)n + 77);
if (n==0)
init1(cl_I, pv[n]) (1);
else
init1(cl_I, pv[n]) (-expt_pos(n,5));
init1(cl_I, qv[n]) (expt_pos(2*n+1,5)<<5);
}
var cl_pqa_series series;
series.av = av;
series.pv = pv; series.qv = qv; series.qsv = NULL;
var cl_LF sum = eval_rational_series(N,series,actuallen);
for (n = 0; n < N; n++) {
av[n].~cl_I();
pv[n].~cl_I();
qv[n].~cl_I();
}
return scale_float(shorten(sum,len),-1);
}
// Bit complexity (N := len): O(log(N)^2*M(N)).

Loading…
Cancel
Save