mirror of
https://github.com/Qortal/Brooklyn.git
synced 2025-02-11 17:55:54 +00:00
365 lines
12 KiB
Common Lisp
365 lines
12 KiB
Common Lisp
|
|
#define arg_t float16
|
|
#define result_t float16
|
|
#define int_t int16
|
|
|
|
#define CONCAT(a, b) a##b
|
|
#define CAT(a, b) CONCAT(a, b)
|
|
|
|
// vc4cl_split(double) of M_LN2
|
|
#define M_LN2_FF 0xB102E3083F317218
|
|
|
|
float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
|
|
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
|
|
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
|
|
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
|
|
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
|
|
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));
|
|
|
|
result_t pow2(int_t val)
|
|
{
|
|
// y = 2^x = 1.0 [implied] * 2^(x + offset)
|
|
int_t tmp = val << 23;
|
|
// alternative: tmp = (val + 127) << 23;
|
|
tmp += (int_t) 0x3F800000;
|
|
return CAT(as_, result_t)(tmp & (int_t) 0x7F800000);
|
|
}
|
|
|
|
int_t powerOfTwo(arg_t val)
|
|
{
|
|
// Original code, produces Inf for e^(~10^38)
|
|
// return CAT(convert_, int_t)(ceil((val / M_LN2_F) - 0.5f));
|
|
// Using floor() instead of ceil(),
|
|
// - fixes Inf for large exponents
|
|
// - slightly reduces accuracy of Chebyshev implementations (by ~4 ULP),
|
|
// - greatly reduces accuracy of Taylor (<10 ULP -> >1200 ULP) -> requires more iterations
|
|
return CAT(convert_, int_t)(floor((val / M_LN2_F) - 0.5f));
|
|
}
|
|
|
|
/*
|
|
* Taylor series with Horner's method and range reduction,
|
|
*
|
|
* https://www.pseudorandom.com/implementing-exp#section-6
|
|
*/
|
|
result_t exp_taylor(arg_t val)
|
|
{
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
|
|
|
|
arg_t tk = 1.0f;
|
|
arg_t tn = 1.0f;
|
|
#pragma loop unroll
|
|
for(int i = 1; i < 10; i++) // TODO can adjust number of iterations
|
|
{
|
|
tk *= r / i;
|
|
tn += tk;
|
|
};
|
|
|
|
tn = tn * pow2(k);
|
|
return val < 0 ? 1 / tn : tn;
|
|
}
|
|
|
|
__kernel void exp_taylor_kernel(__global arg_t *out, const __global arg_t *in)
|
|
{
|
|
uint gid = get_global_id(0);
|
|
out[gid] = exp_taylor(in[gid]);
|
|
}
|
|
|
|
result_t exp_taylor_extended_precision_exact(arg_t val)
|
|
{
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
|
|
|
|
ulong16 tk = 0x000000003F800000; // 1.0
|
|
ulong16 tn = 0x000000003F800000; // 1.0
|
|
|
|
tk = vc4cl_mul(tk, r);
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003F000000)); // 1 / 2
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB22AAAAB3EAAAAAB)); // 1 / 3
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E800000)); // 1 / 4
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB14CCCCD3E4CCCCD)); // 1 / 5
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1AAAAAB3E2AAAAB)); // 1 / 6
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1DB6DB73E124925)); // 1 / 7
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E000000)); // 1 / 8
|
|
tn = vc4cl_add(tn, tk);
|
|
|
|
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB0638E393DE38E39)); // 1 / 9
|
|
tn = vc4cl_add(tn, tk);
|
|
// removing any iteration makes the result inaccurate (removing last iteration gives 19 ULP)
|
|
|
|
result_t result = vc4cl_lossy(tn) * pow2(k);
|
|
return val < 0 ? 1.0f / result : result;
|
|
}
|
|
|
|
// __kernel void exp_taylor_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
|
|
// {
|
|
// uint gid = get_global_id(0);
|
|
// out[gid] = exp_taylor_extended_precision_exact(in[gid]);
|
|
// }
|
|
|
|
// TODO Lagrange and Barycentric interpolations from https://www.pseudorandom.com/implementing-exp
|
|
|
|
/*
|
|
* Chebyshev interpolation with range reduction,
|
|
*
|
|
* https://www.pseudorandom.com/implementing-exp#section-18
|
|
*/
|
|
result_t exp_chebyshev(arg_t val)
|
|
{
|
|
// XXX could remove unneeded coefficients once we fix precision
|
|
const float coefficients[] = {
|
|
1.266065877752008335598244625214717537923,
|
|
1.130318207984970054415392055219726613610,
|
|
0.2714953395340765623657051399899818507081,
|
|
0.04433684984866380495257149525979922986386,
|
|
0.00547424044209373265027616843118645948703,
|
|
0.000542926311913943750362147810307554678760,
|
|
0.00004497732295429514665469032811091269841937,
|
|
3.198436462401990505863872976602295688795e-6,
|
|
1.992124806672795725961064384805589035648e-7,
|
|
1.103677172551734432616996091335324170860e-8,
|
|
5.50589607967374725047142040200552692791e-10,
|
|
2.497956616984982522712010934218766985311e-11,
|
|
1.039152230678570050499634672423840849837e-12,
|
|
3.991263356414401512887720401532162026594e-14,
|
|
};
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
|
|
|
|
arg_t ti = 1.0f;
|
|
arg_t tj = r;
|
|
arg_t p = coefficients[0] + (coefficients[1] * r);
|
|
#pragma loop unroll
|
|
for(int i = 2; i < 8; i++) // TODO can adjust number of iterations
|
|
{
|
|
arg_t tk = (2 * r * tj) - ti;
|
|
p += coefficients[i] * tk;
|
|
ti = tj;
|
|
tj = tk;
|
|
}
|
|
|
|
p = p * pow2(k);
|
|
return val < 0 ? 1 / p : p;
|
|
}
|
|
|
|
__kernel void exp_chebyshev_kernel(__global arg_t *out, const __global arg_t *in)
|
|
{
|
|
uint gid = get_global_id(0);
|
|
out[gid] = exp_chebyshev(in[gid]);
|
|
}
|
|
|
|
result_t exp_chebyshev_extended_precision_exact(arg_t val)
|
|
{
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
|
|
|
|
ulong16 ti = 0x000000003F800000; // 1.0
|
|
ulong16 tj = r;
|
|
// 1.266065877752008335598244625214717537923 and 1.130318207984970054415392055219726613610
|
|
ulong16 p = vc4cl_add(0x333386C33FA20E72, vc4cl_mul(0x33395E683F90AE44, r));
|
|
r = vc4cl_mul(r, 0x0000000040000000); // 2.0
|
|
|
|
ulong16 tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0xB13AF4A23E8B0170, tk)); // 0.2714953395340765623657051399899818507081
|
|
ti = tj;
|
|
tj = tk;
|
|
|
|
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0xB0FC8DF03D359A8F, tk)); // 0.04433684984866380495257149525979922986386
|
|
ti = tj;
|
|
tj = tk;
|
|
|
|
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0xAEA95A453BB36142, tk)); // 0.00547424044209373265027616843118645948703
|
|
ti = tj;
|
|
tj = tk;
|
|
|
|
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0x2B7994663A0E532B, tk)); // 0.000542926311913943750362147810307554678760
|
|
ti = tj;
|
|
tj = tk;
|
|
|
|
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0x2BC988B0383CA608, tk)); // 0.00004497732295429514665469032811091269841937
|
|
ti = tj;
|
|
tj = tk;
|
|
|
|
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0x29A61EF43656A4B8, tk)); // 3.198436462401990505863872976602295688795e-6
|
|
ti = tj;
|
|
tj = tk;
|
|
|
|
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
|
|
p = vc4cl_add(p, vc4cl_mul(0x26B66C3C3455E71C, tk)); // 1.992124806672795725961064384805589035648e-7
|
|
ti = tj;
|
|
tj = tk;
|
|
// removing any iteration makes the result inaccurate (removing last iteration gives 5 ULP)
|
|
|
|
result_t result = vc4cl_lossy(p) * pow2(k);
|
|
return val < 0 ? 1.0f / result : result;
|
|
}
|
|
|
|
// __kernel void exp_chebyshev_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
|
|
// {
|
|
// uint gid = get_global_id(0);
|
|
// out[gid] = exp_chebyshev_extended_precision_exact(in[gid]);
|
|
// }
|
|
|
|
/*
|
|
* Chebyshev interpolation with monomial basis and range reduction,
|
|
*
|
|
* https://www.pseudorandom.com/implementing-exp#section-18
|
|
*/
|
|
result_t exp_chebyshev_monomial(arg_t val)
|
|
{
|
|
// XXX could remove unneeded coefficients once we fix precision
|
|
// TODO invert order of coefficients and traversal ?!
|
|
const float coefficients[] = {
|
|
1.000000000000000,
|
|
1.000000000000000,
|
|
0.500000000000002,
|
|
0.166666666666680,
|
|
0.041666666666727,
|
|
0.008333333333342,
|
|
0.001388888888388,
|
|
1.984126978734782e-4,
|
|
2.480158866546844e-5,
|
|
2.755734045527853e-6,
|
|
2.755715675968011e-7,
|
|
2.504861486483735e-8,
|
|
2.088459690899721e-9,
|
|
1.632461784798319e-10,
|
|
};
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
|
|
|
|
arg_t pn = 1.143364767943110e-11;
|
|
#pragma loop unroll
|
|
for(int i = 0; i < 14; i++)
|
|
{
|
|
pn = pn * r + coefficients[13 - i];
|
|
}
|
|
|
|
pn = pn * pow2(k);
|
|
return val < 0 ? 1 / pn : pn;
|
|
}
|
|
|
|
__kernel void exp_chebyshev_monomial_kernel(__global arg_t *out, const __global arg_t *in)
|
|
{
|
|
uint gid = get_global_id(0);
|
|
out[gid] = exp_chebyshev_monomial(in[gid]);
|
|
}
|
|
|
|
result_t exp_chebyshev_monomial_exact(arg_t val)
|
|
{
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
arg_t kFloat = CAT(convert_, arg_t)(k);
|
|
arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));
|
|
|
|
arg_t pn = 1.143364767943110e-11;
|
|
|
|
pn = pn * r + 1.632461784798319e-10f;
|
|
pn = pn * r + 2.088459690899721e-9f;
|
|
pn = pn * r + 2.504861486483735e-8f;
|
|
pn = pn * r + 2.755715675968011e-7f;
|
|
pn = pn * r + 2.755734045527853e-6f;
|
|
pn = pn * r + 2.480158866546844e-5f;
|
|
pn = pn * r + 1.984126978734782e-4f;
|
|
pn = pn * r + 0.001388888888388f;
|
|
pn = pn * r + 0.008333333333342f;
|
|
pn = pn * r + 0.041666666666727f;
|
|
pn = pn * r + 0.166666666666680f;
|
|
pn = pn * r + 0.500000000000002f;
|
|
pn = pn * r + 1.000000000000000f;
|
|
pn = pn * r + 1.000000000000000f;
|
|
|
|
pn = pn * pow2(k);
|
|
return val < 0 ? 1 / pn : pn;
|
|
}
|
|
|
|
__kernel void exp_chebyshev_monomial_exact_kernel(__global arg_t *out, const __global arg_t *in)
|
|
{
|
|
uint gid = get_global_id(0);
|
|
out[gid] = exp_chebyshev_monomial_exact(in[gid]);
|
|
}
|
|
|
|
result_t exp_chebyshev_monomial_extended_precision_exact(arg_t val)
|
|
{
|
|
arg_t positive = fabs(val);
|
|
|
|
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
|
int_t k = powerOfTwo(positive);
|
|
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
|
|
|
|
ulong16 pn = 0x209249252D492492; // 1.143364767943110e-11
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA21249252F337DB7); // 1.632461784798319e-10
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0x24924925310F8492); // 2.088459690899721e-9
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA65B6DB732D72A7D); // 2.504861486483735e-8
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA85B6DB73493F245); // 2.755715675968011e-7
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA9FDB6DB3638EF27); // 2.755734045527853e-6
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAB60000037D00D02); // 2.480158866546844e-5
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAC65BDB739500D01); // 1.984126978734782e-4
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAE161D323AB60B61); // 0.001388888888388
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAFEEEDB73C088889); // 0.008333333333342
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xB0AAA88B3D2AAAAB); // 0.041666666666727
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0xB1AAAA8D3E2AAAAB); // 0.166666666666680
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0x271000003F000000); // 0.500000000000002
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
|
|
pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
|
|
|
|
result_t result = vc4cl_lossy(pn) * pow2(k);
|
|
return val < 0 ? 1.0f / result : result;
|
|
}
|
|
|
|
// __kernel void exp_chebyshev_monomial_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
|
|
// {
|
|
// uint gid = get_global_id(0);
|
|
// out[gid] = exp_chebyshev_monomial_extended_precision_exact(in[gid]);
|
|
// }
|
|
|
|
// TODO Remes from www.netlib.org/fdlibm/e_exp.c
|
|
|
|
// TODO Matters computational (sections 32.2.2.2 and 32.2.3)
|
|
// Pade Approximation (16 steps): (1680 + 840x + 180 x^2 + 20 x^3 + x^4) / (1680 - 840 x + 180 x^2 - 20 x^3 + x^4)
|
|
|
|
// TODO https://math.stackexchange.com/questions/1988901/approximating-the-exponential-function-with-taylor-series?rq=1
|
|
// TODO http://www.netlib.org/fdlibm/
|
|
|
|
__kernel void exp_builtin_kernel(__global arg_t *out, const __global arg_t *in)
|
|
{
|
|
uint gid = get_global_id(0);
|
|
out[gid] = exp(in[gid]);
|
|
}
|