3
0
mirror of https://github.com/Qortal/Brooklyn.git synced 2025-02-11 17:55:54 +00:00
Raziel K. Crowe 1c4c363d5c VC4Stdlib
2022-09-09 19:57:08 +05:00

365 lines
12 KiB
Common Lisp

#define arg_t float16
#define result_t float16
#define int_t int16
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
// vc4cl_split(double) of M_LN2
#define M_LN2_FF 0xB102E3083F317218
float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));
result_t pow2(int_t val)
{
// y = 2^x = 1.0 [implied] * 2^(x + offset)
int_t tmp = val << 23;
// alternative: tmp = (val + 127) << 23;
tmp += (int_t) 0x3F800000;
return CAT(as_, result_t)(tmp & (int_t) 0x7F800000);
}
int_t powerOfTwo(arg_t val)
{
// Original code, produces Inf for e^(~10^38)
// return CAT(convert_, int_t)(ceil((val / M_LN2_F) - 0.5f));
// Using floor() instead of ceil(),
// - fixes Inf for large exponents
// - slightly reduces accuracy of Chebyshev implementations (by ~4 ULP),
// - greatly reduces accuracy of Taylor (<10 ULP -> >1200 ULP) -> requires more iterations
return CAT(convert_, int_t)(floor((val / M_LN2_F) - 0.5f));
}
/*
* Taylor series with Horner's method and range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-6
*/
result_t exp_taylor(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
arg_t tk = 1.0f;
arg_t tn = 1.0f;
#pragma loop unroll
for(int i = 1; i < 10; i++) // TODO can adjust number of iterations
{
tk *= r / i;
tn += tk;
};
tn = tn * pow2(k);
return val < 0 ? 1 / tn : tn;
}
__kernel void exp_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_taylor(in[gid]);
}
result_t exp_taylor_extended_precision_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
ulong16 tk = 0x000000003F800000; // 1.0
ulong16 tn = 0x000000003F800000; // 1.0
tk = vc4cl_mul(tk, r);
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003F000000)); // 1 / 2
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB22AAAAB3EAAAAAB)); // 1 / 3
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E800000)); // 1 / 4
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB14CCCCD3E4CCCCD)); // 1 / 5
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1AAAAAB3E2AAAAB)); // 1 / 6
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1DB6DB73E124925)); // 1 / 7
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E000000)); // 1 / 8
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB0638E393DE38E39)); // 1 / 9
tn = vc4cl_add(tn, tk);
// removing any iteration makes the result inaccurate (removing last iteration gives 19 ULP)
result_t result = vc4cl_lossy(tn) * pow2(k);
return val < 0 ? 1.0f / result : result;
}
// __kernel void exp_taylor_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
// uint gid = get_global_id(0);
// out[gid] = exp_taylor_extended_precision_exact(in[gid]);
// }
// TODO Lagrange and Barycentric interpolations from https://www.pseudorandom.com/implementing-exp
/*
* Chebyshev interpolation with range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-18
*/
result_t exp_chebyshev(arg_t val)
{
// XXX could remove unneeded coefficients once we fix precision
const float coefficients[] = {
1.266065877752008335598244625214717537923,
1.130318207984970054415392055219726613610,
0.2714953395340765623657051399899818507081,
0.04433684984866380495257149525979922986386,
0.00547424044209373265027616843118645948703,
0.000542926311913943750362147810307554678760,
0.00004497732295429514665469032811091269841937,
3.198436462401990505863872976602295688795e-6,
1.992124806672795725961064384805589035648e-7,
1.103677172551734432616996091335324170860e-8,
5.50589607967374725047142040200552692791e-10,
2.497956616984982522712010934218766985311e-11,
1.039152230678570050499634672423840849837e-12,
3.991263356414401512887720401532162026594e-14,
};
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
arg_t ti = 1.0f;
arg_t tj = r;
arg_t p = coefficients[0] + (coefficients[1] * r);
#pragma loop unroll
for(int i = 2; i < 8; i++) // TODO can adjust number of iterations
{
arg_t tk = (2 * r * tj) - ti;
p += coefficients[i] * tk;
ti = tj;
tj = tk;
}
p = p * pow2(k);
return val < 0 ? 1 / p : p;
}
__kernel void exp_chebyshev_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_chebyshev(in[gid]);
}
result_t exp_chebyshev_extended_precision_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
ulong16 ti = 0x000000003F800000; // 1.0
ulong16 tj = r;
// 1.266065877752008335598244625214717537923 and 1.130318207984970054415392055219726613610
ulong16 p = vc4cl_add(0x333386C33FA20E72, vc4cl_mul(0x33395E683F90AE44, r));
r = vc4cl_mul(r, 0x0000000040000000); // 2.0
ulong16 tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0xB13AF4A23E8B0170, tk)); // 0.2714953395340765623657051399899818507081
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0xB0FC8DF03D359A8F, tk)); // 0.04433684984866380495257149525979922986386
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0xAEA95A453BB36142, tk)); // 0.00547424044209373265027616843118645948703
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x2B7994663A0E532B, tk)); // 0.000542926311913943750362147810307554678760
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x2BC988B0383CA608, tk)); // 0.00004497732295429514665469032811091269841937
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x29A61EF43656A4B8, tk)); // 3.198436462401990505863872976602295688795e-6
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x26B66C3C3455E71C, tk)); // 1.992124806672795725961064384805589035648e-7
ti = tj;
tj = tk;
// removing any iteration makes the result inaccurate (removing last iteration gives 5 ULP)
result_t result = vc4cl_lossy(p) * pow2(k);
return val < 0 ? 1.0f / result : result;
}
// __kernel void exp_chebyshev_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
// uint gid = get_global_id(0);
// out[gid] = exp_chebyshev_extended_precision_exact(in[gid]);
// }
/*
* Chebyshev interpolation with monomial basis and range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-18
*/
result_t exp_chebyshev_monomial(arg_t val)
{
// XXX could remove unneeded coefficients once we fix precision
// TODO invert order of coefficients and traversal ?!
const float coefficients[] = {
1.000000000000000,
1.000000000000000,
0.500000000000002,
0.166666666666680,
0.041666666666727,
0.008333333333342,
0.001388888888388,
1.984126978734782e-4,
2.480158866546844e-5,
2.755734045527853e-6,
2.755715675968011e-7,
2.504861486483735e-8,
2.088459690899721e-9,
1.632461784798319e-10,
};
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
arg_t pn = 1.143364767943110e-11;
#pragma loop unroll
for(int i = 0; i < 14; i++)
{
pn = pn * r + coefficients[13 - i];
}
pn = pn * pow2(k);
return val < 0 ? 1 / pn : pn;
}
__kernel void exp_chebyshev_monomial_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_chebyshev_monomial(in[gid]);
}
result_t exp_chebyshev_monomial_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t kFloat = CAT(convert_, arg_t)(k);
arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));
arg_t pn = 1.143364767943110e-11;
pn = pn * r + 1.632461784798319e-10f;
pn = pn * r + 2.088459690899721e-9f;
pn = pn * r + 2.504861486483735e-8f;
pn = pn * r + 2.755715675968011e-7f;
pn = pn * r + 2.755734045527853e-6f;
pn = pn * r + 2.480158866546844e-5f;
pn = pn * r + 1.984126978734782e-4f;
pn = pn * r + 0.001388888888388f;
pn = pn * r + 0.008333333333342f;
pn = pn * r + 0.041666666666727f;
pn = pn * r + 0.166666666666680f;
pn = pn * r + 0.500000000000002f;
pn = pn * r + 1.000000000000000f;
pn = pn * r + 1.000000000000000f;
pn = pn * pow2(k);
return val < 0 ? 1 / pn : pn;
}
__kernel void exp_chebyshev_monomial_exact_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_chebyshev_monomial_exact(in[gid]);
}
result_t exp_chebyshev_monomial_extended_precision_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
ulong16 pn = 0x209249252D492492; // 1.143364767943110e-11
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA21249252F337DB7); // 1.632461784798319e-10
pn = vc4cl_add(vc4cl_mul(pn, r), 0x24924925310F8492); // 2.088459690899721e-9
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA65B6DB732D72A7D); // 2.504861486483735e-8
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA85B6DB73493F245); // 2.755715675968011e-7
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA9FDB6DB3638EF27); // 2.755734045527853e-6
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAB60000037D00D02); // 2.480158866546844e-5
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAC65BDB739500D01); // 1.984126978734782e-4
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAE161D323AB60B61); // 0.001388888888388
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAFEEEDB73C088889); // 0.008333333333342
pn = vc4cl_add(vc4cl_mul(pn, r), 0xB0AAA88B3D2AAAAB); // 0.041666666666727
pn = vc4cl_add(vc4cl_mul(pn, r), 0xB1AAAA8D3E2AAAAB); // 0.166666666666680
pn = vc4cl_add(vc4cl_mul(pn, r), 0x271000003F000000); // 0.500000000000002
pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
result_t result = vc4cl_lossy(pn) * pow2(k);
return val < 0 ? 1.0f / result : result;
}
// __kernel void exp_chebyshev_monomial_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
// uint gid = get_global_id(0);
// out[gid] = exp_chebyshev_monomial_extended_precision_exact(in[gid]);
// }
// TODO Remes from www.netlib.org/fdlibm/e_exp.c
// TODO Matters computational (sections 32.2.2.2 and 32.2.3)
// Pade Approximation (16 steps): (1680 + 840x + 180 x^2 + 20 x^3 + x^4) / (1680 - 840 x + 180 x^2 - 20 x^3 + x^4)
// TODO https://math.stackexchange.com/questions/1988901/approximating-the-exponential-function-with-taylor-series?rq=1
// TODO http://www.netlib.org/fdlibm/
__kernel void exp_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp(in[gid]);
}