3
0
mirror of https://github.com/Qortal/Brooklyn.git synced 2025-02-01 07:42:18 +00:00
Brooklyn/drivers/videocore4_stdlib/experimental/log.cl
Raziel K. Crowe 1c4c363d5c VC4Stdlib
2022-09-09 19:57:08 +05:00

257 lines
6.5 KiB
Common Lisp

#define arg_t float16
#define result_t float16
#define int_t int16
/*
* Helper, arithmetic-geometric-mean,
*
* https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean
*/
result_t agm(arg_t x, arg_t y)
{
arg_t arithm = x;
arg_t geom = y;
for(unsigned iteration = 0; iteration < 6; ++iteration) // TODO can adjust number of iterations
{
arg_t arithm_new = (arithm + geom) / (arg_t) 2.0;
geom = sqrt(arithm * geom);
arithm = arithm_new;
}
return arithm;
}
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
#define REDUCE_ARGUMENT_TO_0_1 \
/* log(S * M * 2^E) = log(S * M) + E log(2) */ \
int_t bitcast = CAT(as_, int_t)(val); \
/* deduct exponent offset, we use -126, to go into the range [0.5, 1) */ \
int_t exponent = ((bitcast >> 23) & 0xFF) - 126; \
/* mask off exponent and replace with exponent for range [0.5, 1) */ \
int_t signedMantissaBits = (bitcast & (int_t) 0x807FFFFF) | (int_t) 0x3F000000; \
arg_t mantissa = CAT(as_, result_t)(signedMantissaBits); \
result_t reduced = CAT(convert_, result_t)(exponent) * M_LN2_F;
/*
* Taylor-series,
*
* https://en.wikipedia.org/wiki/Mercator_series
*/
result_t log1p_taylor(arg_t val)
{
// ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
// converges for -1 < x <= 1 (requires argument reduction)
REDUCE_ARGUMENT_TO_0_1
// iteration 1
result_t result = mantissa;
arg_t power = mantissa;
#pragma loop unroll
for(unsigned iteration = 2; iteration <= 26; ++iteration) // TODO can adjust number of iterations
{
power *= mantissa;
arg_t sign = iteration & 1 ? (arg_t) 1.0 : (arg_t) -1.0;
result = result + sign * (arg_t) (1.0 / iteration) * power;
}
return result + reduced;
}
__kernel void log1p_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log1p_taylor(in[gid] - (arg_t) 1.0f);
}
result_t log1p_taylor_unrolled(arg_t val)
{
// ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
// converges for -1 < x <= 1 (requires argument reduction)
REDUCE_ARGUMENT_TO_0_1
// iteration 1
result_t result = mantissa;
arg_t power = mantissa;
// iteration 2
power *= mantissa;
result = result - (arg_t) (1.0 / 2.0) * power;
// iteration 3
power *= mantissa;
result = result + (arg_t) (1.0 / 3.0) * power;
// iteration 4
power *= mantissa;
result = result - (arg_t) (1.0 / 4.0) * power;
// iteration 5
power *= mantissa;
result = result + (arg_t) (1.0 / 5.0) * power;
// iteration 6
power *= mantissa;
result = result - (arg_t) (1.0 / 6.0) * power;
// iteration 7
power *= mantissa;
result = result + (arg_t) (1.0 / 7.0) * power;
// iteration 8
power *= mantissa;
result = result - (arg_t) (1.0 / 8.0) * power;
// iteration 9
power *= mantissa;
result = result + (arg_t) (1.0 / 9.0) * power;
// iteration 10
power *= mantissa;
result = result - (arg_t) (1.0 / 10.0) * power;
// iteration 11
power *= mantissa;
result = result + (arg_t) (1.0 / 11.0) * power;
// iteration 12
power *= mantissa;
result = result - (arg_t) (1.0 / 12.0) * power;
// iteration 13
power *= mantissa;
result = result + (arg_t) (1.0 / 13.0) * power;
// iteration 14
power *= mantissa;
result = result - (arg_t) (1.0 / 14.0) * power;
// iteration 15
power *= mantissa;
result = result + (arg_t) (1.0 / 15.0) * power;
// iteration 16
power *= mantissa;
result = result - (arg_t) (1.0 / 16.0) * power;
// iteration 17
power *= mantissa;
result = result + (arg_t) (1.0 / 17.0) * power;
// iteration 18
power *= mantissa;
result = result - (arg_t) (1.0 / 18.0) * power;
// iteration 19
power *= mantissa;
result = result + (arg_t) (1.0 / 19.0) * power;
// iteration 20
power *= mantissa;
result = result - (arg_t) (1.0 / 20.0) * power;
// iteration 21
power *= mantissa;
result = result + (arg_t) (1.0 / 21.0) * power;
// iteration 22
power *= mantissa;
result = result - (arg_t) (1.0 / 22.0) * power;
// iteration 23
power *= mantissa;
result = result + (arg_t) (1.0 / 23.0) * power;
// iteration 24
power *= mantissa;
result = result - (arg_t) (1.0 / 24.0) * power;
// iteration 25
power *= mantissa;
result = result + (arg_t) (1.0 / 25.0) * power;
// iteration 26
power *= mantissa;
result = result - (arg_t) (1.0 / 26.0) * power;
// TODO can adjust number of iterations
return result + reduced;
}
__kernel void log1p_taylor_unrolled_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log1p_taylor_unrolled(in[gid] - (arg_t) 1.0f);
}
/*
* Taylor series with optimization, requires argument reduction,
*
* https://math.stackexchange.com/a/3383716
*/
result_t log_taylor(arg_t val)
{
REDUCE_ARGUMENT_TO_0_1
result_t result = 0;
// iteration 1
arg_t tmp = 2 * (mantissa - (arg_t) 1.0) / (mantissa + (arg_t) 1.0);
arg_t factor = tmp * tmp;
#pragma loop unroll
for(unsigned iteration = 1; iteration <= 26; iteration += 2) // TODO can adjust number of iterations
{
result += tmp / (arg_t) iteration;
tmp *= factor;
}
return result + reduced;
}
__kernel void log_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log_taylor(in[gid]);
}
/*
* Using the arithmetic-geometric-mean,
*
* https://en.wikipedia.org/wiki/Natural_logarithm#High_precision
*/
result_t log_agm(arg_t val)
{
const unsigned m = 8; // TODO can adjust for precision
arg_t s = val * (arg_t) (1 << m);
arg_t mean = agm(1.0, (arg_t) 4.0 / s);
return (val * M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2);
}
__kernel void log_agm_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log_agm(in[gid]);
}
result_t log_agm_reduced(arg_t val)
{
REDUCE_ARGUMENT_TO_0_1
const unsigned m = 8; // TODO can adjust for precision
arg_t s = mantissa * (arg_t) (1 << m);
arg_t mean = agm(1.0, (arg_t) 4.0 / s);
return (mantissa * M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2) + reduced;
}
__kernel void log_agm_reduced_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log_agm_reduced(in[gid]);
}
__kernel void log_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log(in[gid]);
}