3
0
mirror of https://github.com/Qortal/Brooklyn.git synced 2025-02-01 07:42:18 +00:00
Brooklyn/drivers/videocore4_stdlib/experimental/cbrt.cl
Raziel K. Crowe 1c4c363d5c VC4Stdlib
2022-09-09 19:57:08 +05:00

92 lines
2.5 KiB
Common Lisp

#define arg_t float16
#define result_t float16
#define int_t int16
#define uint_t uint16
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
result_t approx_rootn(arg_t x, int_t n)
{
// Divides the exponent by n and emplaces it back into the number
// Adapted from: https://web.archive.org/web/20131227144655/http://metamerist.com/cbrt/cbrt.htm
int_t i = CAT(as_, int_t)(x);
int_t exp = (i - (int_t) (127 << 23)) / n + (int_t) (127 << 23);
return CAT(as_, result_t)((i & (int_t) 0x807FFFFF) | (exp));
}
result_t approx_cbrt(arg_t f)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
uint_t p = CAT(as_, uint_t)(f);
p = p / 3 + 709921077;
return CAT(as_, result_t)(p);
}
result_t cbrt_halley_step(arg_t x, arg_t base)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t x3 = x * x * x;
return x * (x3 + base + base) / (x3 + x3 + base);
}
result_t cbrt_halley(arg_t val)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
arg_t abs = fabs(val);
arg_t approx = approx_rootn(abs, 3);
result_t result = approx;
#pragma loop unroll
for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
{
result = cbrt_halley_step(result, val);
}
return copysign(result, val);
}
__kernel void cbrt_halley_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = cbrt_halley(in[gid]);
}
result_t cbrt_newton_step(arg_t x, arg_t base)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
return x - (1.0f / 3.0f) * (x - base / (x * x));
}
result_t cbrt_newton(arg_t val)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
arg_t abs = fabs(val);
arg_t approx = approx_cbrt(abs);
result_t result = approx;
#pragma loop unroll
for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
{
result = cbrt_newton_step(result, val);
}
return copysign(result, val);
}
__kernel void cbrt_newton_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = cbrt_newton(in[gid]);
}
__kernel void cbrt_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = cbrt(in[gid]);
}
__kernel void cbrt_pow_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = pow(in[gid], 1.0f / 3.0f);
}