#define arg_t float16
#define result_t float16
#define int_t int16

/*
 * Helper, arithmetic-geometric-mean,
 *
 * https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean
 */
result_t agm(arg_t x, arg_t y)
{
	arg_t arithm = x;
	arg_t geom = y;
	for(unsigned iteration = 0; iteration < 6; ++iteration) // TODO can adjust number of iterations
	{
		arg_t arithm_new = (arithm + geom) / (arg_t) 2.0;
		geom = sqrt(arithm * geom);
		arithm = arithm_new;
	}
	return arithm;
}

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

#define REDUCE_ARGUMENT_TO_0_1                                                                                         \
	/* log(S * M * 2^E) = log(S * M) + E log(2) */                                                                     \
	int_t bitcast = CAT(as_, int_t)(val);                                                                              \
	/* deduct exponent offset, we use -126, to go into the range [0.5, 1) */                                           \
	int_t exponent = ((bitcast >> 23) & 0xFF) - 126;                                                                   \
	/* mask off exponent and replace with exponent for range [0.5, 1) */                                               \
	int_t signedMantissaBits = (bitcast & (int_t) 0x807FFFFF) | (int_t) 0x3F000000;                                    \
	arg_t mantissa = CAT(as_, result_t)(signedMantissaBits);                                                           \
	result_t reduced = CAT(convert_, result_t)(exponent) * M_LN2_F;

/*
 * Taylor-series,
 *
 * https://en.wikipedia.org/wiki/Mercator_series
 */
result_t log1p_taylor(arg_t val)
{
	// ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
	// converges for -1 < x <= 1 (requires argument reduction)

	REDUCE_ARGUMENT_TO_0_1

	// iteration 1
	result_t result = mantissa;
	arg_t power = mantissa;
#pragma loop unroll
	for(unsigned iteration = 2; iteration <= 26; ++iteration) // TODO can adjust number of iterations
	{
		power *= mantissa;
		arg_t sign = iteration & 1 ? (arg_t) 1.0 : (arg_t) -1.0;
		result = result + sign * (arg_t) (1.0 / iteration) * power;
	}
	return result + reduced;
}

__kernel void log1p_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint gid = get_global_id(0);
	out[gid] = log1p_taylor(in[gid] - (arg_t) 1.0f);
}

result_t log1p_taylor_unrolled(arg_t val)
{
	// ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
	// converges for -1 < x <= 1 (requires argument reduction)

	REDUCE_ARGUMENT_TO_0_1

	// iteration 1
	result_t result = mantissa;
	arg_t power = mantissa;

	// iteration 2
	power *= mantissa;
	result = result - (arg_t) (1.0 / 2.0) * power;

	// iteration 3
	power *= mantissa;
	result = result + (arg_t) (1.0 / 3.0) * power;

	// iteration 4
	power *= mantissa;
	result = result - (arg_t) (1.0 / 4.0) * power;

	// iteration 5
	power *= mantissa;
	result = result + (arg_t) (1.0 / 5.0) * power;

	// iteration 6
	power *= mantissa;
	result = result - (arg_t) (1.0 / 6.0) * power;

	// iteration 7
	power *= mantissa;
	result = result + (arg_t) (1.0 / 7.0) * power;

	// iteration 8
	power *= mantissa;
	result = result - (arg_t) (1.0 / 8.0) * power;

	// iteration 9
	power *= mantissa;
	result = result + (arg_t) (1.0 / 9.0) * power;

	// iteration 10
	power *= mantissa;
	result = result - (arg_t) (1.0 / 10.0) * power;

	// iteration 11
	power *= mantissa;
	result = result + (arg_t) (1.0 / 11.0) * power;

	// iteration 12
	power *= mantissa;
	result = result - (arg_t) (1.0 / 12.0) * power;

	// iteration 13
	power *= mantissa;
	result = result + (arg_t) (1.0 / 13.0) * power;

	// iteration 14
	power *= mantissa;
	result = result - (arg_t) (1.0 / 14.0) * power;

	// iteration 15
	power *= mantissa;
	result = result + (arg_t) (1.0 / 15.0) * power;

	// iteration 16
	power *= mantissa;
	result = result - (arg_t) (1.0 / 16.0) * power;

	// iteration 17
	power *= mantissa;
	result = result + (arg_t) (1.0 / 17.0) * power;

	// iteration 18
	power *= mantissa;
	result = result - (arg_t) (1.0 / 18.0) * power;

	// iteration 19
	power *= mantissa;
	result = result + (arg_t) (1.0 / 19.0) * power;

	// iteration 20
	power *= mantissa;
	result = result - (arg_t) (1.0 / 20.0) * power;

	// iteration 21
	power *= mantissa;
	result = result + (arg_t) (1.0 / 21.0) * power;

	// iteration 22
	power *= mantissa;
	result = result - (arg_t) (1.0 / 22.0) * power;

	// iteration 23
	power *= mantissa;
	result = result + (arg_t) (1.0 / 23.0) * power;

	// iteration 24
	power *= mantissa;
	result = result - (arg_t) (1.0 / 24.0) * power;

	// iteration 25
	power *= mantissa;
	result = result + (arg_t) (1.0 / 25.0) * power;

	// iteration 26
	power *= mantissa;
	result = result - (arg_t) (1.0 / 26.0) * power;

	// TODO can adjust number of iterations

	return result + reduced;
}

__kernel void log1p_taylor_unrolled_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint gid = get_global_id(0);
	out[gid] = log1p_taylor_unrolled(in[gid] - (arg_t) 1.0f);
}

/*
 * Taylor series with optimization, requires argument reduction,
 *
 * https://math.stackexchange.com/a/3383716
 */
result_t log_taylor(arg_t val)
{
	REDUCE_ARGUMENT_TO_0_1

	result_t result = 0;
	// iteration 1
	arg_t tmp = 2 * (mantissa - (arg_t) 1.0) / (mantissa + (arg_t) 1.0);
	arg_t factor = tmp * tmp;
#pragma loop unroll
	for(unsigned iteration = 1; iteration <= 26; iteration += 2) // TODO can adjust number of iterations
	{
		result += tmp / (arg_t) iteration;
		tmp *= factor;
	}
	return result + reduced;
}

__kernel void log_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint gid = get_global_id(0);
	out[gid] = log_taylor(in[gid]);
}

/*
 * Using the arithmetic-geometric-mean,
 *
 * https://en.wikipedia.org/wiki/Natural_logarithm#High_precision
 */
result_t log_agm(arg_t val)
{
	const unsigned m = 8; // TODO can adjust for precision
	arg_t s = val * (arg_t) (1 << m);
	arg_t mean = agm(1.0, (arg_t) 4.0 / s);
	return (val * M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2);
}

__kernel void log_agm_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint gid = get_global_id(0);
	out[gid] = log_agm(in[gid]);
}

result_t log_agm_reduced(arg_t val)
{
	REDUCE_ARGUMENT_TO_0_1

	const unsigned m = 8; // TODO can adjust for precision
	arg_t s = mantissa * (arg_t) (1 << m);
	arg_t mean = agm(1.0, (arg_t) 4.0 / s);
	return (mantissa * M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2) + reduced;
}

__kernel void log_agm_reduced_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint gid = get_global_id(0);
	out[gid] = log_agm_reduced(in[gid]);
}

__kernel void log_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint gid = get_global_id(0);
	out[gid] = log(in[gid]);
}