forked from Qortal/Brooklyn
1667 lines
52 KiB
C
1667 lines
52 KiB
C
/*
|
||
* Author: doe300
|
||
*
|
||
* See the file "LICENSE" for the full license governing this code.
|
||
*/
|
||
|
||
#ifndef VC4CL_MATH_H
|
||
#define VC4CL_MATH_H
|
||
|
||
#include "_common.h"
|
||
#include "_intrinsics.h"
|
||
#include "_float_float.h"
|
||
|
||
// TODO test-cases for all the known Edge Case Behavior
|
||
// TODO can remove nan-check from rounding functions? The large value check should also contain NaNs,
|
||
// since VC4 considers NaNs to be the "largest" values
|
||
// TODO recheck NaN handling for functions based on multiplication/division once they handle NaNs correctly,
|
||
// maybe we can remove some of those extra codes
|
||
|
||
/*
|
||
* Remove some macros for more efficient versions
|
||
*/
|
||
#undef rsqrt // via own function
|
||
#undef ceil
|
||
#undef fabs
|
||
#undef floor
|
||
#undef fma
|
||
#undef rint
|
||
#undef round
|
||
#undef trunc
|
||
#undef half_cos
|
||
#undef half_divide
|
||
#undef half_exp
|
||
#undef half_exp2
|
||
#undef half_exp10
|
||
#undef half_log
|
||
#undef half_log2
|
||
#undef half_log10
|
||
#undef half_powr
|
||
#undef half_recip
|
||
#undef half_rsqrt
|
||
#undef half_sin
|
||
#undef half_sqrt
|
||
#undef half_tan
|
||
#undef native_cos
|
||
#undef native_divide // SFU RECIP
|
||
#undef native_exp // via SFU EXP2
|
||
#undef native_exp2 // SFU EXP2
|
||
#undef native_exp10 // via SFU EXP2
|
||
#undef native_log // via SFU LOG2
|
||
#undef native_log2 // SFU LOG2
|
||
#undef native_log10 // via SFU LOG2
|
||
#undef native_powr // via SFU_EXP2
|
||
#undef native_recip // SFU RECIP
|
||
#undef native_rsqrt // SFU RSQRT
|
||
#undef native_sin
|
||
#undef native_sqrt // via SFU RECIP and SFU RSQRT
|
||
#undef native_tan
|
||
|
||
// vc4cl_split(double) of M_LN2
|
||
#define M_LN2_FF 0xB102E3083F317218
|
||
// 1 / log10(e)
|
||
#define M_1_LOG10_E_F 2.30258512496948242188f
|
||
|
||
INLINE int factorial(int n) CONST
|
||
{
|
||
return (n == 1 || n == 0) ? 1 : factorial(n - 1) * n;
|
||
}
|
||
|
||
COMPLEX_3_SCALAR(float, fast_pown, float, val, uint, n, uchar, width, {
|
||
// reference: http://dimacs.rutgers.edu/archive/Workshops/Security/program2/quisquater/node3.html (method I)
|
||
result_t result = 1.0f;
|
||
result_t tmp = val;
|
||
for(uchar i = 0; i < width; ++i)
|
||
{
|
||
// if n[i] == 1 then z = z * y
|
||
result *= ((n >> i) & 1) ? tmp : 1.0f;
|
||
// y = y * y
|
||
tmp = tmp * tmp;
|
||
}
|
||
return result;
|
||
})
|
||
|
||
COMPLEX_1(float, vc4cl_pow2, int, val, {
|
||
// y = 2^x = 1.0 [implied] * 2^(x + offset)
|
||
int_t tmp = val << 23;
|
||
// alternative: tmp = (val + 127) << 23;
|
||
tmp += (int_t) 0x3F800000;
|
||
return vc4cl_bitcast_float(tmp & (int_t) 0x7F800000);
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* acos(1) = +-0
|
||
* acos(x) = NaN for |x| > 1
|
||
*/
|
||
COMPLEX_1(float, acos, float, val, {
|
||
result_t result = M_PI_2_F - asin(val);
|
||
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* acosh(1) = +0
|
||
* acosh(x) = NaN for x < 1
|
||
* acosh(+Inf) = +Inf
|
||
*/
|
||
// https://en.wikipedia.org/wiki/Inverse_hyperbolic_functions#Series_expansions
|
||
// or: https://en.wikipedia.org/wiki/Hyperbolic_function#Inverse_functions_as_logarithms
|
||
// XXX optimize?!
|
||
SIMPLE_1(float, acosh, float, val, log(val + sqrt(val * val + 1)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* acospi(1) = +-0
|
||
* acospi(x) = NaN for |x| > 1
|
||
*/
|
||
COMPLEX_1(float, acospi, float, val, {
|
||
result_t result = acos(val) * M_1_PI_F;
|
||
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* asin(+-0) = +-0
|
||
* asin(x) = NaN for |x| > 1
|
||
*/
|
||
COMPLEX_1(float, asin, float, val, {
|
||
result_t result = atan(val *rsqrt(1 - val * val));
|
||
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* asinh(+-0) = +-0
|
||
* asinh(+-Inf) = +-Inf
|
||
*/
|
||
// XXX optimize?!
|
||
SIMPLE_1(float, asinh, float, val, log(val + sqrt(val * val - 1)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* asinpi(+-0) = +-0
|
||
* asinpi(x) = NaN for |x| > 1
|
||
*/
|
||
COMPLEX_1(float, asinpi, float, val, {
|
||
result_t result = asin(val) * M_1_PI_F;
|
||
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
// https://stackoverflow.com/questions/23047978/how-is-arctan-implemented
|
||
// https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence (too inaccurate!)
|
||
// http://www2.mae.ufl.edu/~uhk/ARCTAN-APPROX-PAPER.pdf (atan(x) = 1/x * F(1/x))
|
||
// http://www.jjj.de/fxt/fxtbook.pdf
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* atan(+-0) = +-0
|
||
* atan(+-Inf) = +-pi/2
|
||
*
|
||
* Argument reduction:
|
||
* 1) atan(-x) = -atan(x)
|
||
* 2) atan(1/x) = PI/2 - atan(x), x > 0
|
||
* 3) atan(1/x) = -PI/2 - atan(x), x < 0
|
||
* 4) atan(x) = 2 atan(x / (1 + sqrt(1 + x^2)))
|
||
*
|
||
* Matters Computational (section 32.1.2.3): Pade Approximation (6 steps)
|
||
* Has an error of ~ 0 for |x| <= 1/8 * PI:
|
||
* (1155∙x + 1190∙x^3 + 231∙x^5)/(1155 + 1575∙x^2+525∙x^4+25∙x^6)
|
||
*
|
||
* Alternatively could be calculated via:
|
||
* atan(x) = atan2(x, 1)
|
||
*/
|
||
COMPLEX_1(float, atan, float, val, {
|
||
// reduce arguments a few times (r = r/(1+sqrt(1+r^2)))
|
||
uint n = 2;
|
||
result_t r = val;
|
||
for(uint i = 0; i < n; ++i)
|
||
r = r / (1 + sqrt(1 + r * r));
|
||
|
||
// compute the pade approximation
|
||
result_t approx = (1155 * r + 1190 * r * r * r + 231 * r * r * r * r * r) /
|
||
(1155 + 1575 * r * r + 525 * r * r * r * r + 25 * r * r * r * r * r * r);
|
||
|
||
// calculate the result (2^n * approx)
|
||
result_t result = ((result_t)(2 << (n - 1))) * approx;
|
||
// atan(x) with x >= 2^26 ~ atan(Inf) => pi/2
|
||
result = (fabs(val) >= (result_t)0x1p+26f) ? copysign(M_PI_2_F, val) : result;
|
||
// atan(+-NaN) = +-NaN
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* https://en.wikipedia.org/wiki/Atan2#Definition_and_computation
|
||
*
|
||
* Expected behavior:
|
||
*
|
||
* atan2(+-0, -0) = +-pi
|
||
* atan2(+-0, +0) = +-0
|
||
* atan2(+-0, x) = +-pi for x < 0
|
||
* atan2(+-0, x) = +-0 for x > 0
|
||
* atan2(y, +-0) = -pi/2 for y < 0
|
||
* atan2(y, +-0) = pi/2 for y > 0
|
||
* atan2(+-y, -Inf) = +-pi for y > 0 and y != Inf
|
||
* atan2(+-y, +Inf) = +-0 for y > 0 and y != Inf
|
||
* atan2(+-Inf, x) = +-pi/2 for x != Inf
|
||
* atan2(+-Inf, -Inf) = +-3/4pi
|
||
* atan2(+-Inf, +Inf) = +-pi/4
|
||
*/
|
||
COMPLEX_2(float, atan2, float, y, float, x, {
|
||
// note: these calculation work since the condition is either 0 or all bits set
|
||
result_t tan = atan(y / x);
|
||
result_t s = sign(y);
|
||
int_t xG0 = x > 0;
|
||
int_t xL0 = x < 0;
|
||
int_t yEq0 = y == 0;
|
||
// arctan(y / x) if x > 0
|
||
int_t res0 = xG0 & vc4cl_bitcast_int(tan);
|
||
// arctan(y / x) + sign(y) * pi if x < 0, y != 0
|
||
int_t res1 = xL0 & ~yEq0 & vc4cl_bitcast_int(tan + s * M_PI_F);
|
||
// pi if x < 0, y = 0
|
||
int_t res2 = xL0 & yEq0 & vc4cl_bitcast_int(M_PI_F);
|
||
// sign(y) * pi/2 if x = 0, y != 0
|
||
int_t res3 = (x == 0) & ~yEq0 & vc4cl_bitcast_int(s * M_PI_2_F);
|
||
// undefined if x = 0, y = 0
|
||
return vc4cl_bitcast_float(res0 | res1 | res2 | res3);
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* atanh(+-0) = +-0
|
||
* atanh(+-1) = +-Inf
|
||
* atanh(x) = NaN for |x| > 1
|
||
*/
|
||
SIMPLE_1(float, atanh, float, val, ((result_t) 0.5f) * log((1 + val) / (1 - val)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* atanpi(+-0) = +-0
|
||
* atanpi(+-Inf) = +-0.5
|
||
*/
|
||
SIMPLE_1(float, atanpi, float, val, isnan(val) ? val : atan(val) * M_1_PI_F)
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* atan2pi(+-0, -0) = +-1
|
||
* atan2pi(+-0, +0) = +-0
|
||
* atan2pi(+-0, x) = +-1 for x < 0
|
||
* atan2pi(+-0, x) = +-0 for x > 0
|
||
* atan2pi(y, +-0) = -0.5 for y < 0
|
||
* atan2pi(y, +-0) = 0.5 for y > 0
|
||
* atan2pi(+-y, -Inf) = +-1 for y > 0
|
||
* atan2pi(+-y, +Inf) = +-0 for y > 0
|
||
* atan2pi(+-Inf, x) = +-0.5
|
||
* atan2pi(+-Inf, -Inf) = +-0.75
|
||
* atan2pi(+-Inf, +Inf) = +-0.25
|
||
*/
|
||
SIMPLE_2(float, atan2pi, float, x, float, y, atan2(x, y) * M_1_PI_F)
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* cbrt(+-0) = +-0
|
||
* cbrt(+-Inf) = +-Inf
|
||
*/
|
||
// TODO different algorithm?
|
||
// e.g. http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt (acbrt1 with a few more Newton steps, but requires several
|
||
// floating-point divisions)
|
||
SIMPLE_1(float, cbrt, float, val, pow(val, 1.0f / 3.0f))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* ceil(+-0) = +-0
|
||
* ceil(+-Inf) = +-Inf
|
||
* ceil(x) = -0 for -1 < x < 0
|
||
*/
|
||
COMPLEX_1(float, ceil, float, val, {
|
||
//"Round to integer toward + infinity. "
|
||
|
||
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
|
||
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
|
||
// all floating point values |val| > 2^23 are exact numbers
|
||
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
|
||
// |val| < 2^23 fits into integer
|
||
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
|
||
// if the truncation is smaller than val (positive numbers), add 1 to round up
|
||
result_t ceiling = truncated + (truncated < val ? (result_t)1.0f: (result_t)0.0f);
|
||
// fix for ceil([-0.9, -0.0]) = -0.0
|
||
result_t result = copysign(ceiling, val);
|
||
// fix for negative subnormal values
|
||
result = result < val ? result + (result_t) 1.0f : result;
|
||
// deciding here which value to return saves us up to two jumps
|
||
return tooBig ? val : result;
|
||
})
|
||
|
||
//"Returns x with its sign changed to match the sign of y. "
|
||
// sign = y & 0x80000000
|
||
// tmp = x & 0x7FFFFFFF
|
||
// return tmp | sign
|
||
SIMPLE_2(float, copysign, float, x, float, y,
|
||
vc4cl_bitcast_float((vc4cl_bitcast_uint(y) & 0x80000000) | (vc4cl_bitcast_uint(x) & 0x7FFFFFFF)))
|
||
|
||
COMPLEX_1(float, cos_pade, float, val, {
|
||
/*
|
||
* Pade approximation
|
||
* (https://ir.canterbury.ac.nz/bitstream/handle/10092/8886/brookes_thesis.pdf?sequence=1, page 86)
|
||
*
|
||
* Has a relative error of far less than 2^23 (1 ULP) for [-PI/2, PI/2], then rises fast
|
||
*
|
||
* Alternative:
|
||
* (see alternative for sin_pade)
|
||
*/
|
||
arg_t a = 45469.0f * val * val * val * val * val * val * val * val;
|
||
arg_t b = 7029024.0f * val * val * val * val * val * val;
|
||
arg_t c = 348731040.0f * val * val * val * val;
|
||
arg_t d = 5269904640.0f * val * val;
|
||
arg_t e = 10983772800.0f;
|
||
arg_t f = 9336.0f * val * val * val * val * val * val;
|
||
arg_t g = 2064720.0f * val * val * val * val;
|
||
arg_t h = 221981760.0f * val * val;
|
||
arg_t up = a - b + c - d + e;
|
||
arg_t down = f + g + h + e;
|
||
return up / down;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* cos(+-0) = 1
|
||
* cos(+-Inf) = NaN for |x| > 1
|
||
*/
|
||
COMPLEX_1(float, cos, float, val, {
|
||
/*
|
||
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
|
||
*
|
||
* We use argument reduction to bring it into range [-pi/2, pi/2] in which range the Pade approximation is accurate.
|
||
*/
|
||
// TODO normalization into [-pi/2, pi/2] is too inaccurate for large values!
|
||
|
||
// Since cosine has a period of 2pi, these rewrites do not change the result (rounding error excluded):
|
||
// bring into range [-2pi, 2pi]
|
||
arg_t modifiedVal = fmod(val, 2.0f * M_PI_F);
|
||
// bring into range [-pi, pi]
|
||
modifiedVal = modifiedVal < -M_PI_F ? modifiedVal + (2.0f * M_PI_F) : modifiedVal;
|
||
modifiedVal = modifiedVal > M_PI_F ? modifiedVal - (2.0f * M_PI_F) : modifiedVal;
|
||
|
||
// We move by half a period, so we must invert the sign of the result
|
||
// bring into range [-pi/2, pi/2]
|
||
int_t invertSign = modifiedVal < -M_PI_2_F || modifiedVal > M_PI_2_F;
|
||
modifiedVal = modifiedVal < -M_PI_2_F ? modifiedVal + M_PI_F : modifiedVal;
|
||
modifiedVal = modifiedVal > M_PI_2_F ? modifiedVal - M_PI_F : modifiedVal;
|
||
|
||
arg_t result = cos_pade(invertSign ? -modifiedVal : modifiedVal);
|
||
result = invertSign ? -result : result;
|
||
result = isinf(val) ? copysign(nan(0), val) : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* cosh(+-0) = 1
|
||
* cosh(+-Inf) = +-Inf
|
||
*/
|
||
COMPLEX_1(float, cosh, float, val, {
|
||
/*
|
||
* https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence
|
||
*
|
||
* This version (the Taylor-series up to x^16/16!) has following accuracy:
|
||
*
|
||
* cosh(PI) -> error of 1.5*10^-7 < ulp(PI) = 2*10^-7
|
||
* cosh(0.5f) -> error of < 10^-9 < ulp(0.5f) = 5*10^-8
|
||
* cosh(0.0f) -> error of < 10^-46 < ulp(0.0f) = 1*10^-45
|
||
*
|
||
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
|
||
*/
|
||
const result_t fac2 = 1.0f / factorial(2);
|
||
const result_t fac4 = 1.0f / factorial(4);
|
||
const result_t fac6 = 1.0f / factorial(6);
|
||
const result_t fac8 = 1.0f / factorial(8);
|
||
const result_t fac10 = 1.0f / factorial(10);
|
||
const result_t fac12 = 1.0f / factorial(12);
|
||
const result_t fac14 = 1.0f / factorial(14);
|
||
const result_t fac16 = 1.0f / factorial(16);
|
||
|
||
/*
|
||
* Argument reduction
|
||
*
|
||
* Matters Computational, section 32.2.3
|
||
*/
|
||
const uint n = 12;
|
||
// r = x / 2^n
|
||
const result_t inv2N = 1.0f / (1 << n);
|
||
result_t r = val * inv2N;
|
||
|
||
// C = cosh(r) - 1
|
||
result_t x = r;
|
||
|
||
result_t x2 = x * x;
|
||
result_t tmp1 = x2 * fac2;
|
||
// cosh(x) = 1 + x^2/2! + ...
|
||
result_t tmp = 1 + tmp1;
|
||
result_t x4 = x2 * x * x;
|
||
tmp1 = x4 * fac4;
|
||
//... + x^4/4! + ...
|
||
tmp = x + tmp1;
|
||
result_t x6 = x4 * x * x;
|
||
tmp1 = x6 * fac6;
|
||
//... + x^6/6! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x8 = x6 * x * x;
|
||
tmp1 = x8 + fac8;
|
||
//... + x^8/8! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x10 = x8 * x * x;
|
||
tmp1 = x10 * fac10;
|
||
//... + x^10/10! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x12 = x10 * x * x;
|
||
tmp1 = x12 + fac12;
|
||
//... + x^12/12! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x14 = x12 * x * x;
|
||
tmp1 = x14 * fac14;
|
||
//... + x^14/14! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x16 = x14 * x * x;
|
||
tmp1 = x16 + fac16;
|
||
//... + x^16/16!
|
||
tmp = tmp + tmp1;
|
||
|
||
result_t C = tmp - 1;
|
||
for(uint i = 0; i < n; ++i)
|
||
C = 2 * (C + 1) * (C + 1) - 2;
|
||
return C + 1;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* cospi(+-0) = 1
|
||
* cospi(x) = +0 for x = n + 0.5 and n is integer
|
||
* cospi(+-Inf) = NaN
|
||
*/
|
||
SIMPLE_1(float, cospi, float, val, cos(val *M_PI_F))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* erfc(-Inf) = 2
|
||
* erfc(+Inf) = 0
|
||
*/
|
||
SIMPLE_1(float, erfc, float, x, 1 - erf(x))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* erf(+-0) = +-0
|
||
* erf(+-Inf) = +-1
|
||
*/
|
||
COMPLEX_1(float, erf, float, x, {
|
||
/*
|
||
* https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
|
||
*
|
||
* has a maximal error of 1.2 * 10^-7
|
||
*
|
||
* alternative: http://mathworld.wolfram.com/Erf.html
|
||
*/
|
||
const result_t t = 1 / (1 + 0.5f * fabs(x));
|
||
|
||
const result_t r = t *
|
||
exp(-x * x - 1.26551223f + 1.00002368f * t + 0.37409196f * t * t + 0.09678418f * t * t * t -
|
||
0.18628806f * t * t * t * t + 0.27886807f * t * t * t * t * t - 1.13520398f * t * t * t * t * t * t +
|
||
1.48851587f * t * t * t * t * t * t * t - 0.82215223f * t * t * t * t * t * t * t * t +
|
||
0.17087277f * t * t * t * t * t * t * t * t * t);
|
||
|
||
return x < 0 ? r - 1 : 1 - r;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* exp(+-0) = 1
|
||
* exp(-Inf) = 0
|
||
* exp(+Inf) = +Inf
|
||
*
|
||
* Chebyshev interpolation with monomial basis and range reduction,
|
||
*
|
||
* https://www.pseudorandom.com/implementing-exp#section-18
|
||
*/
|
||
COMPLEX_1(float, exp, float, val, {
|
||
|
||
arg_t positive = fabs(val);
|
||
|
||
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
|
||
|
||
// Use floor() instead of ceil() to not reach Inf later on for e^(~10^38)
|
||
int_t k = vc4cl_ftoi((positive / M_LN2_F) - 0.5f);
|
||
arg_t kFloat = vc4cl_itof(k);
|
||
// calculate in extended precision to reduce rounding error
|
||
arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));
|
||
|
||
arg_t pn = 1.143364767943110e-11;
|
||
|
||
pn = pn * r + 1.632461784798319e-10f;
|
||
pn = pn * r + 2.088459690899721e-9f;
|
||
pn = pn * r + 2.504861486483735e-8f;
|
||
pn = pn * r + 2.755715675968011e-7f;
|
||
pn = pn * r + 2.755734045527853e-6f;
|
||
pn = pn * r + 2.480158866546844e-5f;
|
||
pn = pn * r + 1.984126978734782e-4f;
|
||
pn = pn * r + 0.001388888888388f;
|
||
pn = pn * r + 0.008333333333342f;
|
||
pn = pn * r + 0.041666666666727f;
|
||
pn = pn * r + 0.166666666666680f;
|
||
pn = pn * r + 0.500000000000002f;
|
||
pn = pn * r + 1.000000000000000f;
|
||
pn = pn * r + 1.000000000000000f;
|
||
|
||
pn = pn * vc4cl_pow2(k);
|
||
result_t result = val < 0 ? 1 / pn : pn;
|
||
|
||
result = vc4cl_is_zero(val) ? (result_t)1.0f : result;
|
||
result = val <= (result_t)-88.0f ? (result_t)0.0f : result;
|
||
result = val >= (result_t)89.0f ? (result_t)INFINITY : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* exp2(+-0) = 1
|
||
* exp2(-Inf) = 0
|
||
* exp2(+Inf) = +Inf
|
||
*/
|
||
|
||
COMPLEX_1(float, exp2, float, val, {
|
||
// 2^x = e^(x * ln(2))
|
||
result_t result = exp(val * M_LN2_F);
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* exp10(+-0) = 1
|
||
* exp10(-Inf) = 0
|
||
* exp10(+Inf) = +Inf
|
||
*/
|
||
COMPLEX_1(float, exp10, float, val, {
|
||
// 10^x = e^(x / log10(e))
|
||
result_t result = exp(val * M_1_LOG10_E_F);
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* expm1(+-0) = +-0
|
||
* expm1(-Inf) = -1
|
||
* expm1(+Inf) = +Inf
|
||
*/
|
||
// e^x - 1.0
|
||
COMPLEX_1(float, expm1, float, val, {
|
||
result_t result = exp(val) - 1.0f;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* fabs(+-0) = +0
|
||
* fabs(+-Inf) = +Inf
|
||
*/
|
||
SIMPLE_1(float, fabs, float, val, vc4cl_fmaxabs(val, 0.0f))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* fdim(x, NaN) = NaN
|
||
* fdim(NaN, y) = NaN
|
||
*/
|
||
//"x-y if x>y, +0 if x is less than or equal to y."
|
||
COMPLEX_2(float, fdim, float, x, float, y, {
|
||
result_t result = x > y ? x - y : 0.0f;
|
||
result = isnan(x) ? x : result;
|
||
return isnan(y) ? y : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* floor(+-0) = +-0
|
||
* floor(+-Inf) = +-Inf
|
||
*/
|
||
COMPLEX_1(float, floor, float, val, {
|
||
//" Round to integer toward negative infinity. "
|
||
|
||
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
|
||
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
|
||
// all floating point values |val| > 2^23 are exact numbers
|
||
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
|
||
// |val| < 2^23 fits into integer
|
||
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
|
||
// if the truncation is greater than val (negative numbers), subtract 1 to round down
|
||
result_t floor_val = truncated - (truncated > val ? (result_t)1.0f : (result_t)0.0f);
|
||
// deciding here which value to return saves us up to two jumps
|
||
return tooBig ? val : floor_val;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* fma(x, y, z) = xy + z
|
||
* fma(x, y, z) = NaN for x = 0 or y = 0
|
||
* fma(x, y, z)= NaN for x = Inf, y = Inf, z = Inf
|
||
*/
|
||
// TODO not "infinitely precise product" and maybe not "correctly rounded"
|
||
SIMPLE_3(float, fma, float, a, float, b, float, c, (a * b) + c)
|
||
|
||
// The fmax/min ALU operations (vc4cl_fmax/vc4cl_fmin) handle NaNs as bigger then all, we need to handle them as:
|
||
// " If one argument is a NaN, fmin/fmax() returns the other argument. If both arguments are NaNs, fmin/fmax() returns a NaN."
|
||
SIMPLE_2(float, fmax, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmax(x, y))
|
||
SIMPLE_2_SCALAR(float, fmax, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmax(x, y))
|
||
|
||
SIMPLE_2(float, fmin, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmin(x, y))
|
||
SIMPLE_2_SCALAR(float, fmin, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmin(x, y))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* fmod(+-0, y) = +-0 for y != 0
|
||
* fmod(x, y) = NaN for x = +-Inf or y = 0
|
||
* fmod(x, +-Inf) = x
|
||
* fmod(+-0, NaN) = NaN
|
||
*/
|
||
//"Modulus. Returns x - y * trunc(x/y)"
|
||
COMPLEX_2(float, fmod, float, x, float, y, {
|
||
result_t result = x - y * trunc(x / y);
|
||
result = (isinf(x) || sign(y) == (result_t)0.0f) ? (result_t)nan(0) : result;
|
||
result = (isinf(y) || isnan(x)) ? x : result;
|
||
return isnan(y) ? y : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* 0 <= fract(x, iptr) < 1
|
||
* fract(+0, iptr) = +0, iptr = +0
|
||
* fract(-0, iptr) = -0, iptr = -0
|
||
* fract(+Inf, iptr) = +0, iptr = +Inf
|
||
* fract(-Inf, iptr) = -0, iptr = -Inf
|
||
* fract(NaN, iptr) = NaN, iptr = NaN
|
||
*/
|
||
COMPLEX_2(float, fract, float, val, __global float, *iptr, {
|
||
//"Returns fmin( x – floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr."
|
||
result_t tmp = floor(val);
|
||
*iptr = tmp;
|
||
tmp = fmin(val - tmp, 0x1.fffffep-1f);
|
||
return isnan(val) ? val : tmp;
|
||
})
|
||
|
||
COMPLEX_2(float, fract, float, val, __local float, *iptr, {
|
||
//"Returns fmin( x – floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr."
|
||
result_t tmp = floor(val);
|
||
*iptr = tmp;
|
||
tmp = fmin(val - tmp, 0x1.fffffep-1f);
|
||
return isnan(val) ? val : tmp;
|
||
})
|
||
|
||
COMPLEX_2(float, fract, float, val, __private float, *iptr, {
|
||
//"Returns fmin( x – floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr."
|
||
result_t tmp = floor(val);
|
||
*iptr = tmp;
|
||
tmp = fmin(val - tmp, 0x1.fffffep-1f);
|
||
return isnan(val) ? val : tmp;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* frexp(+-0, exp) = +-0, exp = 0
|
||
* frexp(+-Inf, exp) = +-Inf, exp = 0
|
||
* frexp(NaN, exp) = NaN, exp = 0
|
||
*/
|
||
COMPLEX_2(float, frexp, float, x, __global int, *exp, {
|
||
// Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c
|
||
int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF;
|
||
int_t exponent = ((absBits >> 23) & 0xFF) - 126;
|
||
// we need to support subnormals here, since they are only used as input, not output
|
||
int_t mantissa = absBits & (int_t) 0x007FFFFF;
|
||
int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23));
|
||
// the next lines are required to fix-up the mantissa for our added subnormal exponent
|
||
int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset);
|
||
mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0);
|
||
exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent;
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = mantissa | (int_t)0x3F000000;
|
||
int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x);
|
||
*exp = specialCase ? (int_t)0 : exponent;
|
||
return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x);
|
||
})
|
||
|
||
COMPLEX_2(float, frexp, float, x, __local int, *exp, {
|
||
// Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c
|
||
int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF;
|
||
int_t exponent = ((absBits >> 23) & 0xFF) - 126;
|
||
// we need to support subnormals here, since they are only used as input, not output
|
||
int_t mantissa = absBits & (int_t) 0x007FFFFF;
|
||
int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23));
|
||
// the next lines are required to fix-up the mantissa for our added subnormal exponent
|
||
int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset);
|
||
mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0);
|
||
exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent;
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = mantissa | (int_t)0x3F000000;
|
||
int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x);
|
||
*exp = specialCase ? (int_t)0 : exponent;
|
||
return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x);
|
||
})
|
||
|
||
COMPLEX_2(float, frexp, float, x, __private int, *exp, {
|
||
// Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c
|
||
int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF;
|
||
int_t exponent = ((absBits >> 23) & 0xFF) - 126;
|
||
// we need to support subnormals here, since they are only used as input, not output
|
||
int_t mantissa = absBits & (int_t) 0x007FFFFF;
|
||
int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23));
|
||
// the next lines are required to fix-up the mantissa for our added subnormal exponent
|
||
int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset);
|
||
mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0);
|
||
exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent;
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = mantissa | (int_t)0x3F000000;
|
||
int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x);
|
||
*exp = specialCase ? (int_t)0 : exponent;
|
||
return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x);
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* hypot(x, y) = hypot(y, x) = hypot(x, -y)
|
||
* hypot(x, +-0) = fabs(x)
|
||
* hypot(+-Inf, y) = +Inf
|
||
*/
|
||
SIMPLE_2(float, hypot, float, x, float, y, sqrt(x *x + y * y))
|
||
|
||
/**
|
||
* Expected behavior (C99 standard):
|
||
*
|
||
* ilogb(0) = FP_ILOGB0
|
||
* ilogb(+-Inf) = INT_MAX
|
||
* ilogb(+-NaN) = FP_ILOGBNAN
|
||
*/
|
||
COMPLEX_1(int, ilogb, float, x, {
|
||
//"Return the exponent as an integer value."
|
||
// https://en.wikipedia.org/wiki/Single-precision_floating-point_format
|
||
result_t bitcast = vc4cl_bitcast_int(x);
|
||
// deduct exponent offset
|
||
int_t result = ((bitcast >> 23) & 0xFF) - 127;
|
||
// we need to support subnormals here, since they are only used as input, not output
|
||
int_t mantissa = bitcast & (int_t) 0x007FFFFF;
|
||
result = result == -127 ? -(vc4cl_clz(mantissa) - (32 - 23)) - 127 : result;
|
||
result = vc4cl_is_zero(x) ? (result_t)FP_ILOGB0 : result;
|
||
result = isinf(x) ? (result_t)INT_MAX : result;
|
||
return isnan(x) ? (result_t)FP_ILOGBNAN : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* ldexp(+-0, n) = +-0
|
||
* ldexp(x, 0) = x
|
||
* ldexp(+-Inf, n) = +-Inf
|
||
*/
|
||
//"Multiply x by 2 to the power k."
|
||
COMPLEX_2(float, ldexp, float, x, int, k, {
|
||
// Taken from MESA (https://gitlab.freedesktop.org/mesa/mesa/-/blob/master/src/compiler/nir/nir_opt_algebraic.py)
|
||
// Alternatively in https://git.musl-libc.org/cgit/musl/tree/src/math/scalbnf.c
|
||
int_t exp = min(max(k, -254), 254);
|
||
int_t pow2_1 = ((exp >> 1) + 127) << 23;
|
||
int_t pow2_2 = ((exp - (exp >> 1)) + 127) << 23;
|
||
result_t result = x * vc4cl_bitcast_float(pow2_1) * vc4cl_bitcast_float(pow2_2);
|
||
return vc4cl_is_inf_nan(x) ? x : result;
|
||
})
|
||
SIMPLE_2_SCALAR(float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof(1 << k) : 1.0f / vc4cl_itof(1 << -k)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* lgamma(1) = 0
|
||
* lgamma(2) = 0
|
||
* lgamma(x) = +Inf for x <= 0 and x integer
|
||
* lgamma(-Inf) = +Inf
|
||
* lgamma(+Inf) = +Inf
|
||
*/
|
||
COMPLEX_1(float, lgamma, float, val, {
|
||
//"Returns the natural logarithm of the absolute value of the gamma function"
|
||
// see Numerical Recipes, chapter 6.1
|
||
// has error of < 2 * 10^-10
|
||
|
||
// XXX only for val > 0
|
||
// TODO far too inaccurate/wrong?
|
||
result_t x;
|
||
result_t y;
|
||
result_t tmp;
|
||
result_t ser;
|
||
|
||
y = x = val;
|
||
tmp = x + 5.5f;
|
||
tmp -= (x + 0.5f) * log(tmp);
|
||
ser = 1.000000000190015f;
|
||
y += 1;
|
||
ser += 76.18009172947146f / y;
|
||
y += 1;
|
||
ser += -86.50532032941677f / y;
|
||
y += 1;
|
||
ser += 24.01409824083091f / y;
|
||
y += 1;
|
||
ser += -1.231739572450155f / y;
|
||
y += 1;
|
||
ser += 0.1208650973866179e-2f / y;
|
||
y += 1;
|
||
ser += -0.5395239384953e-5f / y;
|
||
return -tmp + log((2.5066282746310005f) * ser / x);
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* lgamma_r(x, signp) -> signp = 0 for x < 0
|
||
*/
|
||
COMPLEX_2(float, lgamma_r, float, x, __global int, *signp, {
|
||
// TODO better way?
|
||
result_t tmp = lgamma(x);
|
||
*signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1;
|
||
return lgamma(x < 0.0f ? (-1.0f * x) : x);
|
||
})
|
||
COMPLEX_2(float, lgamma_r, float, x, __local int, *signp, {
|
||
// TODO better way?
|
||
result_t tmp = lgamma(x);
|
||
*signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1;
|
||
return lgamma(x < 0.0f ? (-1.0f * x) : x);
|
||
})
|
||
COMPLEX_2(float, lgamma_r, float, x, __private int, *signp, {
|
||
// TODO better way?
|
||
result_t tmp = lgamma(x);
|
||
*signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1;
|
||
return lgamma(x < 0.0f ? (-1.0f * x) : x);
|
||
})
|
||
|
||
COMPLEX_1(float, log1p_pade, float, val, {
|
||
/*
|
||
* Calculates ln(1 + x) via Pade approximation
|
||
* (https://ir.canterbury.ac.nz/bitstream/handle/10092/8886/brookes_thesis.pdf?sequence=1, page 72/96)
|
||
*
|
||
* Has a relative error of less than 2*2^23 (2 ULP) for range [-0.5, 2], then rises fast
|
||
*/
|
||
// g(x) = (49x^6+1218x^5+7980x^4+20720x^3+23100x^2+9240x)/(10x^6+420x^5+4200x^4+16800x^3+31500x^2+27720x+9240)
|
||
arg_t a = 49.0f * val * val * val * val * val * val;
|
||
arg_t b = 1218.0f * val * val * val * val * val;
|
||
arg_t c = 7980.0f * val * val * val * val;
|
||
arg_t d = 20720.0f * val * val * val;
|
||
arg_t e = 23100.0f * val * val;
|
||
arg_t f = 9240.0f * val;
|
||
arg_t g = 10.0f * val * val * val * val * val * val;
|
||
arg_t h = 420.0f * val * val * val * val * val;
|
||
arg_t i = 4200.0f * val * val * val * val;
|
||
arg_t k = 16800.0f * val * val * val;
|
||
arg_t l = 31500.0f * val * val;
|
||
arg_t m = 27720.0f * val;
|
||
arg_t n = 9240.0f;
|
||
arg_t up = a + b + c + d + e + f;
|
||
arg_t down = g + h + i + k + l + m + n;
|
||
return up / down;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* log2(+-0) = -Inf
|
||
* log2(1) = 0
|
||
* log2(x) = Nan for x < 0
|
||
* log2(+Inf) = +Inf
|
||
*/
|
||
COMPLEX_1(float, log2, float, val, {
|
||
|
||
/* log2(x) = log2(M * e^E) = E + log2(M) */
|
||
int_t bitcast = vc4cl_bitcast_int(val);
|
||
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
|
||
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
|
||
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
|
||
arg_t mantissa = vc4cl_bitcast_float(tmp);
|
||
|
||
/*
|
||
* Alternatively could use SFU and Newton-Raphson steps:
|
||
* f(x) = log2(x) - y
|
||
* root: f(x) = 0 -> log2(x) = y
|
||
* f'(x) = 1/(x * ln(2))
|
||
* xn+1 = xn - (log2(xn) - y) / (1/(xn * ln(2)))
|
||
* xn+1 = xn - xn * ln(2) * (log2(x) - y)
|
||
* => but don't know y which is exact log2(x)!!!
|
||
*
|
||
* Alternatively could approximate root of: f(y) = 2^y - x, need exact exp2(x)
|
||
*/
|
||
// result_t logMantissa = vc4cl_sfu_log2(mantissa);
|
||
|
||
// log2(M*2^E) = E + log2(M) = E + ln(M) / ln(2) = E + ln1p(M - 1) / ln(2)
|
||
result_t logMantissa = log1p_pade(mantissa - 1.0f) / M_LN2_F;
|
||
|
||
result_t result = vc4cl_itof(exponent) + logMantissa;
|
||
result = vc4cl_is_inf_nan(val) ? val : result;
|
||
result = signbit(val) ? (result_t)nan(0) : result;
|
||
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
|
||
return result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* log(+-0) = -Inf
|
||
* log(1) = 0
|
||
* log(x) = Nan for x < 0
|
||
* log(+Inf) = +Inf
|
||
*/
|
||
COMPLEX_1(float, log, float, val, {
|
||
/*
|
||
* Other sources/calculations:
|
||
* - Matters computational (sections 32.1.1 and 32.1.3)
|
||
* - https://math.stackexchange.com/questions/977586/is-there-an-approximation-to-the-natural-log-function-at-large-values
|
||
* - https://math.stackexchange.com/questions/1382070/iterative-calculation-of-log-x
|
||
* - https://evoq-eval.siam.org/Portals/0/Publications/SIURO/Vol7/New_Method_for_Approximating_Logarithms.pdf?ver=2018-04-06-151916-493
|
||
* - https://cs.stackexchange.com/questions/91185/compute-ex-given-starting-approximation
|
||
* - https://shoueiko.wordpress.com/2017/05/01/approximation-of-log-2/
|
||
* - https://math.stackexchange.com/questions/3381629/what-is-the-fastest-algorithm-for-finding-the-natural-logarithm-of-a-big-number
|
||
* - https://stackoverflow.com/questions/39821367/very-fast-approximate-logarithm-natural-log-function-in-c
|
||
*/
|
||
|
||
/* log(M * 2^E) = log(M) + E log(2) */
|
||
int_t bitcast = vc4cl_bitcast_int(val);
|
||
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
|
||
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
|
||
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
|
||
arg_t mantissa = vc4cl_bitcast_float(tmp);
|
||
|
||
result_t logMantissa = log1p_pade(mantissa - 1.0f);
|
||
|
||
result_t result = vc4cl_itof(exponent) * M_LN2_F + logMantissa;
|
||
result = vc4cl_is_inf_nan(val) ? val : result;
|
||
result = signbit(val) ? (result_t)nan(0) : result;
|
||
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
|
||
return result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* log10(+-0) = -Inf
|
||
* log10(1) = 0
|
||
* log10(x) = Nan for x < 0
|
||
* log10(+Inf) = +Inf
|
||
*/
|
||
COMPLEX_1(float, log10, float, val, {
|
||
/* log10(M * 2^E) = log10(M) + E log10(2) */
|
||
int_t bitcast = vc4cl_bitcast_int(val);
|
||
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
|
||
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
|
||
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
|
||
arg_t mantissa = vc4cl_bitcast_float(tmp);
|
||
|
||
/* log10(M) = log(M) / log(10) */
|
||
result_t logMantissa = log1p_pade(mantissa - 1.0f) / M_LN10_F;
|
||
|
||
const arg_t log2_10 = M_LN2_F / M_LN10_F;
|
||
result_t result = vc4cl_itof(exponent) * log2_10 + logMantissa;
|
||
result = vc4cl_is_inf_nan(val) ? val : result;
|
||
result = signbit(val) ? (result_t)nan(0) : result;
|
||
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
|
||
return result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* log1p(+-0) = +-0
|
||
* log1p(-1) = -Inf
|
||
* log1p(x) = Nan for x < -1
|
||
* log1p(+Inf) = +Inf
|
||
*/
|
||
//TODO directly use log1p_pade(), needs argument reduction
|
||
COMPLEX_1(float, log1p, float, val, {
|
||
result_t result = log(1.0f + val);
|
||
|
||
result = vc4cl_is_zero(val) ? val : result;
|
||
result = val == (arg_t)-1.0f ? (result_t)-INFINITY : result;
|
||
result = val < (arg_t)-1.0f ? (result_t)nan(0) : result;
|
||
result = vc4cl_is_inf_nan(val) ? val : result;
|
||
return result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* logb(+-0) = -Inf
|
||
* logb(+-Inf) = +Inf
|
||
*/
|
||
COMPLEX_1(float, logb, float, x, {
|
||
int_t bitcast = vc4cl_bitcast_int(x);
|
||
// deduct exponent offset
|
||
int_t exponent = ((bitcast >> 23) & 0xFF) - 127;
|
||
// we need to support subnormals here, since they are only used as input, not output
|
||
int_t mantissa = bitcast & (int_t) 0x007FFFFF;
|
||
exponent = exponent == -127 ? -(vc4cl_clz(mantissa) - (32 - 23)) - 127 : exponent;
|
||
result_t result = vc4cl_itof(exponent);
|
||
result = vc4cl_is_zero(x) ? (result_t)-INFINITY : result;
|
||
result = isinf(x) ? fabs(x) : result;
|
||
return isnan(x) ? x : result;
|
||
})
|
||
|
||
SIMPLE_3(float, mad, float, a, float, b, float, c, (a * b) + c)
|
||
|
||
//"Returns x if |x|>|y|, y if |y|>|x|, otherwise fmax(x, y)"
|
||
COMPLEX_2(float, maxmag, float, x, float, y, {
|
||
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
|
||
result_t tmp = fmax(x, y);
|
||
result_t other = fabs(y) > fabs(x) ? y : tmp;
|
||
return fabs(x) > fabs(y) ? x : other;
|
||
})
|
||
//"Returns x if |x|<|y|, y if |y|<|x|, otherwise fmin(x, y)"
|
||
COMPLEX_2(float, minmag, float, x, float, y, {
|
||
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
|
||
result_t tmp = fmin(x, y);
|
||
result_t other = fabs(y) < fabs(x) ? y : tmp;
|
||
return fabs(x) < fabs(y) ? x : other;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* modf(+-x, iptr) = value with same sign as x
|
||
* modf(+-Inf, iptr) = +-0, iptr = +-Inf
|
||
* modf(NaN, iptr) = NaN, iptr = NaN
|
||
*/
|
||
// taken OpenCL C 1.2 specification, 7.5.2. Changes to C99 TC2 Behavior
|
||
COMPLEX_2(float, modf, float, value, __global float, *iptr, {
|
||
result_t tmp = trunc(value);
|
||
*iptr = isnan(value) ? value : tmp;
|
||
result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value);
|
||
return isnan(value) ? value : result;
|
||
})
|
||
COMPLEX_2(float, modf, float, value, __local float, *iptr, {
|
||
result_t tmp = trunc(value);
|
||
*iptr = isnan(value) ? value : tmp;
|
||
result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value);
|
||
return isnan(value) ? value : result;
|
||
})
|
||
COMPLEX_2(float, modf, float, value, __private float, *iptr, {
|
||
result_t tmp = trunc(value);
|
||
*iptr = isnan(value) ? value : tmp;
|
||
result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value);
|
||
return isnan(value) ? value : result;
|
||
})
|
||
|
||
SIMPLE_1(float, nan, uint, nancode, vc4cl_bitcast_float(NAN | nancode))
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* nextafter(+smallest normal, y < +smallest normal) = +0
|
||
* nextafter(-smallest normal, y > -smallest normal) = -0
|
||
* nextafter(-0, y > 0) = smallest positive normal value
|
||
* nextafter(0, y < 0) = smallest negative normal value
|
||
*/
|
||
COMPLEX_2(float, nextafter, float, x, float, y, {
|
||
int_t ix = vc4cl_bitcast_int(x);
|
||
int_t iy = vc4cl_bitcast_int(y);
|
||
/* x > y -> x -= ulp otherwise x += ulp */
|
||
int_t xPos = ix > iy ? ix - 1 : ix + 1;
|
||
/* x < y -> x -= ulp otherwise x += ulp */
|
||
int_t xNeg = iy >= 0 || ix > iy ? ix - 1 : ix + 1;
|
||
int_t xNotY = ix >= 0 ? /* x > 0 */ xPos : /* x < 0 */ xNeg;
|
||
int_t res = x == y ? iy : xNotY;
|
||
result_t result = vc4cl_bitcast_float(res);
|
||
result = (x == (result_t)-0.0f && y > 0.0f) ? (result_t)FLT_MIN : result;
|
||
result = (x == (result_t)0.0f && y < 0.0f) ? (result_t)-FLT_MIN : result;
|
||
result = (x == (result_t)FLT_MIN && y < (result_t)FLT_MIN) ? (result) +0.0f : result;
|
||
result = (x == (result_t)-FLT_MIN && y > (result_t)-FLT_MIN) ? (result) -0.0f : result;
|
||
return isnan(x) ? x : (isnan(y) ? y : result);
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* pow(+-0, y) = +-Inf for y odd and y < 0
|
||
* pow(+-0, y) = +Inf for y even and y < 0
|
||
* pow(+-0, y) = +-0 for y odd and y > 0
|
||
* pow(+-0, y) = 0 for y even and y > 0
|
||
* pow(-1, +-Inf) = 1
|
||
* pow(1, y) = 1
|
||
* pow(x, +-0) = 1
|
||
* pow(x, y) = NaN for x < 0 and y not an integer
|
||
* pow(x, -Inf) = +Inf for |x| < 0
|
||
* pow(x, -Inf) = 0 for |x| > 0
|
||
* pow(x, +Inf) = 0 for |x| < 0
|
||
* pow(x, +Inf) = 0 for |x| > 0
|
||
* pow(-Inf, y) = -0 for y odd and y < 0
|
||
* pow(-Inf, y) = 0 for y even and y < 0
|
||
* pow(-Inf, y) = -Inf for y odd and y > 0
|
||
* pow(-Inf, y) = +Inf for y even and y > 0
|
||
* pow(+Inf, y) = 0 for y < 0
|
||
* pow(+Inf, y) = +Inf for y > 0
|
||
* pow(+-0, -Inf) =+Inf
|
||
*/
|
||
// for pow, see also https://stackoverflow.com/questions/4518011/algorithm-for-powfloat-float
|
||
COMPLEX_2(float, pow, float, x, float, y, {
|
||
result_t tmp = powr(x, y);
|
||
return y < 0.0f ? (result_t) 1.0f / tmp : tmp;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* pown(x, 0) = 1
|
||
* pown(+-0, n) = +-Inf for odd n and n < 0
|
||
* pown(+-0, n) = +Inf for even n and n < 0
|
||
* pown(+-0, n) = +-0 for odd n and n > 0
|
||
* pown(+-0, n) = 0 for even n and n > 0
|
||
*/
|
||
// TODO is there a way to not need to calculate both versions? Or at least calculate both at once?!
|
||
SIMPLE_2(float, pown, float, x, int, n,
|
||
n < 0 ? (1.0f / fast_pown(x, vc4cl_bitcast_uint(-n), 32)) : fast_pown(x, vc4cl_bitcast_uint(n), 32))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* powr(x, +-0) = 1 for x > 0
|
||
* powr(+-0, y) = +Inf for y < 0
|
||
* powr(+-0, -Inf) = +Inf
|
||
* powr(+-0, y) = +0 for y > 0
|
||
* powr(1, y) = 1
|
||
* powr(x, y) = NaN for x < 0
|
||
* powr(+-0, +-0) = NaN
|
||
* powr(+Inf, +-0) = NaN
|
||
* powr(1, +-Inf) = NaN
|
||
* powr(x, NaN) = NaN for x >= 0
|
||
* powr(NaN, y) = NaN
|
||
*/
|
||
//"Compute x to the power y, where x is >= 0."
|
||
// x^y = e^(y * ln(x))
|
||
SIMPLE_2(float, powr, float, x, float, y, exp(y *log(x)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* remainder(x, 0) = NaN or 0
|
||
* remainder(Inf, y) = NaN
|
||
* remainder(x, Inf) = x
|
||
*/
|
||
COMPLEX_2(float, remainder, float, x, float, y, {
|
||
// TODO correct?
|
||
result_t k = rint(x / y);
|
||
result_t result = x - k * y;
|
||
|
||
result = vc4cl_is_zero(y) ? (result_t)-nan(0) : result;
|
||
result = isinf(x) ? (result_t)nan(0) : result;
|
||
result = isnan(x) ? x : result;
|
||
return isnan(y) ? y : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* remquo(x, y, quo) = NaN, quo = 0 for x = +-Inf or y = 0 or x = NaN or y = NaN
|
||
* exp2(-Inf) = 0
|
||
* exp2(+Inf) = +Inf
|
||
*/
|
||
// taken from pocl (https://github.com/pocl/pocl/blob/master/lib/kernel/vecmathlib-pocl/remquo.cl)
|
||
COMPLEX_3(float, remquo, float, x, float, y, __global int, *quo, {
|
||
result_t k = rint(x / y);
|
||
*quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k));
|
||
return x - k * y;
|
||
})
|
||
COMPLEX_3(float, remquo, float, x, float, y, __local int, *quo, {
|
||
result_t k = rint(x / y);
|
||
*quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k));
|
||
return x - k * y;
|
||
})
|
||
COMPLEX_3(float, remquo, float, x, float, y, __private int, *quo, {
|
||
result_t k = rint(x / y);
|
||
*quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k));
|
||
return x - k * y;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* rint(x) = -0 for -0.5 <= x < 0
|
||
*/
|
||
COMPLEX_1(float, rint, float, val, {
|
||
//" Round to nearest even integer. "
|
||
// round like round, but decides on nearest even for halfway cases
|
||
// TODO always round to nearest even (e.g. 0.4 to 2.0) or just for half-way cases?
|
||
|
||
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
|
||
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
|
||
|
||
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
|
||
// |val| < 2^23 fits into integer
|
||
int_t integer = vc4cl_ftoi(val);
|
||
result_t truncated = vc4cl_itof(integer);
|
||
// for negative numbers, truncated is larger than value, for positive it is smaller
|
||
result_t diff = val - truncated;
|
||
// -> if the |difference| is > 0.5, subtract/add 1
|
||
result_t correction = 0.0f + (diff < -0.5f ? (result_t)-1.0f : (result_t)0.0f) + (diff > 0.5f ? (result_t)1.0f : (result_t)0.0f);
|
||
// if the value is exactly the half, round to nearest even
|
||
int_t isHalfWay = diff == 0.5f || diff == -0.5f;
|
||
// if the truncated integer is even, use it. Otherwise (lsb of integer is 1) subtract/add 1
|
||
result_t halfWayCorrection = 0 + vc4cl_itof(integer & 1) * sign(val);
|
||
result_t result = truncated + correction + vc4cl_itof(isHalfWay) * halfWayCorrection;
|
||
// deciding here which value to return saves us up to two jumps
|
||
return tooBig ? val : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* rootn(+-0, n) = +-Inf for odd n < 0
|
||
* rootn(+-0, n) = +Inf for even n < 0
|
||
* rootn(+-0, n) = +-0 for odd n > 0
|
||
* rootn(+-0, n) = +0 for even n > 0
|
||
* rootn(x, n) = NaN for x < 0 and n even
|
||
* rootn(x, 0) = NaN
|
||
*/
|
||
// TODO different algorithm?
|
||
SIMPLE_2(float, rootn, float, x, int, y, pow(x, (arg0_t) 1.0f / vc4cl_itof(y)))
|
||
// see: https://en.wikipedia.org/wiki/Nth_root_algorithm
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* round(+-0) = +-0
|
||
* round(+-Inf) = +-Inf
|
||
* round(x) = -0 for -0.5 < x < 0
|
||
*/
|
||
COMPLEX_1(float, round, float, val, {
|
||
//" Return the integral value nearest to x rounding halfway cases away from zero, regardless of the current rounding
|
||
// direction. "
|
||
|
||
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
|
||
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
|
||
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
|
||
// |val| < 2^23 fits into integer
|
||
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
|
||
// for negative numbers, truncated is larger than value, for positive it is smaller
|
||
result_t diff = val - truncated;
|
||
// -> if the |difference| is >= 0.5, subtract/add 1
|
||
result_t correction = 0.0f + (diff <= -0.5f ? (result_t)-1.0f : (result_t)0.0f) + (diff >= 0.5f ? (result_t)1.0f : (result_t)0.0f);
|
||
result_t result = truncated + correction;
|
||
// deciding here which value to return saves us up to two jumps
|
||
return tooBig ? val : result;
|
||
})
|
||
|
||
COMPLEX_1(float, rsqrt, float, x, {
|
||
// Using Goldschmidt's algorithm with 2 iterations
|
||
// see http://www.informatik.uni-trier.de/Reports/TR-08-2004/rnc6_12_markstein.pdf (formula 9)
|
||
arg_t y = vc4cl_sfu_rsqrt(x);
|
||
arg_t g = x * y;
|
||
arg_t h = y * (arg_t)0.5f;
|
||
|
||
for(unsigned i = 0; i < 2; ++i) {
|
||
arg_t r = (arg_t)0.5f - g * h;
|
||
g = g + g * r;
|
||
h = h + h * r;
|
||
}
|
||
|
||
// rsqrt(+-0) = +-Inf
|
||
result_t result = vc4cl_is_zero(x) ? copysign((result_t)INFINITY, x) : (arg_t)2.0f * h;
|
||
// rsqrt(NaN) = NaN
|
||
result = isnan(x) ? x : result;
|
||
// rsqrt(Inf) = 0
|
||
result = isinf(x) ? (result_t)0.0f : result;
|
||
// rsqrt(x) for x < 0 = -NaN
|
||
result = x < (result_t)0.0f ? copysign((result_t)nan(0), -1.0f) : result;
|
||
return result;
|
||
})
|
||
|
||
COMPLEX_1(float, sin_pade, float, val, {
|
||
/*
|
||
* Pade approximation
|
||
* (https://en.wikipedia.org/wiki/Pad%C3%A9_approximant#Examples)
|
||
*
|
||
* Has a relative error of less than 2^23 (1 ULP) for range [-PI/2, PI/2], then rises fast
|
||
*
|
||
* Alternative:
|
||
* Mendenhall algorithm to calculate both sine and cosine:
|
||
* - Is also exact enough for range [-PI/2, PI/2]
|
||
* - https://arxiv.org/pdf/cs/0406049.pdf
|
||
* -
|
||
* https://github.com/broadcomCM/android_hardware_broadcom_brcm_usrlib/blob/ics/dag/vmcsx/middleware/khronos/glsl/glsl_mendenhall.c
|
||
*/
|
||
arg_t a = 12671.0f / 4363920.0f;
|
||
arg_t b = 2363.0f / 18183.0f;
|
||
arg_t c = 445.0f / 12122.0f;
|
||
arg_t d = 601.0f / 872784.0f;
|
||
arg_t e = 121.0f / 16662240.0f;
|
||
|
||
arg_t up = (a * val * val * val * val * val) - (b * val * val * val) + val;
|
||
arg_t down = 1.0f + (c * val * val) + (d * val * val * val * val) + (e * val * val * val * val * val * val);
|
||
return up / down;
|
||
})
|
||
|
||
COMPLEX_1(float, sin_fast, float, x, {
|
||
/*
|
||
* Alternative implementation, used by Mesa3D
|
||
* - less instructions to calculate
|
||
* - has greater error (even for range -pi/2, pi/2)
|
||
* https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/compiler/nir/nir_opt_algebraic.py
|
||
* https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
|
||
*/
|
||
const arg_t B = 4.0 / M_PI;
|
||
const arg_t C = -4.0 / (M_PI * M_PI);
|
||
|
||
arg_t y = B * x + C * x * fabs(x);
|
||
|
||
// const float Q = 0.775;
|
||
const arg_t P = (arg_t)0.225;
|
||
y = P * (y * fabs(y) - y) + y; // Q * y + P * y * abs(y)
|
||
return y;
|
||
})
|
||
|
||
//COMPLEX_1(float, sin_taylor, float, x, {
|
||
/*
|
||
* Alternative implementation, used by Mesa3D VC4 driver
|
||
* - calculates the Taylor series
|
||
* -
|
||
* https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/gallium/drivers/vc4/vc4_program.c (function ntq_fsin)
|
||
*/
|
||
//})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* sin(+-0) = +-0
|
||
* sin(+-Inf) = NaN for |x| > 1
|
||
*/
|
||
COMPLEX_1(float, sin, float, val, {
|
||
/*
|
||
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
|
||
*
|
||
* We use argument reduction to bring it into range [-pi/2, pi/2] in which range the Pade approximation is accurate.
|
||
*/
|
||
// TODO normalization into [-pi/2, pi/2] is too inaccurate for large values! fmod too inaccurate (~8 ULP, should be 0)
|
||
|
||
// Since sine has a period of 2pi, these rewrites do not change the result (rounding error excluded):
|
||
// bring into range [-2pi, 2pi]
|
||
arg_t modifiedVal = fmod(val, 2.0f * M_PI_F);
|
||
// bring into range [-pi, pi]
|
||
modifiedVal = modifiedVal < -M_PI_F ? modifiedVal + (2.0f * M_PI_F) : modifiedVal;
|
||
modifiedVal = modifiedVal > M_PI_F ? modifiedVal - (2.0f * M_PI_F) : modifiedVal;
|
||
|
||
// We move by half a period, so we must invert the sign of the result
|
||
// bring into range [-pi/2, pi/2]
|
||
int_t invertSign = modifiedVal < -M_PI_2_F || modifiedVal > M_PI_2_F;
|
||
modifiedVal = modifiedVal < -M_PI_2_F ? modifiedVal + M_PI_F : modifiedVal;
|
||
modifiedVal = modifiedVal > M_PI_2_F ? modifiedVal - M_PI_F : modifiedVal;
|
||
|
||
arg_t result = sin_pade(modifiedVal);
|
||
return invertSign ? -result : result;
|
||
})
|
||
|
||
SIMPLE_2(float, sincos, float, x, __global float, *cosval, (*cosval = cos(x), sin(x)))
|
||
SIMPLE_2(float, sincos, float, x, __local float, *cosval, (*cosval = cos(x), sin(x)))
|
||
SIMPLE_2(float, sincos, float, x, __private float, *cosval, (*cosval = cos(x), sin(x)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* sinh(+-0) = +-0
|
||
* sinh(+-Inf) = +-Inf
|
||
*/
|
||
COMPLEX_1(float, sinh, float, val, {
|
||
/*
|
||
* https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence
|
||
*
|
||
* This version (the Taylor-series up to x^17/17!) has following accuracy:
|
||
*
|
||
* sinh(PI) -> error of 2*10^-8 < ulp(PI) = 2*10^-7
|
||
* sinh(0.5f) -> error of < 10^-10 < ulp(0.5f) = 5*10^-8
|
||
* sinh(0.0f) -> error of < 10^-48 < ulp(0.0f) = 1*10^-45
|
||
*
|
||
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
|
||
*/
|
||
// TODO inaccurate out of the range of -PI <= x <= PI
|
||
/*
|
||
* Alternatives:
|
||
* sinh(x) = sqrt((cosh(2x)-1)/2)
|
||
* sinh(x) = exp(x) - cosh(x)
|
||
*/
|
||
const result_t fac3 = 1.0f / factorial(3);
|
||
const result_t fac5 = 1.0f / factorial(5);
|
||
const result_t fac7 = 1.0f / factorial(7);
|
||
const result_t fac9 = 1.0f / factorial(9);
|
||
const result_t fac11 = 1.0f / factorial(11);
|
||
const result_t fac13 = 1.0f / factorial(13);
|
||
const result_t fac15 = 1.0f / factorial(15);
|
||
const result_t fac17 = 1.0f / factorial(17);
|
||
|
||
result_t x = val;
|
||
/// TODO optimize!!
|
||
/*
|
||
while(x < -M_PI_F)
|
||
{
|
||
x = x + 2 * M_PI_F;
|
||
}
|
||
while(x > M_PI_F)
|
||
{
|
||
x = x - 2 * M_PI_F;
|
||
}
|
||
*/
|
||
result_t x3 = x * x * x;
|
||
result_t tmp1 = x3 * fac3;
|
||
// sinh(x) = x + x^3/3! + ...
|
||
result_t tmp = x + tmp1;
|
||
result_t x5 = x3 * x * x;
|
||
tmp1 = x5 * fac5;
|
||
//... + x^5/5! + ...
|
||
tmp = x + tmp1;
|
||
result_t x7 = x5 * x * x;
|
||
tmp1 = x7 * fac7;
|
||
//... + x^7/7! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x9 = x7 * x * x;
|
||
tmp1 = x9 * fac9;
|
||
//... + x^9/9! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x11 = x9 * x * x;
|
||
tmp1 = x11 * fac11;
|
||
//... + x^11/11! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x13 = x11 * x * x;
|
||
tmp1 = x13 * fac13;
|
||
//... + x^13/13! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x15 = x13 * x * x;
|
||
tmp1 = x15 * fac15;
|
||
//... + x^15/15! + ...
|
||
tmp = tmp + tmp1;
|
||
result_t x17 = x15 * x * x;
|
||
tmp1 = x17 * fac17;
|
||
//... + x^17/17!
|
||
tmp = tmp + tmp1;
|
||
return tmp;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* sinpi(+-0) = +-0
|
||
* sinpi(n) = 0 for n integer and n > 0
|
||
* sinpi(n) = -0 for n integer and n < 0
|
||
* sinpi(+-Inf) = NaN
|
||
*/
|
||
SIMPLE_1(float, sinpi, float, val, sin(val *M_PI_F))
|
||
|
||
COMPLEX_1(float, sqrt, float, x, {
|
||
// Using Goldschmidt's algorithm with 2 iterations
|
||
// see http://www.informatik.uni-trier.de/Reports/TR-08-2004/rnc6_12_markstein.pdf (formula 9)
|
||
arg_t y = vc4cl_sfu_rsqrt(x);
|
||
arg_t g = x * y;
|
||
arg_t h = y * (arg_t)0.5f;
|
||
|
||
for(unsigned i = 0; i < 2; ++i) {
|
||
arg_t r = (arg_t)0.5f - g * h;
|
||
g = g + g * r;
|
||
h = h + h * r;
|
||
}
|
||
|
||
// sqrt(NaN) = NaN, sqrt(Inf) = Inf
|
||
result_t result = vc4cl_is_inf_nan(x) ? x : g;
|
||
// sqrt(+-0) = +-0
|
||
result = vc4cl_is_zero(x) ? x : result;
|
||
// sqrt(x) = NaN, for x < 0
|
||
return x < 0.0f ? (result_t)nan(0) : result;
|
||
})
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* tan(+-0) = +-0
|
||
* tan(+-Inf) = NaN
|
||
*/
|
||
// TODO different algorithm?!
|
||
// Taylor Series is too inaccurate (converges too slow)
|
||
SIMPLE_1(float, tan, float, val, sin(val) / cos(val))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* tanh(+-0) = +-0
|
||
* tanh(+-Inf) = +-Inf
|
||
*/
|
||
// TODO different algorithm?!
|
||
// Taylor Series is too inaccurate (converges too slow)
|
||
SIMPLE_1(float, tanh, float, val, sinh(val) / cosh(val))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* tanpi(+-0) = +-0
|
||
* tanpi(+-Inf) = NaN
|
||
* tanpi(n) = copysign(0,n) for n even and n integer
|
||
* tanpi(n) = copysign(0, -n) for n odd and n integer
|
||
* tanpi(x) = +Inf for x = n + 0.5 and n even integer
|
||
* tanpi(x) = -Inf for x = n + 0.5 and n odd integer
|
||
*/
|
||
SIMPLE_1(float, tanpi, float, val, tan(val *M_PI_F))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* tgamma(+-0) = +-Inf
|
||
* tgamma(x) = NaN for x < 0 and x integer
|
||
* tgamma(-Inf) = NaN
|
||
* tgamma(+Inf) = +Inf
|
||
*/
|
||
// TODO different algorithm?! E.g. http://www.math.hkbu.edu.hk/support/aands/page_257.htm with argument reduction
|
||
// (previous page)
|
||
SIMPLE_1(float, tgamma, float, val, exp(lgamma(val)))
|
||
|
||
/**
|
||
* Expected behavior:
|
||
*
|
||
* trunc(+-0) = +-0
|
||
* trunc(+-Inf) = +-Inf
|
||
* trunc(x) = -0 for -1 < x < 0
|
||
*/
|
||
COMPLEX_1(float, trunc, float, val, {
|
||
//" Round to integer toward zero. "
|
||
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
|
||
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
|
||
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
|
||
// |val| < 2^23 fits into integer
|
||
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
|
||
// deciding here which value to return saves us up to two jumps
|
||
return tooBig ? val : truncated;
|
||
})
|
||
|
||
/*
|
||
* native functions
|
||
*/
|
||
SIMPLE_1(float, native_cos, float, val, cos(val))
|
||
|
||
SIMPLE_2(float, native_divide, float, x, float, y, x *native_recip(y))
|
||
|
||
// e^x = 2^(x * log2(e))
|
||
SIMPLE_1(float, native_exp, float, x, native_exp2(x *M_LOG2E_F))
|
||
|
||
SIMPLE_1(float, native_exp2, float, val, vc4cl_sfu_exp2(val))
|
||
|
||
// 10^x = 2^(x * log2(10))
|
||
SIMPLE_1(float, native_exp10, float, x, native_exp2(x *M_LOG210))
|
||
|
||
// log(x) = log2(x) / log2(e)
|
||
SIMPLE_1(float, native_log, float, x, native_log2(x) * (1.0f / M_LOG2E_F))
|
||
|
||
SIMPLE_1(float, native_log2, float, val, vc4cl_sfu_log2(val))
|
||
|
||
// log10(x) = log2(x) / log2(10)
|
||
SIMPLE_1(float, native_log10, float, x, native_log2(x) * (1.0f / M_LOG210))
|
||
|
||
//"Compute x to the power y, where x is >= 0."
|
||
// x^y = 2^(y * log2(x))
|
||
SIMPLE_2(float, native_powr, float, x, float, y, native_exp2(y *native_log2(x)))
|
||
|
||
SIMPLE_1(float, native_recip, float, val, vc4cl_sfu_recip(val))
|
||
SIMPLE_1(float, native_rsqrt, float, val, vc4cl_sfu_rsqrt(val))
|
||
|
||
SIMPLE_1(float, native_sin, float, val, sin(val))
|
||
|
||
SIMPLE_1(float, native_sqrt, float, val, val * vc4cl_sfu_rsqrt(val))
|
||
|
||
SIMPLE_1(float, native_tan, float, val, tan(val))
|
||
|
||
/*
|
||
* half functions, accuracy of 8192 ULP (11 MSB of the mantissa)
|
||
*/
|
||
SIMPLE_1(float, half_cos, float, val, cos(val))
|
||
SIMPLE_2(float, half_divide, float, x, float, y, x * vc4cl_sfu_recip(y))
|
||
SIMPLE_1(float, half_exp, float, val, native_exp(val))
|
||
COMPLEX_1(float, half_exp2, float, val, {
|
||
result_t result = vc4cl_sfu_exp2(val);
|
||
|
||
result = vc4cl_is_zero(val) ? (result_t)1.0f : result;
|
||
result = val < (result_t)-126.0f ? (result_t)0.0f : result;
|
||
result = val >= (result_t)128.0f ? (result_t)INFINITY : result;
|
||
return isnan(val) ? val : result;
|
||
})
|
||
SIMPLE_1(float, half_exp10, float, val, native_exp10(val))
|
||
SIMPLE_1(float, half_log, float, val, native_log(val))
|
||
COMPLEX_1(float, half_log2, float, val, {
|
||
/* log2(x) = log2(M * e^E) = E + log2(M) */
|
||
int_t bitcast = vc4cl_bitcast_int(val);
|
||
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
|
||
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
|
||
|
||
/* mask off exponent and replace with exponent for range [0.5, 1) */
|
||
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
|
||
arg_t mantissa = vc4cl_bitcast_float(tmp);
|
||
result_t logMantissa = vc4cl_sfu_log2(mantissa);
|
||
|
||
result_t result = vc4cl_itof(exponent) + logMantissa;
|
||
result = vc4cl_is_inf_nan(val) ? val : result;
|
||
result = signbit(val) ? (result_t)nan(0) : result;
|
||
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
|
||
return result;
|
||
})
|
||
SIMPLE_1(float, half_log10, float, val, native_log10(val))
|
||
SIMPLE_2(float, half_powr, float, x, float, y, powr(x, y))
|
||
SIMPLE_1(float, half_recip, float, val, (arg_t)1.0f / val)
|
||
COMPLEX_1(float, half_rsqrt, float, val, {
|
||
arg_t x = vc4cl_sfu_rsqrt(val);
|
||
|
||
// rsqrt(+-0) = +-Inf
|
||
result_t result = vc4cl_is_zero(val) ? copysign((result_t)INFINITY, val) : x;
|
||
// rsqrt(NaN) = NaN
|
||
result = isnan(val) ? val : result;
|
||
// rsqrt(Inf) = 0
|
||
result = isinf(val) ? (result_t)0.0f : result;
|
||
// rsqrt(x) for x < 0 = -NaN
|
||
result = val < (result_t)0.0f ? copysign((result_t)nan(0), -1.0f) : result;
|
||
return result;
|
||
})
|
||
SIMPLE_1(float, half_sin, float, val, sin(val))
|
||
COMPLEX_1(float, half_sqrt, float, val, {
|
||
arg_t x = val * vc4cl_sfu_rsqrt(val);
|
||
|
||
// sqrt(NaN) = NaN, sqrt(Inf) = Inf
|
||
result_t result = vc4cl_is_inf_nan(val) ? val : x;
|
||
// sqrt(+-0) = +-0
|
||
result = vc4cl_is_zero(val) ? val : result;
|
||
// sqrt(x) = NaN, for x < 0
|
||
return val < 0.0f ? (result_t)nan(0) : result;
|
||
})
|
||
SIMPLE_1(float, half_tan, float, val, tan(val))
|
||
|
||
#endif /* VC4CL_MATH_H */
|