Raziel K. Crowe 1c4c363d5c VC4Stdlib
2022-09-09 19:57:08 +05:00

1667 lines
52 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_MATH_H
#define VC4CL_MATH_H
#include "_common.h"
#include "_intrinsics.h"
#include "_float_float.h"
// TODO test-cases for all the known Edge Case Behavior
// TODO can remove nan-check from rounding functions? The large value check should also contain NaNs,
// since VC4 considers NaNs to be the "largest" values
// TODO recheck NaN handling for functions based on multiplication/division once they handle NaNs correctly,
// maybe we can remove some of those extra codes
/*
* Remove some macros for more efficient versions
*/
#undef rsqrt // via own function
#undef ceil
#undef fabs
#undef floor
#undef fma
#undef rint
#undef round
#undef trunc
#undef half_cos
#undef half_divide
#undef half_exp
#undef half_exp2
#undef half_exp10
#undef half_log
#undef half_log2
#undef half_log10
#undef half_powr
#undef half_recip
#undef half_rsqrt
#undef half_sin
#undef half_sqrt
#undef half_tan
#undef native_cos
#undef native_divide // SFU RECIP
#undef native_exp // via SFU EXP2
#undef native_exp2 // SFU EXP2
#undef native_exp10 // via SFU EXP2
#undef native_log // via SFU LOG2
#undef native_log2 // SFU LOG2
#undef native_log10 // via SFU LOG2
#undef native_powr // via SFU_EXP2
#undef native_recip // SFU RECIP
#undef native_rsqrt // SFU RSQRT
#undef native_sin
#undef native_sqrt // via SFU RECIP and SFU RSQRT
#undef native_tan
// vc4cl_split(double) of M_LN2
#define M_LN2_FF 0xB102E3083F317218
// 1 / log10(e)
#define M_1_LOG10_E_F 2.30258512496948242188f
INLINE int factorial(int n) CONST
{
return (n == 1 || n == 0) ? 1 : factorial(n - 1) * n;
}
COMPLEX_3_SCALAR(float, fast_pown, float, val, uint, n, uchar, width, {
// reference: http://dimacs.rutgers.edu/archive/Workshops/Security/program2/quisquater/node3.html (method I)
result_t result = 1.0f;
result_t tmp = val;
for(uchar i = 0; i < width; ++i)
{
// if n[i] == 1 then z = z * y
result *= ((n >> i) & 1) ? tmp : 1.0f;
// y = y * y
tmp = tmp * tmp;
}
return result;
})
COMPLEX_1(float, vc4cl_pow2, int, val, {
// y = 2^x = 1.0 [implied] * 2^(x + offset)
int_t tmp = val << 23;
// alternative: tmp = (val + 127) << 23;
tmp += (int_t) 0x3F800000;
return vc4cl_bitcast_float(tmp & (int_t) 0x7F800000);
})
/**
* Expected behavior:
*
* acos(1) = +-0
* acos(x) = NaN for |x| > 1
*/
COMPLEX_1(float, acos, float, val, {
result_t result = M_PI_2_F - asin(val);
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* acosh(1) = +0
* acosh(x) = NaN for x < 1
* acosh(+Inf) = +Inf
*/
// https://en.wikipedia.org/wiki/Inverse_hyperbolic_functions#Series_expansions
// or: https://en.wikipedia.org/wiki/Hyperbolic_function#Inverse_functions_as_logarithms
// XXX optimize?!
SIMPLE_1(float, acosh, float, val, log(val + sqrt(val * val + 1)))
/**
* Expected behavior:
*
* acospi(1) = +-0
* acospi(x) = NaN for |x| > 1
*/
COMPLEX_1(float, acospi, float, val, {
result_t result = acos(val) * M_1_PI_F;
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* asin(+-0) = +-0
* asin(x) = NaN for |x| > 1
*/
COMPLEX_1(float, asin, float, val, {
result_t result = atan(val *rsqrt(1 - val * val));
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* asinh(+-0) = +-0
* asinh(+-Inf) = +-Inf
*/
// XXX optimize?!
SIMPLE_1(float, asinh, float, val, log(val + sqrt(val * val - 1)))
/**
* Expected behavior:
*
* asinpi(+-0) = +-0
* asinpi(x) = NaN for |x| > 1
*/
COMPLEX_1(float, asinpi, float, val, {
result_t result = asin(val) * M_1_PI_F;
result = fabs(val) > (result_t)1.0f ? nan(0) : result;
return isnan(val) ? val : result;
})
// https://stackoverflow.com/questions/23047978/how-is-arctan-implemented
// https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence (too inaccurate!)
// http://www2.mae.ufl.edu/~uhk/ARCTAN-APPROX-PAPER.pdf (atan(x) = 1/x * F(1/x))
// http://www.jjj.de/fxt/fxtbook.pdf
/**
* Expected behavior:
*
* atan(+-0) = +-0
* atan(+-Inf) = +-pi/2
*
* Argument reduction:
* 1) atan(-x) = -atan(x)
* 2) atan(1/x) = PI/2 - atan(x), x > 0
* 3) atan(1/x) = -PI/2 - atan(x), x < 0
* 4) atan(x) = 2 atan(x / (1 + sqrt(1 + x^2)))
*
* Matters Computational (section 32.1.2.3): Pade Approximation (6 steps)
* Has an error of ~ 0 for |x| <= 1/8 * PI:
* (1155∙x + 1190∙x^3 + 231∙x^5)/(1155 + 1575∙x^2+525∙x^4+25∙x^6)
*
* Alternatively could be calculated via:
* atan(x) = atan2(x, 1)
*/
COMPLEX_1(float, atan, float, val, {
// reduce arguments a few times (r = r/(1+sqrt(1+r^2)))
uint n = 2;
result_t r = val;
for(uint i = 0; i < n; ++i)
r = r / (1 + sqrt(1 + r * r));
// compute the pade approximation
result_t approx = (1155 * r + 1190 * r * r * r + 231 * r * r * r * r * r) /
(1155 + 1575 * r * r + 525 * r * r * r * r + 25 * r * r * r * r * r * r);
// calculate the result (2^n * approx)
result_t result = ((result_t)(2 << (n - 1))) * approx;
// atan(x) with x >= 2^26 ~ atan(Inf) => pi/2
result = (fabs(val) >= (result_t)0x1p+26f) ? copysign(M_PI_2_F, val) : result;
// atan(+-NaN) = +-NaN
return isnan(val) ? val : result;
})
/**
* https://en.wikipedia.org/wiki/Atan2#Definition_and_computation
*
* Expected behavior:
*
* atan2(+-0, -0) = +-pi
* atan2(+-0, +0) = +-0
* atan2(+-0, x) = +-pi for x < 0
* atan2(+-0, x) = +-0 for x > 0
* atan2(y, +-0) = -pi/2 for y < 0
* atan2(y, +-0) = pi/2 for y > 0
* atan2(+-y, -Inf) = +-pi for y > 0 and y != Inf
* atan2(+-y, +Inf) = +-0 for y > 0 and y != Inf
* atan2(+-Inf, x) = +-pi/2 for x != Inf
* atan2(+-Inf, -Inf) = +-3/4pi
* atan2(+-Inf, +Inf) = +-pi/4
*/
COMPLEX_2(float, atan2, float, y, float, x, {
// note: these calculation work since the condition is either 0 or all bits set
result_t tan = atan(y / x);
result_t s = sign(y);
int_t xG0 = x > 0;
int_t xL0 = x < 0;
int_t yEq0 = y == 0;
// arctan(y / x) if x > 0
int_t res0 = xG0 & vc4cl_bitcast_int(tan);
// arctan(y / x) + sign(y) * pi if x < 0, y != 0
int_t res1 = xL0 & ~yEq0 & vc4cl_bitcast_int(tan + s * M_PI_F);
// pi if x < 0, y = 0
int_t res2 = xL0 & yEq0 & vc4cl_bitcast_int(M_PI_F);
// sign(y) * pi/2 if x = 0, y != 0
int_t res3 = (x == 0) & ~yEq0 & vc4cl_bitcast_int(s * M_PI_2_F);
// undefined if x = 0, y = 0
return vc4cl_bitcast_float(res0 | res1 | res2 | res3);
})
/**
* Expected behavior:
*
* atanh(+-0) = +-0
* atanh(+-1) = +-Inf
* atanh(x) = NaN for |x| > 1
*/
SIMPLE_1(float, atanh, float, val, ((result_t) 0.5f) * log((1 + val) / (1 - val)))
/**
* Expected behavior:
*
* atanpi(+-0) = +-0
* atanpi(+-Inf) = +-0.5
*/
SIMPLE_1(float, atanpi, float, val, isnan(val) ? val : atan(val) * M_1_PI_F)
/**
* Expected behavior:
*
* atan2pi(+-0, -0) = +-1
* atan2pi(+-0, +0) = +-0
* atan2pi(+-0, x) = +-1 for x < 0
* atan2pi(+-0, x) = +-0 for x > 0
* atan2pi(y, +-0) = -0.5 for y < 0
* atan2pi(y, +-0) = 0.5 for y > 0
* atan2pi(+-y, -Inf) = +-1 for y > 0
* atan2pi(+-y, +Inf) = +-0 for y > 0
* atan2pi(+-Inf, x) = +-0.5
* atan2pi(+-Inf, -Inf) = +-0.75
* atan2pi(+-Inf, +Inf) = +-0.25
*/
SIMPLE_2(float, atan2pi, float, x, float, y, atan2(x, y) * M_1_PI_F)
/**
* Expected behavior:
*
* cbrt(+-0) = +-0
* cbrt(+-Inf) = +-Inf
*/
// TODO different algorithm?
// e.g. http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt (acbrt1 with a few more Newton steps, but requires several
// floating-point divisions)
SIMPLE_1(float, cbrt, float, val, pow(val, 1.0f / 3.0f))
/**
* Expected behavior:
*
* ceil(+-0) = +-0
* ceil(+-Inf) = +-Inf
* ceil(x) = -0 for -1 < x < 0
*/
COMPLEX_1(float, ceil, float, val, {
//"Round to integer toward + infinity. "
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
// all floating point values |val| > 2^23 are exact numbers
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
// |val| < 2^23 fits into integer
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
// if the truncation is smaller than val (positive numbers), add 1 to round up
result_t ceiling = truncated + (truncated < val ? (result_t)1.0f: (result_t)0.0f);
// fix for ceil([-0.9, -0.0]) = -0.0
result_t result = copysign(ceiling, val);
// fix for negative subnormal values
result = result < val ? result + (result_t) 1.0f : result;
// deciding here which value to return saves us up to two jumps
return tooBig ? val : result;
})
//"Returns x with its sign changed to match the sign of y. "
// sign = y & 0x80000000
// tmp = x & 0x7FFFFFFF
// return tmp | sign
SIMPLE_2(float, copysign, float, x, float, y,
vc4cl_bitcast_float((vc4cl_bitcast_uint(y) & 0x80000000) | (vc4cl_bitcast_uint(x) & 0x7FFFFFFF)))
COMPLEX_1(float, cos_pade, float, val, {
/*
* Pade approximation
* (https://ir.canterbury.ac.nz/bitstream/handle/10092/8886/brookes_thesis.pdf?sequence=1, page 86)
*
* Has a relative error of far less than 2^23 (1 ULP) for [-PI/2, PI/2], then rises fast
*
* Alternative:
* (see alternative for sin_pade)
*/
arg_t a = 45469.0f * val * val * val * val * val * val * val * val;
arg_t b = 7029024.0f * val * val * val * val * val * val;
arg_t c = 348731040.0f * val * val * val * val;
arg_t d = 5269904640.0f * val * val;
arg_t e = 10983772800.0f;
arg_t f = 9336.0f * val * val * val * val * val * val;
arg_t g = 2064720.0f * val * val * val * val;
arg_t h = 221981760.0f * val * val;
arg_t up = a - b + c - d + e;
arg_t down = f + g + h + e;
return up / down;
})
/**
* Expected behavior:
*
* cos(+-0) = 1
* cos(+-Inf) = NaN for |x| > 1
*/
COMPLEX_1(float, cos, float, val, {
/*
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
*
* We use argument reduction to bring it into range [-pi/2, pi/2] in which range the Pade approximation is accurate.
*/
// TODO normalization into [-pi/2, pi/2] is too inaccurate for large values!
// Since cosine has a period of 2pi, these rewrites do not change the result (rounding error excluded):
// bring into range [-2pi, 2pi]
arg_t modifiedVal = fmod(val, 2.0f * M_PI_F);
// bring into range [-pi, pi]
modifiedVal = modifiedVal < -M_PI_F ? modifiedVal + (2.0f * M_PI_F) : modifiedVal;
modifiedVal = modifiedVal > M_PI_F ? modifiedVal - (2.0f * M_PI_F) : modifiedVal;
// We move by half a period, so we must invert the sign of the result
// bring into range [-pi/2, pi/2]
int_t invertSign = modifiedVal < -M_PI_2_F || modifiedVal > M_PI_2_F;
modifiedVal = modifiedVal < -M_PI_2_F ? modifiedVal + M_PI_F : modifiedVal;
modifiedVal = modifiedVal > M_PI_2_F ? modifiedVal - M_PI_F : modifiedVal;
arg_t result = cos_pade(invertSign ? -modifiedVal : modifiedVal);
result = invertSign ? -result : result;
result = isinf(val) ? copysign(nan(0), val) : result;
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* cosh(+-0) = 1
* cosh(+-Inf) = +-Inf
*/
COMPLEX_1(float, cosh, float, val, {
/*
* https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence
*
* This version (the Taylor-series up to x^16/16!) has following accuracy:
*
* cosh(PI) -> error of 1.5*10^-7 < ulp(PI) = 2*10^-7
* cosh(0.5f) -> error of < 10^-9 < ulp(0.5f) = 5*10^-8
* cosh(0.0f) -> error of < 10^-46 < ulp(0.0f) = 1*10^-45
*
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
*/
const result_t fac2 = 1.0f / factorial(2);
const result_t fac4 = 1.0f / factorial(4);
const result_t fac6 = 1.0f / factorial(6);
const result_t fac8 = 1.0f / factorial(8);
const result_t fac10 = 1.0f / factorial(10);
const result_t fac12 = 1.0f / factorial(12);
const result_t fac14 = 1.0f / factorial(14);
const result_t fac16 = 1.0f / factorial(16);
/*
* Argument reduction
*
* Matters Computational, section 32.2.3
*/
const uint n = 12;
// r = x / 2^n
const result_t inv2N = 1.0f / (1 << n);
result_t r = val * inv2N;
// C = cosh(r) - 1
result_t x = r;
result_t x2 = x * x;
result_t tmp1 = x2 * fac2;
// cosh(x) = 1 + x^2/2! + ...
result_t tmp = 1 + tmp1;
result_t x4 = x2 * x * x;
tmp1 = x4 * fac4;
//... + x^4/4! + ...
tmp = x + tmp1;
result_t x6 = x4 * x * x;
tmp1 = x6 * fac6;
//... + x^6/6! + ...
tmp = tmp + tmp1;
result_t x8 = x6 * x * x;
tmp1 = x8 + fac8;
//... + x^8/8! + ...
tmp = tmp + tmp1;
result_t x10 = x8 * x * x;
tmp1 = x10 * fac10;
//... + x^10/10! + ...
tmp = tmp + tmp1;
result_t x12 = x10 * x * x;
tmp1 = x12 + fac12;
//... + x^12/12! + ...
tmp = tmp + tmp1;
result_t x14 = x12 * x * x;
tmp1 = x14 * fac14;
//... + x^14/14! + ...
tmp = tmp + tmp1;
result_t x16 = x14 * x * x;
tmp1 = x16 + fac16;
//... + x^16/16!
tmp = tmp + tmp1;
result_t C = tmp - 1;
for(uint i = 0; i < n; ++i)
C = 2 * (C + 1) * (C + 1) - 2;
return C + 1;
})
/**
* Expected behavior:
*
* cospi(+-0) = 1
* cospi(x) = +0 for x = n + 0.5 and n is integer
* cospi(+-Inf) = NaN
*/
SIMPLE_1(float, cospi, float, val, cos(val *M_PI_F))
/**
* Expected behavior:
*
* erfc(-Inf) = 2
* erfc(+Inf) = 0
*/
SIMPLE_1(float, erfc, float, x, 1 - erf(x))
/**
* Expected behavior:
*
* erf(+-0) = +-0
* erf(+-Inf) = +-1
*/
COMPLEX_1(float, erf, float, x, {
/*
* https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
*
* has a maximal error of 1.2 * 10^-7
*
* alternative: http://mathworld.wolfram.com/Erf.html
*/
const result_t t = 1 / (1 + 0.5f * fabs(x));
const result_t r = t *
exp(-x * x - 1.26551223f + 1.00002368f * t + 0.37409196f * t * t + 0.09678418f * t * t * t -
0.18628806f * t * t * t * t + 0.27886807f * t * t * t * t * t - 1.13520398f * t * t * t * t * t * t +
1.48851587f * t * t * t * t * t * t * t - 0.82215223f * t * t * t * t * t * t * t * t +
0.17087277f * t * t * t * t * t * t * t * t * t);
return x < 0 ? r - 1 : 1 - r;
})
/**
* Expected behavior:
*
* exp(+-0) = 1
* exp(-Inf) = 0
* exp(+Inf) = +Inf
*
* Chebyshev interpolation with monomial basis and range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-18
*/
COMPLEX_1(float, exp, float, val, {
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
// Use floor() instead of ceil() to not reach Inf later on for e^(~10^38)
int_t k = vc4cl_ftoi((positive / M_LN2_F) - 0.5f);
arg_t kFloat = vc4cl_itof(k);
// calculate in extended precision to reduce rounding error
arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));
arg_t pn = 1.143364767943110e-11;
pn = pn * r + 1.632461784798319e-10f;
pn = pn * r + 2.088459690899721e-9f;
pn = pn * r + 2.504861486483735e-8f;
pn = pn * r + 2.755715675968011e-7f;
pn = pn * r + 2.755734045527853e-6f;
pn = pn * r + 2.480158866546844e-5f;
pn = pn * r + 1.984126978734782e-4f;
pn = pn * r + 0.001388888888388f;
pn = pn * r + 0.008333333333342f;
pn = pn * r + 0.041666666666727f;
pn = pn * r + 0.166666666666680f;
pn = pn * r + 0.500000000000002f;
pn = pn * r + 1.000000000000000f;
pn = pn * r + 1.000000000000000f;
pn = pn * vc4cl_pow2(k);
result_t result = val < 0 ? 1 / pn : pn;
result = vc4cl_is_zero(val) ? (result_t)1.0f : result;
result = val <= (result_t)-88.0f ? (result_t)0.0f : result;
result = val >= (result_t)89.0f ? (result_t)INFINITY : result;
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* exp2(+-0) = 1
* exp2(-Inf) = 0
* exp2(+Inf) = +Inf
*/
COMPLEX_1(float, exp2, float, val, {
// 2^x = e^(x * ln(2))
result_t result = exp(val * M_LN2_F);
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* exp10(+-0) = 1
* exp10(-Inf) = 0
* exp10(+Inf) = +Inf
*/
COMPLEX_1(float, exp10, float, val, {
// 10^x = e^(x / log10(e))
result_t result = exp(val * M_1_LOG10_E_F);
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* expm1(+-0) = +-0
* expm1(-Inf) = -1
* expm1(+Inf) = +Inf
*/
// e^x - 1.0
COMPLEX_1(float, expm1, float, val, {
result_t result = exp(val) - 1.0f;
return isnan(val) ? val : result;
})
/**
* Expected behavior:
*
* fabs(+-0) = +0
* fabs(+-Inf) = +Inf
*/
SIMPLE_1(float, fabs, float, val, vc4cl_fmaxabs(val, 0.0f))
/**
* Expected behavior:
*
* fdim(x, NaN) = NaN
* fdim(NaN, y) = NaN
*/
//"x-y if x>y, +0 if x is less than or equal to y."
COMPLEX_2(float, fdim, float, x, float, y, {
result_t result = x > y ? x - y : 0.0f;
result = isnan(x) ? x : result;
return isnan(y) ? y : result;
})
/**
* Expected behavior:
*
* floor(+-0) = +-0
* floor(+-Inf) = +-Inf
*/
COMPLEX_1(float, floor, float, val, {
//" Round to integer toward negative infinity. "
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
// all floating point values |val| > 2^23 are exact numbers
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
// |val| < 2^23 fits into integer
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
// if the truncation is greater than val (negative numbers), subtract 1 to round down
result_t floor_val = truncated - (truncated > val ? (result_t)1.0f : (result_t)0.0f);
// deciding here which value to return saves us up to two jumps
return tooBig ? val : floor_val;
})
/**
* Expected behavior:
*
* fma(x, y, z) = xy + z
* fma(x, y, z) = NaN for x = 0 or y = 0
* fma(x, y, z)= NaN for x = Inf, y = Inf, z = Inf
*/
// TODO not "infinitely precise product" and maybe not "correctly rounded"
SIMPLE_3(float, fma, float, a, float, b, float, c, (a * b) + c)
// The fmax/min ALU operations (vc4cl_fmax/vc4cl_fmin) handle NaNs as bigger then all, we need to handle them as:
// " If one argument is a NaN, fmin/fmax() returns the other argument. If both arguments are NaNs, fmin/fmax() returns a NaN."
SIMPLE_2(float, fmax, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmax(x, y))
SIMPLE_2_SCALAR(float, fmax, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmax(x, y))
SIMPLE_2(float, fmin, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmin(x, y))
SIMPLE_2_SCALAR(float, fmin, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmin(x, y))
/**
* Expected behavior:
*
* fmod(+-0, y) = +-0 for y != 0
* fmod(x, y) = NaN for x = +-Inf or y = 0
* fmod(x, +-Inf) = x
* fmod(+-0, NaN) = NaN
*/
//"Modulus. Returns x - y * trunc(x/y)"
COMPLEX_2(float, fmod, float, x, float, y, {
result_t result = x - y * trunc(x / y);
result = (isinf(x) || sign(y) == (result_t)0.0f) ? (result_t)nan(0) : result;
result = (isinf(y) || isnan(x)) ? x : result;
return isnan(y) ? y : result;
})
/**
* Expected behavior:
*
* 0 <= fract(x, iptr) < 1
* fract(+0, iptr) = +0, iptr = +0
* fract(-0, iptr) = -0, iptr = -0
* fract(+Inf, iptr) = +0, iptr = +Inf
* fract(-Inf, iptr) = -0, iptr = -Inf
* fract(NaN, iptr) = NaN, iptr = NaN
*/
COMPLEX_2(float, fract, float, val, __global float, *iptr, {
//"Returns fmin( x floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr."
result_t tmp = floor(val);
*iptr = tmp;
tmp = fmin(val - tmp, 0x1.fffffep-1f);
return isnan(val) ? val : tmp;
})
COMPLEX_2(float, fract, float, val, __local float, *iptr, {
//"Returns fmin( x floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr."
result_t tmp = floor(val);
*iptr = tmp;
tmp = fmin(val - tmp, 0x1.fffffep-1f);
return isnan(val) ? val : tmp;
})
COMPLEX_2(float, fract, float, val, __private float, *iptr, {
//"Returns fmin( x floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr."
result_t tmp = floor(val);
*iptr = tmp;
tmp = fmin(val - tmp, 0x1.fffffep-1f);
return isnan(val) ? val : tmp;
})
/**
* Expected behavior:
*
* frexp(+-0, exp) = +-0, exp = 0
* frexp(+-Inf, exp) = +-Inf, exp = 0
* frexp(NaN, exp) = NaN, exp = 0
*/
COMPLEX_2(float, frexp, float, x, __global int, *exp, {
// Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c
int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF;
int_t exponent = ((absBits >> 23) & 0xFF) - 126;
// we need to support subnormals here, since they are only used as input, not output
int_t mantissa = absBits & (int_t) 0x007FFFFF;
int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23));
// the next lines are required to fix-up the mantissa for our added subnormal exponent
int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset);
mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0);
exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = mantissa | (int_t)0x3F000000;
int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x);
*exp = specialCase ? (int_t)0 : exponent;
return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x);
})
COMPLEX_2(float, frexp, float, x, __local int, *exp, {
// Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c
int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF;
int_t exponent = ((absBits >> 23) & 0xFF) - 126;
// we need to support subnormals here, since they are only used as input, not output
int_t mantissa = absBits & (int_t) 0x007FFFFF;
int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23));
// the next lines are required to fix-up the mantissa for our added subnormal exponent
int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset);
mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0);
exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = mantissa | (int_t)0x3F000000;
int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x);
*exp = specialCase ? (int_t)0 : exponent;
return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x);
})
COMPLEX_2(float, frexp, float, x, __private int, *exp, {
// Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c
int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF;
int_t exponent = ((absBits >> 23) & 0xFF) - 126;
// we need to support subnormals here, since they are only used as input, not output
int_t mantissa = absBits & (int_t) 0x007FFFFF;
int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23));
// the next lines are required to fix-up the mantissa for our added subnormal exponent
int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset);
mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0);
exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = mantissa | (int_t)0x3F000000;
int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x);
*exp = specialCase ? (int_t)0 : exponent;
return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x);
})
/**
* Expected behavior:
*
* hypot(x, y) = hypot(y, x) = hypot(x, -y)
* hypot(x, +-0) = fabs(x)
* hypot(+-Inf, y) = +Inf
*/
SIMPLE_2(float, hypot, float, x, float, y, sqrt(x *x + y * y))
/**
* Expected behavior (C99 standard):
*
* ilogb(0) = FP_ILOGB0
* ilogb(+-Inf) = INT_MAX
* ilogb(+-NaN) = FP_ILOGBNAN
*/
COMPLEX_1(int, ilogb, float, x, {
//"Return the exponent as an integer value."
// https://en.wikipedia.org/wiki/Single-precision_floating-point_format
result_t bitcast = vc4cl_bitcast_int(x);
// deduct exponent offset
int_t result = ((bitcast >> 23) & 0xFF) - 127;
// we need to support subnormals here, since they are only used as input, not output
int_t mantissa = bitcast & (int_t) 0x007FFFFF;
result = result == -127 ? -(vc4cl_clz(mantissa) - (32 - 23)) - 127 : result;
result = vc4cl_is_zero(x) ? (result_t)FP_ILOGB0 : result;
result = isinf(x) ? (result_t)INT_MAX : result;
return isnan(x) ? (result_t)FP_ILOGBNAN : result;
})
/**
* Expected behavior:
*
* ldexp(+-0, n) = +-0
* ldexp(x, 0) = x
* ldexp(+-Inf, n) = +-Inf
*/
//"Multiply x by 2 to the power k."
COMPLEX_2(float, ldexp, float, x, int, k, {
// Taken from MESA (https://gitlab.freedesktop.org/mesa/mesa/-/blob/master/src/compiler/nir/nir_opt_algebraic.py)
// Alternatively in https://git.musl-libc.org/cgit/musl/tree/src/math/scalbnf.c
int_t exp = min(max(k, -254), 254);
int_t pow2_1 = ((exp >> 1) + 127) << 23;
int_t pow2_2 = ((exp - (exp >> 1)) + 127) << 23;
result_t result = x * vc4cl_bitcast_float(pow2_1) * vc4cl_bitcast_float(pow2_2);
return vc4cl_is_inf_nan(x) ? x : result;
})
SIMPLE_2_SCALAR(float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof(1 << k) : 1.0f / vc4cl_itof(1 << -k)))
/**
* Expected behavior:
*
* lgamma(1) = 0
* lgamma(2) = 0
* lgamma(x) = +Inf for x <= 0 and x integer
* lgamma(-Inf) = +Inf
* lgamma(+Inf) = +Inf
*/
COMPLEX_1(float, lgamma, float, val, {
//"Returns the natural logarithm of the absolute value of the gamma function"
// see Numerical Recipes, chapter 6.1
// has error of < 2 * 10^-10
// XXX only for val > 0
// TODO far too inaccurate/wrong?
result_t x;
result_t y;
result_t tmp;
result_t ser;
y = x = val;
tmp = x + 5.5f;
tmp -= (x + 0.5f) * log(tmp);
ser = 1.000000000190015f;
y += 1;
ser += 76.18009172947146f / y;
y += 1;
ser += -86.50532032941677f / y;
y += 1;
ser += 24.01409824083091f / y;
y += 1;
ser += -1.231739572450155f / y;
y += 1;
ser += 0.1208650973866179e-2f / y;
y += 1;
ser += -0.5395239384953e-5f / y;
return -tmp + log((2.5066282746310005f) * ser / x);
})
/**
* Expected behavior:
*
* lgamma_r(x, signp) -> signp = 0 for x < 0
*/
COMPLEX_2(float, lgamma_r, float, x, __global int, *signp, {
// TODO better way?
result_t tmp = lgamma(x);
*signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1;
return lgamma(x < 0.0f ? (-1.0f * x) : x);
})
COMPLEX_2(float, lgamma_r, float, x, __local int, *signp, {
// TODO better way?
result_t tmp = lgamma(x);
*signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1;
return lgamma(x < 0.0f ? (-1.0f * x) : x);
})
COMPLEX_2(float, lgamma_r, float, x, __private int, *signp, {
// TODO better way?
result_t tmp = lgamma(x);
*signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1;
return lgamma(x < 0.0f ? (-1.0f * x) : x);
})
COMPLEX_1(float, log1p_pade, float, val, {
/*
* Calculates ln(1 + x) via Pade approximation
* (https://ir.canterbury.ac.nz/bitstream/handle/10092/8886/brookes_thesis.pdf?sequence=1, page 72/96)
*
* Has a relative error of less than 2*2^23 (2 ULP) for range [-0.5, 2], then rises fast
*/
// g(x) = (49x^6+1218x^5+7980x^4+20720x^3+23100x^2+9240x)/(10x^6+420x^5+4200x^4+16800x^3+31500x^2+27720x+9240)
arg_t a = 49.0f * val * val * val * val * val * val;
arg_t b = 1218.0f * val * val * val * val * val;
arg_t c = 7980.0f * val * val * val * val;
arg_t d = 20720.0f * val * val * val;
arg_t e = 23100.0f * val * val;
arg_t f = 9240.0f * val;
arg_t g = 10.0f * val * val * val * val * val * val;
arg_t h = 420.0f * val * val * val * val * val;
arg_t i = 4200.0f * val * val * val * val;
arg_t k = 16800.0f * val * val * val;
arg_t l = 31500.0f * val * val;
arg_t m = 27720.0f * val;
arg_t n = 9240.0f;
arg_t up = a + b + c + d + e + f;
arg_t down = g + h + i + k + l + m + n;
return up / down;
})
/**
* Expected behavior:
*
* log2(+-0) = -Inf
* log2(1) = 0
* log2(x) = Nan for x < 0
* log2(+Inf) = +Inf
*/
COMPLEX_1(float, log2, float, val, {
/* log2(x) = log2(M * e^E) = E + log2(M) */
int_t bitcast = vc4cl_bitcast_int(val);
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
arg_t mantissa = vc4cl_bitcast_float(tmp);
/*
* Alternatively could use SFU and Newton-Raphson steps:
* f(x) = log2(x) - y
* root: f(x) = 0 -> log2(x) = y
* f'(x) = 1/(x * ln(2))
* xn+1 = xn - (log2(xn) - y) / (1/(xn * ln(2)))
* xn+1 = xn - xn * ln(2) * (log2(x) - y)
* => but don't know y which is exact log2(x)!!!
*
* Alternatively could approximate root of: f(y) = 2^y - x, need exact exp2(x)
*/
// result_t logMantissa = vc4cl_sfu_log2(mantissa);
// log2(M*2^E) = E + log2(M) = E + ln(M) / ln(2) = E + ln1p(M - 1) / ln(2)
result_t logMantissa = log1p_pade(mantissa - 1.0f) / M_LN2_F;
result_t result = vc4cl_itof(exponent) + logMantissa;
result = vc4cl_is_inf_nan(val) ? val : result;
result = signbit(val) ? (result_t)nan(0) : result;
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
return result;
})
/**
* Expected behavior:
*
* log(+-0) = -Inf
* log(1) = 0
* log(x) = Nan for x < 0
* log(+Inf) = +Inf
*/
COMPLEX_1(float, log, float, val, {
/*
* Other sources/calculations:
* - Matters computational (sections 32.1.1 and 32.1.3)
* - https://math.stackexchange.com/questions/977586/is-there-an-approximation-to-the-natural-log-function-at-large-values
* - https://math.stackexchange.com/questions/1382070/iterative-calculation-of-log-x
* - https://evoq-eval.siam.org/Portals/0/Publications/SIURO/Vol7/New_Method_for_Approximating_Logarithms.pdf?ver=2018-04-06-151916-493
* - https://cs.stackexchange.com/questions/91185/compute-ex-given-starting-approximation
* - https://shoueiko.wordpress.com/2017/05/01/approximation-of-log-2/
* - https://math.stackexchange.com/questions/3381629/what-is-the-fastest-algorithm-for-finding-the-natural-logarithm-of-a-big-number
* - https://stackoverflow.com/questions/39821367/very-fast-approximate-logarithm-natural-log-function-in-c
*/
/* log(M * 2^E) = log(M) + E log(2) */
int_t bitcast = vc4cl_bitcast_int(val);
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
arg_t mantissa = vc4cl_bitcast_float(tmp);
result_t logMantissa = log1p_pade(mantissa - 1.0f);
result_t result = vc4cl_itof(exponent) * M_LN2_F + logMantissa;
result = vc4cl_is_inf_nan(val) ? val : result;
result = signbit(val) ? (result_t)nan(0) : result;
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
return result;
})
/**
* Expected behavior:
*
* log10(+-0) = -Inf
* log10(1) = 0
* log10(x) = Nan for x < 0
* log10(+Inf) = +Inf
*/
COMPLEX_1(float, log10, float, val, {
/* log10(M * 2^E) = log10(M) + E log10(2) */
int_t bitcast = vc4cl_bitcast_int(val);
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
arg_t mantissa = vc4cl_bitcast_float(tmp);
/* log10(M) = log(M) / log(10) */
result_t logMantissa = log1p_pade(mantissa - 1.0f) / M_LN10_F;
const arg_t log2_10 = M_LN2_F / M_LN10_F;
result_t result = vc4cl_itof(exponent) * log2_10 + logMantissa;
result = vc4cl_is_inf_nan(val) ? val : result;
result = signbit(val) ? (result_t)nan(0) : result;
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
return result;
})
/**
* Expected behavior:
*
* log1p(+-0) = +-0
* log1p(-1) = -Inf
* log1p(x) = Nan for x < -1
* log1p(+Inf) = +Inf
*/
//TODO directly use log1p_pade(), needs argument reduction
COMPLEX_1(float, log1p, float, val, {
result_t result = log(1.0f + val);
result = vc4cl_is_zero(val) ? val : result;
result = val == (arg_t)-1.0f ? (result_t)-INFINITY : result;
result = val < (arg_t)-1.0f ? (result_t)nan(0) : result;
result = vc4cl_is_inf_nan(val) ? val : result;
return result;
})
/**
* Expected behavior:
*
* logb(+-0) = -Inf
* logb(+-Inf) = +Inf
*/
COMPLEX_1(float, logb, float, x, {
int_t bitcast = vc4cl_bitcast_int(x);
// deduct exponent offset
int_t exponent = ((bitcast >> 23) & 0xFF) - 127;
// we need to support subnormals here, since they are only used as input, not output
int_t mantissa = bitcast & (int_t) 0x007FFFFF;
exponent = exponent == -127 ? -(vc4cl_clz(mantissa) - (32 - 23)) - 127 : exponent;
result_t result = vc4cl_itof(exponent);
result = vc4cl_is_zero(x) ? (result_t)-INFINITY : result;
result = isinf(x) ? fabs(x) : result;
return isnan(x) ? x : result;
})
SIMPLE_3(float, mad, float, a, float, b, float, c, (a * b) + c)
//"Returns x if |x|>|y|, y if |y|>|x|, otherwise fmax(x, y)"
COMPLEX_2(float, maxmag, float, x, float, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
result_t tmp = fmax(x, y);
result_t other = fabs(y) > fabs(x) ? y : tmp;
return fabs(x) > fabs(y) ? x : other;
})
//"Returns x if |x|<|y|, y if |y|<|x|, otherwise fmin(x, y)"
COMPLEX_2(float, minmag, float, x, float, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
result_t tmp = fmin(x, y);
result_t other = fabs(y) < fabs(x) ? y : tmp;
return fabs(x) < fabs(y) ? x : other;
})
/**
* Expected behavior:
*
* modf(+-x, iptr) = value with same sign as x
* modf(+-Inf, iptr) = +-0, iptr = +-Inf
* modf(NaN, iptr) = NaN, iptr = NaN
*/
// taken OpenCL C 1.2 specification, 7.5.2. Changes to C99 TC2 Behavior
COMPLEX_2(float, modf, float, value, __global float, *iptr, {
result_t tmp = trunc(value);
*iptr = isnan(value) ? value : tmp;
result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value);
return isnan(value) ? value : result;
})
COMPLEX_2(float, modf, float, value, __local float, *iptr, {
result_t tmp = trunc(value);
*iptr = isnan(value) ? value : tmp;
result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value);
return isnan(value) ? value : result;
})
COMPLEX_2(float, modf, float, value, __private float, *iptr, {
result_t tmp = trunc(value);
*iptr = isnan(value) ? value : tmp;
result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value);
return isnan(value) ? value : result;
})
SIMPLE_1(float, nan, uint, nancode, vc4cl_bitcast_float(NAN | nancode))
/**
* Expected behavior:
*
* nextafter(+smallest normal, y < +smallest normal) = +0
* nextafter(-smallest normal, y > -smallest normal) = -0
* nextafter(-0, y > 0) = smallest positive normal value
* nextafter(0, y < 0) = smallest negative normal value
*/
COMPLEX_2(float, nextafter, float, x, float, y, {
int_t ix = vc4cl_bitcast_int(x);
int_t iy = vc4cl_bitcast_int(y);
/* x > y -> x -= ulp otherwise x += ulp */
int_t xPos = ix > iy ? ix - 1 : ix + 1;
/* x < y -> x -= ulp otherwise x += ulp */
int_t xNeg = iy >= 0 || ix > iy ? ix - 1 : ix + 1;
int_t xNotY = ix >= 0 ? /* x > 0 */ xPos : /* x < 0 */ xNeg;
int_t res = x == y ? iy : xNotY;
result_t result = vc4cl_bitcast_float(res);
result = (x == (result_t)-0.0f && y > 0.0f) ? (result_t)FLT_MIN : result;
result = (x == (result_t)0.0f && y < 0.0f) ? (result_t)-FLT_MIN : result;
result = (x == (result_t)FLT_MIN && y < (result_t)FLT_MIN) ? (result) +0.0f : result;
result = (x == (result_t)-FLT_MIN && y > (result_t)-FLT_MIN) ? (result) -0.0f : result;
return isnan(x) ? x : (isnan(y) ? y : result);
})
/**
* Expected behavior:
*
* pow(+-0, y) = +-Inf for y odd and y < 0
* pow(+-0, y) = +Inf for y even and y < 0
* pow(+-0, y) = +-0 for y odd and y > 0
* pow(+-0, y) = 0 for y even and y > 0
* pow(-1, +-Inf) = 1
* pow(1, y) = 1
* pow(x, +-0) = 1
* pow(x, y) = NaN for x < 0 and y not an integer
* pow(x, -Inf) = +Inf for |x| < 0
* pow(x, -Inf) = 0 for |x| > 0
* pow(x, +Inf) = 0 for |x| < 0
* pow(x, +Inf) = 0 for |x| > 0
* pow(-Inf, y) = -0 for y odd and y < 0
* pow(-Inf, y) = 0 for y even and y < 0
* pow(-Inf, y) = -Inf for y odd and y > 0
* pow(-Inf, y) = +Inf for y even and y > 0
* pow(+Inf, y) = 0 for y < 0
* pow(+Inf, y) = +Inf for y > 0
* pow(+-0, -Inf) =+Inf
*/
// for pow, see also https://stackoverflow.com/questions/4518011/algorithm-for-powfloat-float
COMPLEX_2(float, pow, float, x, float, y, {
result_t tmp = powr(x, y);
return y < 0.0f ? (result_t) 1.0f / tmp : tmp;
})
/**
* Expected behavior:
*
* pown(x, 0) = 1
* pown(+-0, n) = +-Inf for odd n and n < 0
* pown(+-0, n) = +Inf for even n and n < 0
* pown(+-0, n) = +-0 for odd n and n > 0
* pown(+-0, n) = 0 for even n and n > 0
*/
// TODO is there a way to not need to calculate both versions? Or at least calculate both at once?!
SIMPLE_2(float, pown, float, x, int, n,
n < 0 ? (1.0f / fast_pown(x, vc4cl_bitcast_uint(-n), 32)) : fast_pown(x, vc4cl_bitcast_uint(n), 32))
/**
* Expected behavior:
*
* powr(x, +-0) = 1 for x > 0
* powr(+-0, y) = +Inf for y < 0
* powr(+-0, -Inf) = +Inf
* powr(+-0, y) = +0 for y > 0
* powr(1, y) = 1
* powr(x, y) = NaN for x < 0
* powr(+-0, +-0) = NaN
* powr(+Inf, +-0) = NaN
* powr(1, +-Inf) = NaN
* powr(x, NaN) = NaN for x >= 0
* powr(NaN, y) = NaN
*/
//"Compute x to the power y, where x is >= 0."
// x^y = e^(y * ln(x))
SIMPLE_2(float, powr, float, x, float, y, exp(y *log(x)))
/**
* Expected behavior:
*
* remainder(x, 0) = NaN or 0
* remainder(Inf, y) = NaN
* remainder(x, Inf) = x
*/
COMPLEX_2(float, remainder, float, x, float, y, {
// TODO correct?
result_t k = rint(x / y);
result_t result = x - k * y;
result = vc4cl_is_zero(y) ? (result_t)-nan(0) : result;
result = isinf(x) ? (result_t)nan(0) : result;
result = isnan(x) ? x : result;
return isnan(y) ? y : result;
})
/**
* Expected behavior:
*
* remquo(x, y, quo) = NaN, quo = 0 for x = +-Inf or y = 0 or x = NaN or y = NaN
* exp2(-Inf) = 0
* exp2(+Inf) = +Inf
*/
// taken from pocl (https://github.com/pocl/pocl/blob/master/lib/kernel/vecmathlib-pocl/remquo.cl)
COMPLEX_3(float, remquo, float, x, float, y, __global int, *quo, {
result_t k = rint(x / y);
*quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k));
return x - k * y;
})
COMPLEX_3(float, remquo, float, x, float, y, __local int, *quo, {
result_t k = rint(x / y);
*quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k));
return x - k * y;
})
COMPLEX_3(float, remquo, float, x, float, y, __private int, *quo, {
result_t k = rint(x / y);
*quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k));
return x - k * y;
})
/**
* Expected behavior:
*
* rint(x) = -0 for -0.5 <= x < 0
*/
COMPLEX_1(float, rint, float, val, {
//" Round to nearest even integer. "
// round like round, but decides on nearest even for halfway cases
// TODO always round to nearest even (e.g. 0.4 to 2.0) or just for half-way cases?
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
// |val| < 2^23 fits into integer
int_t integer = vc4cl_ftoi(val);
result_t truncated = vc4cl_itof(integer);
// for negative numbers, truncated is larger than value, for positive it is smaller
result_t diff = val - truncated;
// -> if the |difference| is > 0.5, subtract/add 1
result_t correction = 0.0f + (diff < -0.5f ? (result_t)-1.0f : (result_t)0.0f) + (diff > 0.5f ? (result_t)1.0f : (result_t)0.0f);
// if the value is exactly the half, round to nearest even
int_t isHalfWay = diff == 0.5f || diff == -0.5f;
// if the truncated integer is even, use it. Otherwise (lsb of integer is 1) subtract/add 1
result_t halfWayCorrection = 0 + vc4cl_itof(integer & 1) * sign(val);
result_t result = truncated + correction + vc4cl_itof(isHalfWay) * halfWayCorrection;
// deciding here which value to return saves us up to two jumps
return tooBig ? val : result;
})
/**
* Expected behavior:
*
* rootn(+-0, n) = +-Inf for odd n < 0
* rootn(+-0, n) = +Inf for even n < 0
* rootn(+-0, n) = +-0 for odd n > 0
* rootn(+-0, n) = +0 for even n > 0
* rootn(x, n) = NaN for x < 0 and n even
* rootn(x, 0) = NaN
*/
// TODO different algorithm?
SIMPLE_2(float, rootn, float, x, int, y, pow(x, (arg0_t) 1.0f / vc4cl_itof(y)))
// see: https://en.wikipedia.org/wiki/Nth_root_algorithm
/**
* Expected behavior:
*
* round(+-0) = +-0
* round(+-Inf) = +-Inf
* round(x) = -0 for -0.5 < x < 0
*/
COMPLEX_1(float, round, float, val, {
//" Return the integral value nearest to x rounding halfway cases away from zero, regardless of the current rounding
// direction. "
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
// |val| < 2^23 fits into integer
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
// for negative numbers, truncated is larger than value, for positive it is smaller
result_t diff = val - truncated;
// -> if the |difference| is >= 0.5, subtract/add 1
result_t correction = 0.0f + (diff <= -0.5f ? (result_t)-1.0f : (result_t)0.0f) + (diff >= 0.5f ? (result_t)1.0f : (result_t)0.0f);
result_t result = truncated + correction;
// deciding here which value to return saves us up to two jumps
return tooBig ? val : result;
})
COMPLEX_1(float, rsqrt, float, x, {
// Using Goldschmidt's algorithm with 2 iterations
// see http://www.informatik.uni-trier.de/Reports/TR-08-2004/rnc6_12_markstein.pdf (formula 9)
arg_t y = vc4cl_sfu_rsqrt(x);
arg_t g = x * y;
arg_t h = y * (arg_t)0.5f;
for(unsigned i = 0; i < 2; ++i) {
arg_t r = (arg_t)0.5f - g * h;
g = g + g * r;
h = h + h * r;
}
// rsqrt(+-0) = +-Inf
result_t result = vc4cl_is_zero(x) ? copysign((result_t)INFINITY, x) : (arg_t)2.0f * h;
// rsqrt(NaN) = NaN
result = isnan(x) ? x : result;
// rsqrt(Inf) = 0
result = isinf(x) ? (result_t)0.0f : result;
// rsqrt(x) for x < 0 = -NaN
result = x < (result_t)0.0f ? copysign((result_t)nan(0), -1.0f) : result;
return result;
})
COMPLEX_1(float, sin_pade, float, val, {
/*
* Pade approximation
* (https://en.wikipedia.org/wiki/Pad%C3%A9_approximant#Examples)
*
* Has a relative error of less than 2^23 (1 ULP) for range [-PI/2, PI/2], then rises fast
*
* Alternative:
* Mendenhall algorithm to calculate both sine and cosine:
* - Is also exact enough for range [-PI/2, PI/2]
* - https://arxiv.org/pdf/cs/0406049.pdf
* -
* https://github.com/broadcomCM/android_hardware_broadcom_brcm_usrlib/blob/ics/dag/vmcsx/middleware/khronos/glsl/glsl_mendenhall.c
*/
arg_t a = 12671.0f / 4363920.0f;
arg_t b = 2363.0f / 18183.0f;
arg_t c = 445.0f / 12122.0f;
arg_t d = 601.0f / 872784.0f;
arg_t e = 121.0f / 16662240.0f;
arg_t up = (a * val * val * val * val * val) - (b * val * val * val) + val;
arg_t down = 1.0f + (c * val * val) + (d * val * val * val * val) + (e * val * val * val * val * val * val);
return up / down;
})
COMPLEX_1(float, sin_fast, float, x, {
/*
* Alternative implementation, used by Mesa3D
* - less instructions to calculate
* - has greater error (even for range -pi/2, pi/2)
* https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/compiler/nir/nir_opt_algebraic.py
* https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
*/
const arg_t B = 4.0 / M_PI;
const arg_t C = -4.0 / (M_PI * M_PI);
arg_t y = B * x + C * x * fabs(x);
// const float Q = 0.775;
const arg_t P = (arg_t)0.225;
y = P * (y * fabs(y) - y) + y; // Q * y + P * y * abs(y)
return y;
})
//COMPLEX_1(float, sin_taylor, float, x, {
/*
* Alternative implementation, used by Mesa3D VC4 driver
* - calculates the Taylor series
* -
* https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/gallium/drivers/vc4/vc4_program.c (function ntq_fsin)
*/
//})
/**
* Expected behavior:
*
* sin(+-0) = +-0
* sin(+-Inf) = NaN for |x| > 1
*/
COMPLEX_1(float, sin, float, val, {
/*
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
*
* We use argument reduction to bring it into range [-pi/2, pi/2] in which range the Pade approximation is accurate.
*/
// TODO normalization into [-pi/2, pi/2] is too inaccurate for large values! fmod too inaccurate (~8 ULP, should be 0)
// Since sine has a period of 2pi, these rewrites do not change the result (rounding error excluded):
// bring into range [-2pi, 2pi]
arg_t modifiedVal = fmod(val, 2.0f * M_PI_F);
// bring into range [-pi, pi]
modifiedVal = modifiedVal < -M_PI_F ? modifiedVal + (2.0f * M_PI_F) : modifiedVal;
modifiedVal = modifiedVal > M_PI_F ? modifiedVal - (2.0f * M_PI_F) : modifiedVal;
// We move by half a period, so we must invert the sign of the result
// bring into range [-pi/2, pi/2]
int_t invertSign = modifiedVal < -M_PI_2_F || modifiedVal > M_PI_2_F;
modifiedVal = modifiedVal < -M_PI_2_F ? modifiedVal + M_PI_F : modifiedVal;
modifiedVal = modifiedVal > M_PI_2_F ? modifiedVal - M_PI_F : modifiedVal;
arg_t result = sin_pade(modifiedVal);
return invertSign ? -result : result;
})
SIMPLE_2(float, sincos, float, x, __global float, *cosval, (*cosval = cos(x), sin(x)))
SIMPLE_2(float, sincos, float, x, __local float, *cosval, (*cosval = cos(x), sin(x)))
SIMPLE_2(float, sincos, float, x, __private float, *cosval, (*cosval = cos(x), sin(x)))
/**
* Expected behavior:
*
* sinh(+-0) = +-0
* sinh(+-Inf) = +-Inf
*/
COMPLEX_1(float, sinh, float, val, {
/*
* https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence
*
* This version (the Taylor-series up to x^17/17!) has following accuracy:
*
* sinh(PI) -> error of 2*10^-8 < ulp(PI) = 2*10^-7
* sinh(0.5f) -> error of < 10^-10 < ulp(0.5f) = 5*10^-8
* sinh(0.0f) -> error of < 10^-48 < ulp(0.0f) = 1*10^-45
*
* OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp
*/
// TODO inaccurate out of the range of -PI <= x <= PI
/*
* Alternatives:
* sinh(x) = sqrt((cosh(2x)-1)/2)
* sinh(x) = exp(x) - cosh(x)
*/
const result_t fac3 = 1.0f / factorial(3);
const result_t fac5 = 1.0f / factorial(5);
const result_t fac7 = 1.0f / factorial(7);
const result_t fac9 = 1.0f / factorial(9);
const result_t fac11 = 1.0f / factorial(11);
const result_t fac13 = 1.0f / factorial(13);
const result_t fac15 = 1.0f / factorial(15);
const result_t fac17 = 1.0f / factorial(17);
result_t x = val;
/// TODO optimize!!
/*
while(x < -M_PI_F)
{
x = x + 2 * M_PI_F;
}
while(x > M_PI_F)
{
x = x - 2 * M_PI_F;
}
*/
result_t x3 = x * x * x;
result_t tmp1 = x3 * fac3;
// sinh(x) = x + x^3/3! + ...
result_t tmp = x + tmp1;
result_t x5 = x3 * x * x;
tmp1 = x5 * fac5;
//... + x^5/5! + ...
tmp = x + tmp1;
result_t x7 = x5 * x * x;
tmp1 = x7 * fac7;
//... + x^7/7! + ...
tmp = tmp + tmp1;
result_t x9 = x7 * x * x;
tmp1 = x9 * fac9;
//... + x^9/9! + ...
tmp = tmp + tmp1;
result_t x11 = x9 * x * x;
tmp1 = x11 * fac11;
//... + x^11/11! + ...
tmp = tmp + tmp1;
result_t x13 = x11 * x * x;
tmp1 = x13 * fac13;
//... + x^13/13! + ...
tmp = tmp + tmp1;
result_t x15 = x13 * x * x;
tmp1 = x15 * fac15;
//... + x^15/15! + ...
tmp = tmp + tmp1;
result_t x17 = x15 * x * x;
tmp1 = x17 * fac17;
//... + x^17/17!
tmp = tmp + tmp1;
return tmp;
})
/**
* Expected behavior:
*
* sinpi(+-0) = +-0
* sinpi(n) = 0 for n integer and n > 0
* sinpi(n) = -0 for n integer and n < 0
* sinpi(+-Inf) = NaN
*/
SIMPLE_1(float, sinpi, float, val, sin(val *M_PI_F))
COMPLEX_1(float, sqrt, float, x, {
// Using Goldschmidt's algorithm with 2 iterations
// see http://www.informatik.uni-trier.de/Reports/TR-08-2004/rnc6_12_markstein.pdf (formula 9)
arg_t y = vc4cl_sfu_rsqrt(x);
arg_t g = x * y;
arg_t h = y * (arg_t)0.5f;
for(unsigned i = 0; i < 2; ++i) {
arg_t r = (arg_t)0.5f - g * h;
g = g + g * r;
h = h + h * r;
}
// sqrt(NaN) = NaN, sqrt(Inf) = Inf
result_t result = vc4cl_is_inf_nan(x) ? x : g;
// sqrt(+-0) = +-0
result = vc4cl_is_zero(x) ? x : result;
// sqrt(x) = NaN, for x < 0
return x < 0.0f ? (result_t)nan(0) : result;
})
/**
* Expected behavior:
*
* tan(+-0) = +-0
* tan(+-Inf) = NaN
*/
// TODO different algorithm?!
// Taylor Series is too inaccurate (converges too slow)
SIMPLE_1(float, tan, float, val, sin(val) / cos(val))
/**
* Expected behavior:
*
* tanh(+-0) = +-0
* tanh(+-Inf) = +-Inf
*/
// TODO different algorithm?!
// Taylor Series is too inaccurate (converges too slow)
SIMPLE_1(float, tanh, float, val, sinh(val) / cosh(val))
/**
* Expected behavior:
*
* tanpi(+-0) = +-0
* tanpi(+-Inf) = NaN
* tanpi(n) = copysign(0,n) for n even and n integer
* tanpi(n) = copysign(0, -n) for n odd and n integer
* tanpi(x) = +Inf for x = n + 0.5 and n even integer
* tanpi(x) = -Inf for x = n + 0.5 and n odd integer
*/
SIMPLE_1(float, tanpi, float, val, tan(val *M_PI_F))
/**
* Expected behavior:
*
* tgamma(+-0) = +-Inf
* tgamma(x) = NaN for x < 0 and x integer
* tgamma(-Inf) = NaN
* tgamma(+Inf) = +Inf
*/
// TODO different algorithm?! E.g. http://www.math.hkbu.edu.hk/support/aands/page_257.htm with argument reduction
// (previous page)
SIMPLE_1(float, tgamma, float, val, exp(lgamma(val)))
/**
* Expected behavior:
*
* trunc(+-0) = +-0
* trunc(+-Inf) = +-Inf
* trunc(x) = -0 for -1 < x < 0
*/
COMPLEX_1(float, trunc, float, val, {
//" Round to integer toward zero. "
// https://stackoverflow.com/questions/12279914/implement-ceil-in-c
// http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1
int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f;
// |val| < 2^23 fits into integer
result_t truncated = vc4cl_itof(vc4cl_ftoi(val));
// deciding here which value to return saves us up to two jumps
return tooBig ? val : truncated;
})
/*
* native functions
*/
SIMPLE_1(float, native_cos, float, val, cos(val))
SIMPLE_2(float, native_divide, float, x, float, y, x *native_recip(y))
// e^x = 2^(x * log2(e))
SIMPLE_1(float, native_exp, float, x, native_exp2(x *M_LOG2E_F))
SIMPLE_1(float, native_exp2, float, val, vc4cl_sfu_exp2(val))
// 10^x = 2^(x * log2(10))
SIMPLE_1(float, native_exp10, float, x, native_exp2(x *M_LOG210))
// log(x) = log2(x) / log2(e)
SIMPLE_1(float, native_log, float, x, native_log2(x) * (1.0f / M_LOG2E_F))
SIMPLE_1(float, native_log2, float, val, vc4cl_sfu_log2(val))
// log10(x) = log2(x) / log2(10)
SIMPLE_1(float, native_log10, float, x, native_log2(x) * (1.0f / M_LOG210))
//"Compute x to the power y, where x is >= 0."
// x^y = 2^(y * log2(x))
SIMPLE_2(float, native_powr, float, x, float, y, native_exp2(y *native_log2(x)))
SIMPLE_1(float, native_recip, float, val, vc4cl_sfu_recip(val))
SIMPLE_1(float, native_rsqrt, float, val, vc4cl_sfu_rsqrt(val))
SIMPLE_1(float, native_sin, float, val, sin(val))
SIMPLE_1(float, native_sqrt, float, val, val * vc4cl_sfu_rsqrt(val))
SIMPLE_1(float, native_tan, float, val, tan(val))
/*
* half functions, accuracy of 8192 ULP (11 MSB of the mantissa)
*/
SIMPLE_1(float, half_cos, float, val, cos(val))
SIMPLE_2(float, half_divide, float, x, float, y, x * vc4cl_sfu_recip(y))
SIMPLE_1(float, half_exp, float, val, native_exp(val))
COMPLEX_1(float, half_exp2, float, val, {
result_t result = vc4cl_sfu_exp2(val);
result = vc4cl_is_zero(val) ? (result_t)1.0f : result;
result = val < (result_t)-126.0f ? (result_t)0.0f : result;
result = val >= (result_t)128.0f ? (result_t)INFINITY : result;
return isnan(val) ? val : result;
})
SIMPLE_1(float, half_exp10, float, val, native_exp10(val))
SIMPLE_1(float, half_log, float, val, native_log(val))
COMPLEX_1(float, half_log2, float, val, {
/* log2(x) = log2(M * e^E) = E + log2(M) */
int_t bitcast = vc4cl_bitcast_int(val);
/* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */
int_t exponent = ((bitcast >> 23) & 0xFF) - 126;
/* mask off exponent and replace with exponent for range [0.5, 1) */
int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000;
arg_t mantissa = vc4cl_bitcast_float(tmp);
result_t logMantissa = vc4cl_sfu_log2(mantissa);
result_t result = vc4cl_itof(exponent) + logMantissa;
result = vc4cl_is_inf_nan(val) ? val : result;
result = signbit(val) ? (result_t)nan(0) : result;
result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result;
return result;
})
SIMPLE_1(float, half_log10, float, val, native_log10(val))
SIMPLE_2(float, half_powr, float, x, float, y, powr(x, y))
SIMPLE_1(float, half_recip, float, val, (arg_t)1.0f / val)
COMPLEX_1(float, half_rsqrt, float, val, {
arg_t x = vc4cl_sfu_rsqrt(val);
// rsqrt(+-0) = +-Inf
result_t result = vc4cl_is_zero(val) ? copysign((result_t)INFINITY, val) : x;
// rsqrt(NaN) = NaN
result = isnan(val) ? val : result;
// rsqrt(Inf) = 0
result = isinf(val) ? (result_t)0.0f : result;
// rsqrt(x) for x < 0 = -NaN
result = val < (result_t)0.0f ? copysign((result_t)nan(0), -1.0f) : result;
return result;
})
SIMPLE_1(float, half_sin, float, val, sin(val))
COMPLEX_1(float, half_sqrt, float, val, {
arg_t x = val * vc4cl_sfu_rsqrt(val);
// sqrt(NaN) = NaN, sqrt(Inf) = Inf
result_t result = vc4cl_is_inf_nan(val) ? val : x;
// sqrt(+-0) = +-0
result = vc4cl_is_zero(val) ? val : result;
// sqrt(x) = NaN, for x < 0
return val < 0.0f ? (result_t)nan(0) : result;
})
SIMPLE_1(float, half_tan, float, val, tan(val))
#endif /* VC4CL_MATH_H */