/* * Author: doe300 * * See the file "LICENSE" for the full license governing this code. */ #ifndef VC4CL_MATH_H #define VC4CL_MATH_H #include "_common.h" #include "_intrinsics.h" #include "_float_float.h" // TODO test-cases for all the known Edge Case Behavior // TODO can remove nan-check from rounding functions? The large value check should also contain NaNs, // since VC4 considers NaNs to be the "largest" values // TODO recheck NaN handling for functions based on multiplication/division once they handle NaNs correctly, // maybe we can remove some of those extra codes /* * Remove some macros for more efficient versions */ #undef rsqrt // via own function #undef ceil #undef fabs #undef floor #undef fma #undef rint #undef round #undef trunc #undef half_cos #undef half_divide #undef half_exp #undef half_exp2 #undef half_exp10 #undef half_log #undef half_log2 #undef half_log10 #undef half_powr #undef half_recip #undef half_rsqrt #undef half_sin #undef half_sqrt #undef half_tan #undef native_cos #undef native_divide // SFU RECIP #undef native_exp // via SFU EXP2 #undef native_exp2 // SFU EXP2 #undef native_exp10 // via SFU EXP2 #undef native_log // via SFU LOG2 #undef native_log2 // SFU LOG2 #undef native_log10 // via SFU LOG2 #undef native_powr // via SFU_EXP2 #undef native_recip // SFU RECIP #undef native_rsqrt // SFU RSQRT #undef native_sin #undef native_sqrt // via SFU RECIP and SFU RSQRT #undef native_tan // vc4cl_split(double) of M_LN2 #define M_LN2_FF 0xB102E3083F317218 // 1 / log10(e) #define M_1_LOG10_E_F 2.30258512496948242188f INLINE int factorial(int n) CONST { return (n == 1 || n == 0) ? 1 : factorial(n - 1) * n; } COMPLEX_3_SCALAR(float, fast_pown, float, val, uint, n, uchar, width, { // reference: http://dimacs.rutgers.edu/archive/Workshops/Security/program2/quisquater/node3.html (method I) result_t result = 1.0f; result_t tmp = val; for(uchar i = 0; i < width; ++i) { // if n[i] == 1 then z = z * y result *= ((n >> i) & 1) ? tmp : 1.0f; // y = y * y tmp = tmp * tmp; } return result; }) COMPLEX_1(float, vc4cl_pow2, int, val, { // y = 2^x = 1.0 [implied] * 2^(x + offset) int_t tmp = val << 23; // alternative: tmp = (val + 127) << 23; tmp += (int_t) 0x3F800000; return vc4cl_bitcast_float(tmp & (int_t) 0x7F800000); }) /** * Expected behavior: * * acos(1) = +-0 * acos(x) = NaN for |x| > 1 */ COMPLEX_1(float, acos, float, val, { result_t result = M_PI_2_F - asin(val); result = fabs(val) > (result_t)1.0f ? nan(0) : result; return isnan(val) ? val : result; }) /** * Expected behavior: * * acosh(1) = +0 * acosh(x) = NaN for x < 1 * acosh(+Inf) = +Inf */ // https://en.wikipedia.org/wiki/Inverse_hyperbolic_functions#Series_expansions // or: https://en.wikipedia.org/wiki/Hyperbolic_function#Inverse_functions_as_logarithms // XXX optimize?! SIMPLE_1(float, acosh, float, val, log(val + sqrt(val * val + 1))) /** * Expected behavior: * * acospi(1) = +-0 * acospi(x) = NaN for |x| > 1 */ COMPLEX_1(float, acospi, float, val, { result_t result = acos(val) * M_1_PI_F; result = fabs(val) > (result_t)1.0f ? nan(0) : result; return isnan(val) ? val : result; }) /** * Expected behavior: * * asin(+-0) = +-0 * asin(x) = NaN for |x| > 1 */ COMPLEX_1(float, asin, float, val, { result_t result = atan(val *rsqrt(1 - val * val)); result = fabs(val) > (result_t)1.0f ? nan(0) : result; return isnan(val) ? val : result; }) /** * Expected behavior: * * asinh(+-0) = +-0 * asinh(+-Inf) = +-Inf */ // XXX optimize?! SIMPLE_1(float, asinh, float, val, log(val + sqrt(val * val - 1))) /** * Expected behavior: * * asinpi(+-0) = +-0 * asinpi(x) = NaN for |x| > 1 */ COMPLEX_1(float, asinpi, float, val, { result_t result = asin(val) * M_1_PI_F; result = fabs(val) > (result_t)1.0f ? nan(0) : result; return isnan(val) ? val : result; }) // https://stackoverflow.com/questions/23047978/how-is-arctan-implemented // https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence (too inaccurate!) // http://www2.mae.ufl.edu/~uhk/ARCTAN-APPROX-PAPER.pdf (atan(x) = 1/x * F(1/x)) // http://www.jjj.de/fxt/fxtbook.pdf /** * Expected behavior: * * atan(+-0) = +-0 * atan(+-Inf) = +-pi/2 * * Argument reduction: * 1) atan(-x) = -atan(x) * 2) atan(1/x) = PI/2 - atan(x), x > 0 * 3) atan(1/x) = -PI/2 - atan(x), x < 0 * 4) atan(x) = 2 atan(x / (1 + sqrt(1 + x^2))) * * Matters Computational (section 32.1.2.3): Pade Approximation (6 steps) * Has an error of ~ 0 for |x| <= 1/8 * PI: * (1155∙x + 1190∙x^3 + 231∙x^5)/(1155 + 1575∙x^2+525∙x^4+25∙x^6) * * Alternatively could be calculated via: * atan(x) = atan2(x, 1) */ COMPLEX_1(float, atan, float, val, { // reduce arguments a few times (r = r/(1+sqrt(1+r^2))) uint n = 2; result_t r = val; for(uint i = 0; i < n; ++i) r = r / (1 + sqrt(1 + r * r)); // compute the pade approximation result_t approx = (1155 * r + 1190 * r * r * r + 231 * r * r * r * r * r) / (1155 + 1575 * r * r + 525 * r * r * r * r + 25 * r * r * r * r * r * r); // calculate the result (2^n * approx) result_t result = ((result_t)(2 << (n - 1))) * approx; // atan(x) with x >= 2^26 ~ atan(Inf) => pi/2 result = (fabs(val) >= (result_t)0x1p+26f) ? copysign(M_PI_2_F, val) : result; // atan(+-NaN) = +-NaN return isnan(val) ? val : result; }) /** * https://en.wikipedia.org/wiki/Atan2#Definition_and_computation * * Expected behavior: * * atan2(+-0, -0) = +-pi * atan2(+-0, +0) = +-0 * atan2(+-0, x) = +-pi for x < 0 * atan2(+-0, x) = +-0 for x > 0 * atan2(y, +-0) = -pi/2 for y < 0 * atan2(y, +-0) = pi/2 for y > 0 * atan2(+-y, -Inf) = +-pi for y > 0 and y != Inf * atan2(+-y, +Inf) = +-0 for y > 0 and y != Inf * atan2(+-Inf, x) = +-pi/2 for x != Inf * atan2(+-Inf, -Inf) = +-3/4pi * atan2(+-Inf, +Inf) = +-pi/4 */ COMPLEX_2(float, atan2, float, y, float, x, { // note: these calculation work since the condition is either 0 or all bits set result_t tan = atan(y / x); result_t s = sign(y); int_t xG0 = x > 0; int_t xL0 = x < 0; int_t yEq0 = y == 0; // arctan(y / x) if x > 0 int_t res0 = xG0 & vc4cl_bitcast_int(tan); // arctan(y / x) + sign(y) * pi if x < 0, y != 0 int_t res1 = xL0 & ~yEq0 & vc4cl_bitcast_int(tan + s * M_PI_F); // pi if x < 0, y = 0 int_t res2 = xL0 & yEq0 & vc4cl_bitcast_int(M_PI_F); // sign(y) * pi/2 if x = 0, y != 0 int_t res3 = (x == 0) & ~yEq0 & vc4cl_bitcast_int(s * M_PI_2_F); // undefined if x = 0, y = 0 return vc4cl_bitcast_float(res0 | res1 | res2 | res3); }) /** * Expected behavior: * * atanh(+-0) = +-0 * atanh(+-1) = +-Inf * atanh(x) = NaN for |x| > 1 */ SIMPLE_1(float, atanh, float, val, ((result_t) 0.5f) * log((1 + val) / (1 - val))) /** * Expected behavior: * * atanpi(+-0) = +-0 * atanpi(+-Inf) = +-0.5 */ SIMPLE_1(float, atanpi, float, val, isnan(val) ? val : atan(val) * M_1_PI_F) /** * Expected behavior: * * atan2pi(+-0, -0) = +-1 * atan2pi(+-0, +0) = +-0 * atan2pi(+-0, x) = +-1 for x < 0 * atan2pi(+-0, x) = +-0 for x > 0 * atan2pi(y, +-0) = -0.5 for y < 0 * atan2pi(y, +-0) = 0.5 for y > 0 * atan2pi(+-y, -Inf) = +-1 for y > 0 * atan2pi(+-y, +Inf) = +-0 for y > 0 * atan2pi(+-Inf, x) = +-0.5 * atan2pi(+-Inf, -Inf) = +-0.75 * atan2pi(+-Inf, +Inf) = +-0.25 */ SIMPLE_2(float, atan2pi, float, x, float, y, atan2(x, y) * M_1_PI_F) /** * Expected behavior: * * cbrt(+-0) = +-0 * cbrt(+-Inf) = +-Inf */ // TODO different algorithm? // e.g. http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt (acbrt1 with a few more Newton steps, but requires several // floating-point divisions) SIMPLE_1(float, cbrt, float, val, pow(val, 1.0f / 3.0f)) /** * Expected behavior: * * ceil(+-0) = +-0 * ceil(+-Inf) = +-Inf * ceil(x) = -0 for -1 < x < 0 */ COMPLEX_1(float, ceil, float, val, { //"Round to integer toward + infinity. " // https://stackoverflow.com/questions/12279914/implement-ceil-in-c // http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1 // all floating point values |val| > 2^23 are exact numbers int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f; // |val| < 2^23 fits into integer result_t truncated = vc4cl_itof(vc4cl_ftoi(val)); // if the truncation is smaller than val (positive numbers), add 1 to round up result_t ceiling = truncated + (truncated < val ? (result_t)1.0f: (result_t)0.0f); // fix for ceil([-0.9, -0.0]) = -0.0 result_t result = copysign(ceiling, val); // fix for negative subnormal values result = result < val ? result + (result_t) 1.0f : result; // deciding here which value to return saves us up to two jumps return tooBig ? val : result; }) //"Returns x with its sign changed to match the sign of y. " // sign = y & 0x80000000 // tmp = x & 0x7FFFFFFF // return tmp | sign SIMPLE_2(float, copysign, float, x, float, y, vc4cl_bitcast_float((vc4cl_bitcast_uint(y) & 0x80000000) | (vc4cl_bitcast_uint(x) & 0x7FFFFFFF))) COMPLEX_1(float, cos_pade, float, val, { /* * Pade approximation * (https://ir.canterbury.ac.nz/bitstream/handle/10092/8886/brookes_thesis.pdf?sequence=1, page 86) * * Has a relative error of far less than 2^23 (1 ULP) for [-PI/2, PI/2], then rises fast * * Alternative: * (see alternative for sin_pade) */ arg_t a = 45469.0f * val * val * val * val * val * val * val * val; arg_t b = 7029024.0f * val * val * val * val * val * val; arg_t c = 348731040.0f * val * val * val * val; arg_t d = 5269904640.0f * val * val; arg_t e = 10983772800.0f; arg_t f = 9336.0f * val * val * val * val * val * val; arg_t g = 2064720.0f * val * val * val * val; arg_t h = 221981760.0f * val * val; arg_t up = a - b + c - d + e; arg_t down = f + g + h + e; return up / down; }) /** * Expected behavior: * * cos(+-0) = 1 * cos(+-Inf) = NaN for |x| > 1 */ COMPLEX_1(float, cos, float, val, { /* * OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp * * We use argument reduction to bring it into range [-pi/2, pi/2] in which range the Pade approximation is accurate. */ // TODO normalization into [-pi/2, pi/2] is too inaccurate for large values! // Since cosine has a period of 2pi, these rewrites do not change the result (rounding error excluded): // bring into range [-2pi, 2pi] arg_t modifiedVal = fmod(val, 2.0f * M_PI_F); // bring into range [-pi, pi] modifiedVal = modifiedVal < -M_PI_F ? modifiedVal + (2.0f * M_PI_F) : modifiedVal; modifiedVal = modifiedVal > M_PI_F ? modifiedVal - (2.0f * M_PI_F) : modifiedVal; // We move by half a period, so we must invert the sign of the result // bring into range [-pi/2, pi/2] int_t invertSign = modifiedVal < -M_PI_2_F || modifiedVal > M_PI_2_F; modifiedVal = modifiedVal < -M_PI_2_F ? modifiedVal + M_PI_F : modifiedVal; modifiedVal = modifiedVal > M_PI_2_F ? modifiedVal - M_PI_F : modifiedVal; arg_t result = cos_pade(invertSign ? -modifiedVal : modifiedVal); result = invertSign ? -result : result; result = isinf(val) ? copysign(nan(0), val) : result; return isnan(val) ? val : result; }) /** * Expected behavior: * * cosh(+-0) = 1 * cosh(+-Inf) = +-Inf */ COMPLEX_1(float, cosh, float, val, { /* * https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence * * This version (the Taylor-series up to x^16/16!) has following accuracy: * * cosh(PI) -> error of 1.5*10^-7 < ulp(PI) = 2*10^-7 * cosh(0.5f) -> error of < 10^-9 < ulp(0.5f) = 5*10^-8 * cosh(0.0f) -> error of < 10^-46 < ulp(0.0f) = 1*10^-45 * * OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp */ const result_t fac2 = 1.0f / factorial(2); const result_t fac4 = 1.0f / factorial(4); const result_t fac6 = 1.0f / factorial(6); const result_t fac8 = 1.0f / factorial(8); const result_t fac10 = 1.0f / factorial(10); const result_t fac12 = 1.0f / factorial(12); const result_t fac14 = 1.0f / factorial(14); const result_t fac16 = 1.0f / factorial(16); /* * Argument reduction * * Matters Computational, section 32.2.3 */ const uint n = 12; // r = x / 2^n const result_t inv2N = 1.0f / (1 << n); result_t r = val * inv2N; // C = cosh(r) - 1 result_t x = r; result_t x2 = x * x; result_t tmp1 = x2 * fac2; // cosh(x) = 1 + x^2/2! + ... result_t tmp = 1 + tmp1; result_t x4 = x2 * x * x; tmp1 = x4 * fac4; //... + x^4/4! + ... tmp = x + tmp1; result_t x6 = x4 * x * x; tmp1 = x6 * fac6; //... + x^6/6! + ... tmp = tmp + tmp1; result_t x8 = x6 * x * x; tmp1 = x8 + fac8; //... + x^8/8! + ... tmp = tmp + tmp1; result_t x10 = x8 * x * x; tmp1 = x10 * fac10; //... + x^10/10! + ... tmp = tmp + tmp1; result_t x12 = x10 * x * x; tmp1 = x12 + fac12; //... + x^12/12! + ... tmp = tmp + tmp1; result_t x14 = x12 * x * x; tmp1 = x14 * fac14; //... + x^14/14! + ... tmp = tmp + tmp1; result_t x16 = x14 * x * x; tmp1 = x16 + fac16; //... + x^16/16! tmp = tmp + tmp1; result_t C = tmp - 1; for(uint i = 0; i < n; ++i) C = 2 * (C + 1) * (C + 1) - 2; return C + 1; }) /** * Expected behavior: * * cospi(+-0) = 1 * cospi(x) = +0 for x = n + 0.5 and n is integer * cospi(+-Inf) = NaN */ SIMPLE_1(float, cospi, float, val, cos(val *M_PI_F)) /** * Expected behavior: * * erfc(-Inf) = 2 * erfc(+Inf) = 0 */ SIMPLE_1(float, erfc, float, x, 1 - erf(x)) /** * Expected behavior: * * erf(+-0) = +-0 * erf(+-Inf) = +-1 */ COMPLEX_1(float, erf, float, x, { /* * https://en.wikipedia.org/wiki/Error_function#Numerical_approximations * * has a maximal error of 1.2 * 10^-7 * * alternative: http://mathworld.wolfram.com/Erf.html */ const result_t t = 1 / (1 + 0.5f * fabs(x)); const result_t r = t * exp(-x * x - 1.26551223f + 1.00002368f * t + 0.37409196f * t * t + 0.09678418f * t * t * t - 0.18628806f * t * t * t * t + 0.27886807f * t * t * t * t * t - 1.13520398f * t * t * t * t * t * t + 1.48851587f * t * t * t * t * t * t * t - 0.82215223f * t * t * t * t * t * t * t * t + 0.17087277f * t * t * t * t * t * t * t * t * t); return x < 0 ? r - 1 : 1 - r; }) /** * Expected behavior: * * exp(+-0) = 1 * exp(-Inf) = 0 * exp(+Inf) = +Inf * * Chebyshev interpolation with monomial basis and range reduction, * * https://www.pseudorandom.com/implementing-exp#section-18 */ COMPLEX_1(float, exp, float, val, { arg_t positive = fabs(val); // range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)] // Use floor() instead of ceil() to not reach Inf later on for e^(~10^38) int_t k = vc4cl_ftoi((positive / M_LN2_F) - 0.5f); arg_t kFloat = vc4cl_itof(k); // calculate in extended precision to reduce rounding error arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF))); arg_t pn = 1.143364767943110e-11; pn = pn * r + 1.632461784798319e-10f; pn = pn * r + 2.088459690899721e-9f; pn = pn * r + 2.504861486483735e-8f; pn = pn * r + 2.755715675968011e-7f; pn = pn * r + 2.755734045527853e-6f; pn = pn * r + 2.480158866546844e-5f; pn = pn * r + 1.984126978734782e-4f; pn = pn * r + 0.001388888888388f; pn = pn * r + 0.008333333333342f; pn = pn * r + 0.041666666666727f; pn = pn * r + 0.166666666666680f; pn = pn * r + 0.500000000000002f; pn = pn * r + 1.000000000000000f; pn = pn * r + 1.000000000000000f; pn = pn * vc4cl_pow2(k); result_t result = val < 0 ? 1 / pn : pn; result = vc4cl_is_zero(val) ? (result_t)1.0f : result; result = val <= (result_t)-88.0f ? (result_t)0.0f : result; result = val >= (result_t)89.0f ? (result_t)INFINITY : result; return isnan(val) ? val : result; }) /** * Expected behavior: * * exp2(+-0) = 1 * exp2(-Inf) = 0 * exp2(+Inf) = +Inf */ COMPLEX_1(float, exp2, float, val, { // 2^x = e^(x * ln(2)) result_t result = exp(val * M_LN2_F); return isnan(val) ? val : result; }) /** * Expected behavior: * * exp10(+-0) = 1 * exp10(-Inf) = 0 * exp10(+Inf) = +Inf */ COMPLEX_1(float, exp10, float, val, { // 10^x = e^(x / log10(e)) result_t result = exp(val * M_1_LOG10_E_F); return isnan(val) ? val : result; }) /** * Expected behavior: * * expm1(+-0) = +-0 * expm1(-Inf) = -1 * expm1(+Inf) = +Inf */ // e^x - 1.0 COMPLEX_1(float, expm1, float, val, { result_t result = exp(val) - 1.0f; return isnan(val) ? val : result; }) /** * Expected behavior: * * fabs(+-0) = +0 * fabs(+-Inf) = +Inf */ SIMPLE_1(float, fabs, float, val, vc4cl_fmaxabs(val, 0.0f)) /** * Expected behavior: * * fdim(x, NaN) = NaN * fdim(NaN, y) = NaN */ //"x-y if x>y, +0 if x is less than or equal to y." COMPLEX_2(float, fdim, float, x, float, y, { result_t result = x > y ? x - y : 0.0f; result = isnan(x) ? x : result; return isnan(y) ? y : result; }) /** * Expected behavior: * * floor(+-0) = +-0 * floor(+-Inf) = +-Inf */ COMPLEX_1(float, floor, float, val, { //" Round to integer toward negative infinity. " // https://stackoverflow.com/questions/12279914/implement-ceil-in-c // http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1 // all floating point values |val| > 2^23 are exact numbers int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f; // |val| < 2^23 fits into integer result_t truncated = vc4cl_itof(vc4cl_ftoi(val)); // if the truncation is greater than val (negative numbers), subtract 1 to round down result_t floor_val = truncated - (truncated > val ? (result_t)1.0f : (result_t)0.0f); // deciding here which value to return saves us up to two jumps return tooBig ? val : floor_val; }) /** * Expected behavior: * * fma(x, y, z) = xy + z * fma(x, y, z) = NaN for x = 0 or y = 0 * fma(x, y, z)= NaN for x = Inf, y = Inf, z = Inf */ // TODO not "infinitely precise product" and maybe not "correctly rounded" SIMPLE_3(float, fma, float, a, float, b, float, c, (a * b) + c) // The fmax/min ALU operations (vc4cl_fmax/vc4cl_fmin) handle NaNs as bigger then all, we need to handle them as: // " If one argument is a NaN, fmin/fmax() returns the other argument. If both arguments are NaNs, fmin/fmax() returns a NaN." SIMPLE_2(float, fmax, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmax(x, y)) SIMPLE_2_SCALAR(float, fmax, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmax(x, y)) SIMPLE_2(float, fmin, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmin(x, y)) SIMPLE_2_SCALAR(float, fmin, float, x, float, y, isnan(x) ? y : isnan(y) ? x : vc4cl_fmin(x, y)) /** * Expected behavior: * * fmod(+-0, y) = +-0 for y != 0 * fmod(x, y) = NaN for x = +-Inf or y = 0 * fmod(x, +-Inf) = x * fmod(+-0, NaN) = NaN */ //"Modulus. Returns x - y * trunc(x/y)" COMPLEX_2(float, fmod, float, x, float, y, { result_t result = x - y * trunc(x / y); result = (isinf(x) || sign(y) == (result_t)0.0f) ? (result_t)nan(0) : result; result = (isinf(y) || isnan(x)) ? x : result; return isnan(y) ? y : result; }) /** * Expected behavior: * * 0 <= fract(x, iptr) < 1 * fract(+0, iptr) = +0, iptr = +0 * fract(-0, iptr) = -0, iptr = -0 * fract(+Inf, iptr) = +0, iptr = +Inf * fract(-Inf, iptr) = -0, iptr = -Inf * fract(NaN, iptr) = NaN, iptr = NaN */ COMPLEX_2(float, fract, float, val, __global float, *iptr, { //"Returns fmin( x – floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr." result_t tmp = floor(val); *iptr = tmp; tmp = fmin(val - tmp, 0x1.fffffep-1f); return isnan(val) ? val : tmp; }) COMPLEX_2(float, fract, float, val, __local float, *iptr, { //"Returns fmin( x – floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr." result_t tmp = floor(val); *iptr = tmp; tmp = fmin(val - tmp, 0x1.fffffep-1f); return isnan(val) ? val : tmp; }) COMPLEX_2(float, fract, float, val, __private float, *iptr, { //"Returns fmin( x – floor (x), 0x1.fffffep-1f ). floor(x) is returned in iptr." result_t tmp = floor(val); *iptr = tmp; tmp = fmin(val - tmp, 0x1.fffffep-1f); return isnan(val) ? val : tmp; }) /** * Expected behavior: * * frexp(+-0, exp) = +-0, exp = 0 * frexp(+-Inf, exp) = +-Inf, exp = 0 * frexp(NaN, exp) = NaN, exp = 0 */ COMPLEX_2(float, frexp, float, x, __global int, *exp, { // Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF; int_t exponent = ((absBits >> 23) & 0xFF) - 126; // we need to support subnormals here, since they are only used as input, not output int_t mantissa = absBits & (int_t) 0x007FFFFF; int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23)); // the next lines are required to fix-up the mantissa for our added subnormal exponent int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset); mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0); exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = mantissa | (int_t)0x3F000000; int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x); *exp = specialCase ? (int_t)0 : exponent; return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x); }) COMPLEX_2(float, frexp, float, x, __local int, *exp, { // Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF; int_t exponent = ((absBits >> 23) & 0xFF) - 126; // we need to support subnormals here, since they are only used as input, not output int_t mantissa = absBits & (int_t) 0x007FFFFF; int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23)); // the next lines are required to fix-up the mantissa for our added subnormal exponent int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset); mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0); exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = mantissa | (int_t)0x3F000000; int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x); *exp = specialCase ? (int_t)0 : exponent; return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x); }) COMPLEX_2(float, frexp, float, x, __private int, *exp, { // Adapted from https://git.musl-libc.org/cgit/musl/tree/src/math/frexpf.c int_t absBits = vc4cl_bitcast_int(x) & (int_t)0x7FFFFFFF; int_t exponent = ((absBits >> 23) & 0xFF) - 126; // we need to support subnormals here, since they are only used as input, not output int_t mantissa = absBits & (int_t) 0x007FFFFF; int_t subnormalExpOffset = (vc4cl_clz(mantissa) - (32 - 23)); // the next lines are required to fix-up the mantissa for our added subnormal exponent int_t subnormalExpOffsetLog2 = 33 - vc4cl_clz(subnormalExpOffset); mantissa = mantissa << (exponent == -126 ? subnormalExpOffsetLog2 : 0); exponent = exponent == -126 ? -subnormalExpOffset - 126 : exponent; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = mantissa | (int_t)0x3F000000; int_t specialCase = vc4cl_is_zero(x) || vc4cl_is_inf_nan(x); *exp = specialCase ? (int_t)0 : exponent; return specialCase ? x : copysign(vc4cl_bitcast_float(tmp), x); }) /** * Expected behavior: * * hypot(x, y) = hypot(y, x) = hypot(x, -y) * hypot(x, +-0) = fabs(x) * hypot(+-Inf, y) = +Inf */ SIMPLE_2(float, hypot, float, x, float, y, sqrt(x *x + y * y)) /** * Expected behavior (C99 standard): * * ilogb(0) = FP_ILOGB0 * ilogb(+-Inf) = INT_MAX * ilogb(+-NaN) = FP_ILOGBNAN */ COMPLEX_1(int, ilogb, float, x, { //"Return the exponent as an integer value." // https://en.wikipedia.org/wiki/Single-precision_floating-point_format result_t bitcast = vc4cl_bitcast_int(x); // deduct exponent offset int_t result = ((bitcast >> 23) & 0xFF) - 127; // we need to support subnormals here, since they are only used as input, not output int_t mantissa = bitcast & (int_t) 0x007FFFFF; result = result == -127 ? -(vc4cl_clz(mantissa) - (32 - 23)) - 127 : result; result = vc4cl_is_zero(x) ? (result_t)FP_ILOGB0 : result; result = isinf(x) ? (result_t)INT_MAX : result; return isnan(x) ? (result_t)FP_ILOGBNAN : result; }) /** * Expected behavior: * * ldexp(+-0, n) = +-0 * ldexp(x, 0) = x * ldexp(+-Inf, n) = +-Inf */ //"Multiply x by 2 to the power k." COMPLEX_2(float, ldexp, float, x, int, k, { // Taken from MESA (https://gitlab.freedesktop.org/mesa/mesa/-/blob/master/src/compiler/nir/nir_opt_algebraic.py) // Alternatively in https://git.musl-libc.org/cgit/musl/tree/src/math/scalbnf.c int_t exp = min(max(k, -254), 254); int_t pow2_1 = ((exp >> 1) + 127) << 23; int_t pow2_2 = ((exp - (exp >> 1)) + 127) << 23; result_t result = x * vc4cl_bitcast_float(pow2_1) * vc4cl_bitcast_float(pow2_2); return vc4cl_is_inf_nan(x) ? x : result; }) SIMPLE_2_SCALAR(float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof(1 << k) : 1.0f / vc4cl_itof(1 << -k))) /** * Expected behavior: * * lgamma(1) = 0 * lgamma(2) = 0 * lgamma(x) = +Inf for x <= 0 and x integer * lgamma(-Inf) = +Inf * lgamma(+Inf) = +Inf */ COMPLEX_1(float, lgamma, float, val, { //"Returns the natural logarithm of the absolute value of the gamma function" // see Numerical Recipes, chapter 6.1 // has error of < 2 * 10^-10 // XXX only for val > 0 // TODO far too inaccurate/wrong? result_t x; result_t y; result_t tmp; result_t ser; y = x = val; tmp = x + 5.5f; tmp -= (x + 0.5f) * log(tmp); ser = 1.000000000190015f; y += 1; ser += 76.18009172947146f / y; y += 1; ser += -86.50532032941677f / y; y += 1; ser += 24.01409824083091f / y; y += 1; ser += -1.231739572450155f / y; y += 1; ser += 0.1208650973866179e-2f / y; y += 1; ser += -0.5395239384953e-5f / y; return -tmp + log((2.5066282746310005f) * ser / x); }) /** * Expected behavior: * * lgamma_r(x, signp) -> signp = 0 for x < 0 */ COMPLEX_2(float, lgamma_r, float, x, __global int, *signp, { // TODO better way? result_t tmp = lgamma(x); *signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1; return lgamma(x < 0.0f ? (-1.0f * x) : x); }) COMPLEX_2(float, lgamma_r, float, x, __local int, *signp, { // TODO better way? result_t tmp = lgamma(x); *signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1; return lgamma(x < 0.0f ? (-1.0f * x) : x); }) COMPLEX_2(float, lgamma_r, float, x, __private int, *signp, { // TODO better way? result_t tmp = lgamma(x); *signp = tmp < 0.0f ? -1 : (tmp == 0.0f) ? 0 : 1; return lgamma(x < 0.0f ? (-1.0f * x) : x); }) COMPLEX_1(float, log1p_pade, float, val, { /* * Calculates ln(1 + x) via Pade approximation * (https://ir.canterbury.ac.nz/bitstream/handle/10092/8886/brookes_thesis.pdf?sequence=1, page 72/96) * * Has a relative error of less than 2*2^23 (2 ULP) for range [-0.5, 2], then rises fast */ // g(x) = (49x^6+1218x^5+7980x^4+20720x^3+23100x^2+9240x)/(10x^6+420x^5+4200x^4+16800x^3+31500x^2+27720x+9240) arg_t a = 49.0f * val * val * val * val * val * val; arg_t b = 1218.0f * val * val * val * val * val; arg_t c = 7980.0f * val * val * val * val; arg_t d = 20720.0f * val * val * val; arg_t e = 23100.0f * val * val; arg_t f = 9240.0f * val; arg_t g = 10.0f * val * val * val * val * val * val; arg_t h = 420.0f * val * val * val * val * val; arg_t i = 4200.0f * val * val * val * val; arg_t k = 16800.0f * val * val * val; arg_t l = 31500.0f * val * val; arg_t m = 27720.0f * val; arg_t n = 9240.0f; arg_t up = a + b + c + d + e + f; arg_t down = g + h + i + k + l + m + n; return up / down; }) /** * Expected behavior: * * log2(+-0) = -Inf * log2(1) = 0 * log2(x) = Nan for x < 0 * log2(+Inf) = +Inf */ COMPLEX_1(float, log2, float, val, { /* log2(x) = log2(M * e^E) = E + log2(M) */ int_t bitcast = vc4cl_bitcast_int(val); /* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */ int_t exponent = ((bitcast >> 23) & 0xFF) - 126; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000; arg_t mantissa = vc4cl_bitcast_float(tmp); /* * Alternatively could use SFU and Newton-Raphson steps: * f(x) = log2(x) - y * root: f(x) = 0 -> log2(x) = y * f'(x) = 1/(x * ln(2)) * xn+1 = xn - (log2(xn) - y) / (1/(xn * ln(2))) * xn+1 = xn - xn * ln(2) * (log2(x) - y) * => but don't know y which is exact log2(x)!!! * * Alternatively could approximate root of: f(y) = 2^y - x, need exact exp2(x) */ // result_t logMantissa = vc4cl_sfu_log2(mantissa); // log2(M*2^E) = E + log2(M) = E + ln(M) / ln(2) = E + ln1p(M - 1) / ln(2) result_t logMantissa = log1p_pade(mantissa - 1.0f) / M_LN2_F; result_t result = vc4cl_itof(exponent) + logMantissa; result = vc4cl_is_inf_nan(val) ? val : result; result = signbit(val) ? (result_t)nan(0) : result; result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result; return result; }) /** * Expected behavior: * * log(+-0) = -Inf * log(1) = 0 * log(x) = Nan for x < 0 * log(+Inf) = +Inf */ COMPLEX_1(float, log, float, val, { /* * Other sources/calculations: * - Matters computational (sections 32.1.1 and 32.1.3) * - https://math.stackexchange.com/questions/977586/is-there-an-approximation-to-the-natural-log-function-at-large-values * - https://math.stackexchange.com/questions/1382070/iterative-calculation-of-log-x * - https://evoq-eval.siam.org/Portals/0/Publications/SIURO/Vol7/New_Method_for_Approximating_Logarithms.pdf?ver=2018-04-06-151916-493 * - https://cs.stackexchange.com/questions/91185/compute-ex-given-starting-approximation * - https://shoueiko.wordpress.com/2017/05/01/approximation-of-log-2/ * - https://math.stackexchange.com/questions/3381629/what-is-the-fastest-algorithm-for-finding-the-natural-logarithm-of-a-big-number * - https://stackoverflow.com/questions/39821367/very-fast-approximate-logarithm-natural-log-function-in-c */ /* log(M * 2^E) = log(M) + E log(2) */ int_t bitcast = vc4cl_bitcast_int(val); /* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */ int_t exponent = ((bitcast >> 23) & 0xFF) - 126; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000; arg_t mantissa = vc4cl_bitcast_float(tmp); result_t logMantissa = log1p_pade(mantissa - 1.0f); result_t result = vc4cl_itof(exponent) * M_LN2_F + logMantissa; result = vc4cl_is_inf_nan(val) ? val : result; result = signbit(val) ? (result_t)nan(0) : result; result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result; return result; }) /** * Expected behavior: * * log10(+-0) = -Inf * log10(1) = 0 * log10(x) = Nan for x < 0 * log10(+Inf) = +Inf */ COMPLEX_1(float, log10, float, val, { /* log10(M * 2^E) = log10(M) + E log10(2) */ int_t bitcast = vc4cl_bitcast_int(val); /* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */ int_t exponent = ((bitcast >> 23) & 0xFF) - 126; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000; arg_t mantissa = vc4cl_bitcast_float(tmp); /* log10(M) = log(M) / log(10) */ result_t logMantissa = log1p_pade(mantissa - 1.0f) / M_LN10_F; const arg_t log2_10 = M_LN2_F / M_LN10_F; result_t result = vc4cl_itof(exponent) * log2_10 + logMantissa; result = vc4cl_is_inf_nan(val) ? val : result; result = signbit(val) ? (result_t)nan(0) : result; result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result; return result; }) /** * Expected behavior: * * log1p(+-0) = +-0 * log1p(-1) = -Inf * log1p(x) = Nan for x < -1 * log1p(+Inf) = +Inf */ //TODO directly use log1p_pade(), needs argument reduction COMPLEX_1(float, log1p, float, val, { result_t result = log(1.0f + val); result = vc4cl_is_zero(val) ? val : result; result = val == (arg_t)-1.0f ? (result_t)-INFINITY : result; result = val < (arg_t)-1.0f ? (result_t)nan(0) : result; result = vc4cl_is_inf_nan(val) ? val : result; return result; }) /** * Expected behavior: * * logb(+-0) = -Inf * logb(+-Inf) = +Inf */ COMPLEX_1(float, logb, float, x, { int_t bitcast = vc4cl_bitcast_int(x); // deduct exponent offset int_t exponent = ((bitcast >> 23) & 0xFF) - 127; // we need to support subnormals here, since they are only used as input, not output int_t mantissa = bitcast & (int_t) 0x007FFFFF; exponent = exponent == -127 ? -(vc4cl_clz(mantissa) - (32 - 23)) - 127 : exponent; result_t result = vc4cl_itof(exponent); result = vc4cl_is_zero(x) ? (result_t)-INFINITY : result; result = isinf(x) ? fabs(x) : result; return isnan(x) ? x : result; }) SIMPLE_3(float, mad, float, a, float, b, float, c, (a * b) + c) //"Returns x if |x|>|y|, y if |y|>|x|, otherwise fmax(x, y)" COMPLEX_2(float, maxmag, float, x, float, y, { // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block result_t tmp = fmax(x, y); result_t other = fabs(y) > fabs(x) ? y : tmp; return fabs(x) > fabs(y) ? x : other; }) //"Returns x if |x|<|y|, y if |y|<|x|, otherwise fmin(x, y)" COMPLEX_2(float, minmag, float, x, float, y, { // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block result_t tmp = fmin(x, y); result_t other = fabs(y) < fabs(x) ? y : tmp; return fabs(x) < fabs(y) ? x : other; }) /** * Expected behavior: * * modf(+-x, iptr) = value with same sign as x * modf(+-Inf, iptr) = +-0, iptr = +-Inf * modf(NaN, iptr) = NaN, iptr = NaN */ // taken OpenCL C 1.2 specification, 7.5.2. Changes to C99 TC2 Behavior COMPLEX_2(float, modf, float, value, __global float, *iptr, { result_t tmp = trunc(value); *iptr = isnan(value) ? value : tmp; result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value); return isnan(value) ? value : result; }) COMPLEX_2(float, modf, float, value, __local float, *iptr, { result_t tmp = trunc(value); *iptr = isnan(value) ? value : tmp; result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value); return isnan(value) ? value : result; }) COMPLEX_2(float, modf, float, value, __private float, *iptr, { result_t tmp = trunc(value); *iptr = isnan(value) ? value : tmp; result_t result = copysign(isinf(value) ? (result_t)0.0f : value - tmp, value); return isnan(value) ? value : result; }) SIMPLE_1(float, nan, uint, nancode, vc4cl_bitcast_float(NAN | nancode)) /** * Expected behavior: * * nextafter(+smallest normal, y < +smallest normal) = +0 * nextafter(-smallest normal, y > -smallest normal) = -0 * nextafter(-0, y > 0) = smallest positive normal value * nextafter(0, y < 0) = smallest negative normal value */ COMPLEX_2(float, nextafter, float, x, float, y, { int_t ix = vc4cl_bitcast_int(x); int_t iy = vc4cl_bitcast_int(y); /* x > y -> x -= ulp otherwise x += ulp */ int_t xPos = ix > iy ? ix - 1 : ix + 1; /* x < y -> x -= ulp otherwise x += ulp */ int_t xNeg = iy >= 0 || ix > iy ? ix - 1 : ix + 1; int_t xNotY = ix >= 0 ? /* x > 0 */ xPos : /* x < 0 */ xNeg; int_t res = x == y ? iy : xNotY; result_t result = vc4cl_bitcast_float(res); result = (x == (result_t)-0.0f && y > 0.0f) ? (result_t)FLT_MIN : result; result = (x == (result_t)0.0f && y < 0.0f) ? (result_t)-FLT_MIN : result; result = (x == (result_t)FLT_MIN && y < (result_t)FLT_MIN) ? (result) +0.0f : result; result = (x == (result_t)-FLT_MIN && y > (result_t)-FLT_MIN) ? (result) -0.0f : result; return isnan(x) ? x : (isnan(y) ? y : result); }) /** * Expected behavior: * * pow(+-0, y) = +-Inf for y odd and y < 0 * pow(+-0, y) = +Inf for y even and y < 0 * pow(+-0, y) = +-0 for y odd and y > 0 * pow(+-0, y) = 0 for y even and y > 0 * pow(-1, +-Inf) = 1 * pow(1, y) = 1 * pow(x, +-0) = 1 * pow(x, y) = NaN for x < 0 and y not an integer * pow(x, -Inf) = +Inf for |x| < 0 * pow(x, -Inf) = 0 for |x| > 0 * pow(x, +Inf) = 0 for |x| < 0 * pow(x, +Inf) = 0 for |x| > 0 * pow(-Inf, y) = -0 for y odd and y < 0 * pow(-Inf, y) = 0 for y even and y < 0 * pow(-Inf, y) = -Inf for y odd and y > 0 * pow(-Inf, y) = +Inf for y even and y > 0 * pow(+Inf, y) = 0 for y < 0 * pow(+Inf, y) = +Inf for y > 0 * pow(+-0, -Inf) =+Inf */ // for pow, see also https://stackoverflow.com/questions/4518011/algorithm-for-powfloat-float COMPLEX_2(float, pow, float, x, float, y, { result_t tmp = powr(x, y); return y < 0.0f ? (result_t) 1.0f / tmp : tmp; }) /** * Expected behavior: * * pown(x, 0) = 1 * pown(+-0, n) = +-Inf for odd n and n < 0 * pown(+-0, n) = +Inf for even n and n < 0 * pown(+-0, n) = +-0 for odd n and n > 0 * pown(+-0, n) = 0 for even n and n > 0 */ // TODO is there a way to not need to calculate both versions? Or at least calculate both at once?! SIMPLE_2(float, pown, float, x, int, n, n < 0 ? (1.0f / fast_pown(x, vc4cl_bitcast_uint(-n), 32)) : fast_pown(x, vc4cl_bitcast_uint(n), 32)) /** * Expected behavior: * * powr(x, +-0) = 1 for x > 0 * powr(+-0, y) = +Inf for y < 0 * powr(+-0, -Inf) = +Inf * powr(+-0, y) = +0 for y > 0 * powr(1, y) = 1 * powr(x, y) = NaN for x < 0 * powr(+-0, +-0) = NaN * powr(+Inf, +-0) = NaN * powr(1, +-Inf) = NaN * powr(x, NaN) = NaN for x >= 0 * powr(NaN, y) = NaN */ //"Compute x to the power y, where x is >= 0." // x^y = e^(y * ln(x)) SIMPLE_2(float, powr, float, x, float, y, exp(y *log(x))) /** * Expected behavior: * * remainder(x, 0) = NaN or 0 * remainder(Inf, y) = NaN * remainder(x, Inf) = x */ COMPLEX_2(float, remainder, float, x, float, y, { // TODO correct? result_t k = rint(x / y); result_t result = x - k * y; result = vc4cl_is_zero(y) ? (result_t)-nan(0) : result; result = isinf(x) ? (result_t)nan(0) : result; result = isnan(x) ? x : result; return isnan(y) ? y : result; }) /** * Expected behavior: * * remquo(x, y, quo) = NaN, quo = 0 for x = +-Inf or y = 0 or x = NaN or y = NaN * exp2(-Inf) = 0 * exp2(+Inf) = +Inf */ // taken from pocl (https://github.com/pocl/pocl/blob/master/lib/kernel/vecmathlib-pocl/remquo.cl) COMPLEX_3(float, remquo, float, x, float, y, __global int, *quo, { result_t k = rint(x / y); *quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k)); return x - k * y; }) COMPLEX_3(float, remquo, float, x, float, y, __local int, *quo, { result_t k = rint(x / y); *quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k)); return x - k * y; }) COMPLEX_3(float, remquo, float, x, float, y, __private int, *quo, { result_t k = rint(x / y); *quo = (vc4cl_ftoi(k) & 0x7F) * (1 - 2 * signbit(k)); return x - k * y; }) /** * Expected behavior: * * rint(x) = -0 for -0.5 <= x < 0 */ COMPLEX_1(float, rint, float, val, { //" Round to nearest even integer. " // round like round, but decides on nearest even for halfway cases // TODO always round to nearest even (e.g. 0.4 to 2.0) or just for half-way cases? // https://stackoverflow.com/questions/12279914/implement-ceil-in-c // http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1 int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f; // |val| < 2^23 fits into integer int_t integer = vc4cl_ftoi(val); result_t truncated = vc4cl_itof(integer); // for negative numbers, truncated is larger than value, for positive it is smaller result_t diff = val - truncated; // -> if the |difference| is > 0.5, subtract/add 1 result_t correction = 0.0f + (diff < -0.5f ? (result_t)-1.0f : (result_t)0.0f) + (diff > 0.5f ? (result_t)1.0f : (result_t)0.0f); // if the value is exactly the half, round to nearest even int_t isHalfWay = diff == 0.5f || diff == -0.5f; // if the truncated integer is even, use it. Otherwise (lsb of integer is 1) subtract/add 1 result_t halfWayCorrection = 0 + vc4cl_itof(integer & 1) * sign(val); result_t result = truncated + correction + vc4cl_itof(isHalfWay) * halfWayCorrection; // deciding here which value to return saves us up to two jumps return tooBig ? val : result; }) /** * Expected behavior: * * rootn(+-0, n) = +-Inf for odd n < 0 * rootn(+-0, n) = +Inf for even n < 0 * rootn(+-0, n) = +-0 for odd n > 0 * rootn(+-0, n) = +0 for even n > 0 * rootn(x, n) = NaN for x < 0 and n even * rootn(x, 0) = NaN */ // TODO different algorithm? SIMPLE_2(float, rootn, float, x, int, y, pow(x, (arg0_t) 1.0f / vc4cl_itof(y))) // see: https://en.wikipedia.org/wiki/Nth_root_algorithm /** * Expected behavior: * * round(+-0) = +-0 * round(+-Inf) = +-Inf * round(x) = -0 for -0.5 < x < 0 */ COMPLEX_1(float, round, float, val, { //" Return the integral value nearest to x rounding halfway cases away from zero, regardless of the current rounding // direction. " // https://stackoverflow.com/questions/12279914/implement-ceil-in-c // http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1 int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f; // |val| < 2^23 fits into integer result_t truncated = vc4cl_itof(vc4cl_ftoi(val)); // for negative numbers, truncated is larger than value, for positive it is smaller result_t diff = val - truncated; // -> if the |difference| is >= 0.5, subtract/add 1 result_t correction = 0.0f + (diff <= -0.5f ? (result_t)-1.0f : (result_t)0.0f) + (diff >= 0.5f ? (result_t)1.0f : (result_t)0.0f); result_t result = truncated + correction; // deciding here which value to return saves us up to two jumps return tooBig ? val : result; }) COMPLEX_1(float, rsqrt, float, x, { // Using Goldschmidt's algorithm with 2 iterations // see http://www.informatik.uni-trier.de/Reports/TR-08-2004/rnc6_12_markstein.pdf (formula 9) arg_t y = vc4cl_sfu_rsqrt(x); arg_t g = x * y; arg_t h = y * (arg_t)0.5f; for(unsigned i = 0; i < 2; ++i) { arg_t r = (arg_t)0.5f - g * h; g = g + g * r; h = h + h * r; } // rsqrt(+-0) = +-Inf result_t result = vc4cl_is_zero(x) ? copysign((result_t)INFINITY, x) : (arg_t)2.0f * h; // rsqrt(NaN) = NaN result = isnan(x) ? x : result; // rsqrt(Inf) = 0 result = isinf(x) ? (result_t)0.0f : result; // rsqrt(x) for x < 0 = -NaN result = x < (result_t)0.0f ? copysign((result_t)nan(0), -1.0f) : result; return result; }) COMPLEX_1(float, sin_pade, float, val, { /* * Pade approximation * (https://en.wikipedia.org/wiki/Pad%C3%A9_approximant#Examples) * * Has a relative error of less than 2^23 (1 ULP) for range [-PI/2, PI/2], then rises fast * * Alternative: * Mendenhall algorithm to calculate both sine and cosine: * - Is also exact enough for range [-PI/2, PI/2] * - https://arxiv.org/pdf/cs/0406049.pdf * - * https://github.com/broadcomCM/android_hardware_broadcom_brcm_usrlib/blob/ics/dag/vmcsx/middleware/khronos/glsl/glsl_mendenhall.c */ arg_t a = 12671.0f / 4363920.0f; arg_t b = 2363.0f / 18183.0f; arg_t c = 445.0f / 12122.0f; arg_t d = 601.0f / 872784.0f; arg_t e = 121.0f / 16662240.0f; arg_t up = (a * val * val * val * val * val) - (b * val * val * val) + val; arg_t down = 1.0f + (c * val * val) + (d * val * val * val * val) + (e * val * val * val * val * val * val); return up / down; }) COMPLEX_1(float, sin_fast, float, x, { /* * Alternative implementation, used by Mesa3D * - less instructions to calculate * - has greater error (even for range -pi/2, pi/2) * https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/compiler/nir/nir_opt_algebraic.py * https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648 */ const arg_t B = 4.0 / M_PI; const arg_t C = -4.0 / (M_PI * M_PI); arg_t y = B * x + C * x * fabs(x); // const float Q = 0.775; const arg_t P = (arg_t)0.225; y = P * (y * fabs(y) - y) + y; // Q * y + P * y * abs(y) return y; }) //COMPLEX_1(float, sin_taylor, float, x, { /* * Alternative implementation, used by Mesa3D VC4 driver * - calculates the Taylor series * - * https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/gallium/drivers/vc4/vc4_program.c (function ntq_fsin) */ //}) /** * Expected behavior: * * sin(+-0) = +-0 * sin(+-Inf) = NaN for |x| > 1 */ COMPLEX_1(float, sin, float, val, { /* * OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp * * We use argument reduction to bring it into range [-pi/2, pi/2] in which range the Pade approximation is accurate. */ // TODO normalization into [-pi/2, pi/2] is too inaccurate for large values! fmod too inaccurate (~8 ULP, should be 0) // Since sine has a period of 2pi, these rewrites do not change the result (rounding error excluded): // bring into range [-2pi, 2pi] arg_t modifiedVal = fmod(val, 2.0f * M_PI_F); // bring into range [-pi, pi] modifiedVal = modifiedVal < -M_PI_F ? modifiedVal + (2.0f * M_PI_F) : modifiedVal; modifiedVal = modifiedVal > M_PI_F ? modifiedVal - (2.0f * M_PI_F) : modifiedVal; // We move by half a period, so we must invert the sign of the result // bring into range [-pi/2, pi/2] int_t invertSign = modifiedVal < -M_PI_2_F || modifiedVal > M_PI_2_F; modifiedVal = modifiedVal < -M_PI_2_F ? modifiedVal + M_PI_F : modifiedVal; modifiedVal = modifiedVal > M_PI_2_F ? modifiedVal - M_PI_F : modifiedVal; arg_t result = sin_pade(modifiedVal); return invertSign ? -result : result; }) SIMPLE_2(float, sincos, float, x, __global float, *cosval, (*cosval = cos(x), sin(x))) SIMPLE_2(float, sincos, float, x, __local float, *cosval, (*cosval = cos(x), sin(x))) SIMPLE_2(float, sincos, float, x, __private float, *cosval, (*cosval = cos(x), sin(x))) /** * Expected behavior: * * sinh(+-0) = +-0 * sinh(+-Inf) = +-Inf */ COMPLEX_1(float, sinh, float, val, { /* * https://en.wikipedia.org/wiki/Taylor_series#Approximation_and_convergence * * This version (the Taylor-series up to x^17/17!) has following accuracy: * * sinh(PI) -> error of 2*10^-8 < ulp(PI) = 2*10^-7 * sinh(0.5f) -> error of < 10^-10 < ulp(0.5f) = 5*10^-8 * sinh(0.0f) -> error of < 10^-48 < ulp(0.0f) = 1*10^-45 * * OpenCL 1.2 EMBEDDED PROFILE allows an error of up to 4 ulp */ // TODO inaccurate out of the range of -PI <= x <= PI /* * Alternatives: * sinh(x) = sqrt((cosh(2x)-1)/2) * sinh(x) = exp(x) - cosh(x) */ const result_t fac3 = 1.0f / factorial(3); const result_t fac5 = 1.0f / factorial(5); const result_t fac7 = 1.0f / factorial(7); const result_t fac9 = 1.0f / factorial(9); const result_t fac11 = 1.0f / factorial(11); const result_t fac13 = 1.0f / factorial(13); const result_t fac15 = 1.0f / factorial(15); const result_t fac17 = 1.0f / factorial(17); result_t x = val; /// TODO optimize!! /* while(x < -M_PI_F) { x = x + 2 * M_PI_F; } while(x > M_PI_F) { x = x - 2 * M_PI_F; } */ result_t x3 = x * x * x; result_t tmp1 = x3 * fac3; // sinh(x) = x + x^3/3! + ... result_t tmp = x + tmp1; result_t x5 = x3 * x * x; tmp1 = x5 * fac5; //... + x^5/5! + ... tmp = x + tmp1; result_t x7 = x5 * x * x; tmp1 = x7 * fac7; //... + x^7/7! + ... tmp = tmp + tmp1; result_t x9 = x7 * x * x; tmp1 = x9 * fac9; //... + x^9/9! + ... tmp = tmp + tmp1; result_t x11 = x9 * x * x; tmp1 = x11 * fac11; //... + x^11/11! + ... tmp = tmp + tmp1; result_t x13 = x11 * x * x; tmp1 = x13 * fac13; //... + x^13/13! + ... tmp = tmp + tmp1; result_t x15 = x13 * x * x; tmp1 = x15 * fac15; //... + x^15/15! + ... tmp = tmp + tmp1; result_t x17 = x15 * x * x; tmp1 = x17 * fac17; //... + x^17/17! tmp = tmp + tmp1; return tmp; }) /** * Expected behavior: * * sinpi(+-0) = +-0 * sinpi(n) = 0 for n integer and n > 0 * sinpi(n) = -0 for n integer and n < 0 * sinpi(+-Inf) = NaN */ SIMPLE_1(float, sinpi, float, val, sin(val *M_PI_F)) COMPLEX_1(float, sqrt, float, x, { // Using Goldschmidt's algorithm with 2 iterations // see http://www.informatik.uni-trier.de/Reports/TR-08-2004/rnc6_12_markstein.pdf (formula 9) arg_t y = vc4cl_sfu_rsqrt(x); arg_t g = x * y; arg_t h = y * (arg_t)0.5f; for(unsigned i = 0; i < 2; ++i) { arg_t r = (arg_t)0.5f - g * h; g = g + g * r; h = h + h * r; } // sqrt(NaN) = NaN, sqrt(Inf) = Inf result_t result = vc4cl_is_inf_nan(x) ? x : g; // sqrt(+-0) = +-0 result = vc4cl_is_zero(x) ? x : result; // sqrt(x) = NaN, for x < 0 return x < 0.0f ? (result_t)nan(0) : result; }) /** * Expected behavior: * * tan(+-0) = +-0 * tan(+-Inf) = NaN */ // TODO different algorithm?! // Taylor Series is too inaccurate (converges too slow) SIMPLE_1(float, tan, float, val, sin(val) / cos(val)) /** * Expected behavior: * * tanh(+-0) = +-0 * tanh(+-Inf) = +-Inf */ // TODO different algorithm?! // Taylor Series is too inaccurate (converges too slow) SIMPLE_1(float, tanh, float, val, sinh(val) / cosh(val)) /** * Expected behavior: * * tanpi(+-0) = +-0 * tanpi(+-Inf) = NaN * tanpi(n) = copysign(0,n) for n even and n integer * tanpi(n) = copysign(0, -n) for n odd and n integer * tanpi(x) = +Inf for x = n + 0.5 and n even integer * tanpi(x) = -Inf for x = n + 0.5 and n odd integer */ SIMPLE_1(float, tanpi, float, val, tan(val *M_PI_F)) /** * Expected behavior: * * tgamma(+-0) = +-Inf * tgamma(x) = NaN for x < 0 and x integer * tgamma(-Inf) = NaN * tgamma(+Inf) = +Inf */ // TODO different algorithm?! E.g. http://www.math.hkbu.edu.hk/support/aands/page_257.htm with argument reduction // (previous page) SIMPLE_1(float, tgamma, float, val, exp(lgamma(val))) /** * Expected behavior: * * trunc(+-0) = +-0 * trunc(+-Inf) = +-Inf * trunc(x) = -0 for -1 < x < 0 */ COMPLEX_1(float, trunc, float, val, { //" Round to integer toward zero. " // https://stackoverflow.com/questions/12279914/implement-ceil-in-c // http://blog.frama-c.com/index.php?post/2013/05/02/nearbyintf1 int_t tooBig = isnan(val) || fabs(val) >= 0x1.0p23f; // |val| < 2^23 fits into integer result_t truncated = vc4cl_itof(vc4cl_ftoi(val)); // deciding here which value to return saves us up to two jumps return tooBig ? val : truncated; }) /* * native functions */ SIMPLE_1(float, native_cos, float, val, cos(val)) SIMPLE_2(float, native_divide, float, x, float, y, x *native_recip(y)) // e^x = 2^(x * log2(e)) SIMPLE_1(float, native_exp, float, x, native_exp2(x *M_LOG2E_F)) SIMPLE_1(float, native_exp2, float, val, vc4cl_sfu_exp2(val)) // 10^x = 2^(x * log2(10)) SIMPLE_1(float, native_exp10, float, x, native_exp2(x *M_LOG210)) // log(x) = log2(x) / log2(e) SIMPLE_1(float, native_log, float, x, native_log2(x) * (1.0f / M_LOG2E_F)) SIMPLE_1(float, native_log2, float, val, vc4cl_sfu_log2(val)) // log10(x) = log2(x) / log2(10) SIMPLE_1(float, native_log10, float, x, native_log2(x) * (1.0f / M_LOG210)) //"Compute x to the power y, where x is >= 0." // x^y = 2^(y * log2(x)) SIMPLE_2(float, native_powr, float, x, float, y, native_exp2(y *native_log2(x))) SIMPLE_1(float, native_recip, float, val, vc4cl_sfu_recip(val)) SIMPLE_1(float, native_rsqrt, float, val, vc4cl_sfu_rsqrt(val)) SIMPLE_1(float, native_sin, float, val, sin(val)) SIMPLE_1(float, native_sqrt, float, val, val * vc4cl_sfu_rsqrt(val)) SIMPLE_1(float, native_tan, float, val, tan(val)) /* * half functions, accuracy of 8192 ULP (11 MSB of the mantissa) */ SIMPLE_1(float, half_cos, float, val, cos(val)) SIMPLE_2(float, half_divide, float, x, float, y, x * vc4cl_sfu_recip(y)) SIMPLE_1(float, half_exp, float, val, native_exp(val)) COMPLEX_1(float, half_exp2, float, val, { result_t result = vc4cl_sfu_exp2(val); result = vc4cl_is_zero(val) ? (result_t)1.0f : result; result = val < (result_t)-126.0f ? (result_t)0.0f : result; result = val >= (result_t)128.0f ? (result_t)INFINITY : result; return isnan(val) ? val : result; }) SIMPLE_1(float, half_exp10, float, val, native_exp10(val)) SIMPLE_1(float, half_log, float, val, native_log(val)) COMPLEX_1(float, half_log2, float, val, { /* log2(x) = log2(M * e^E) = E + log2(M) */ int_t bitcast = vc4cl_bitcast_int(val); /* deduct exponent offset, we use -126 instead of -127, since we go into the range [0.5, 1), not [1, 2) */ int_t exponent = ((bitcast >> 23) & 0xFF) - 126; /* mask off exponent and replace with exponent for range [0.5, 1) */ int_t tmp = (bitcast & (int_t)0x807FFFFF) | (int_t)0x3F000000; arg_t mantissa = vc4cl_bitcast_float(tmp); result_t logMantissa = vc4cl_sfu_log2(mantissa); result_t result = vc4cl_itof(exponent) + logMantissa; result = vc4cl_is_inf_nan(val) ? val : result; result = signbit(val) ? (result_t)nan(0) : result; result = vc4cl_is_zero(val) ? (result_t)-INFINITY: result; return result; }) SIMPLE_1(float, half_log10, float, val, native_log10(val)) SIMPLE_2(float, half_powr, float, x, float, y, powr(x, y)) SIMPLE_1(float, half_recip, float, val, (arg_t)1.0f / val) COMPLEX_1(float, half_rsqrt, float, val, { arg_t x = vc4cl_sfu_rsqrt(val); // rsqrt(+-0) = +-Inf result_t result = vc4cl_is_zero(val) ? copysign((result_t)INFINITY, val) : x; // rsqrt(NaN) = NaN result = isnan(val) ? val : result; // rsqrt(Inf) = 0 result = isinf(val) ? (result_t)0.0f : result; // rsqrt(x) for x < 0 = -NaN result = val < (result_t)0.0f ? copysign((result_t)nan(0), -1.0f) : result; return result; }) SIMPLE_1(float, half_sin, float, val, sin(val)) COMPLEX_1(float, half_sqrt, float, val, { arg_t x = val * vc4cl_sfu_rsqrt(val); // sqrt(NaN) = NaN, sqrt(Inf) = Inf result_t result = vc4cl_is_inf_nan(val) ? val : x; // sqrt(+-0) = +-0 result = vc4cl_is_zero(val) ? val : result; // sqrt(x) = NaN, for x < 0 return val < 0.0f ? (result_t)nan(0) : result; }) SIMPLE_1(float, half_tan, float, val, tan(val)) #endif /* VC4CL_MATH_H */