
VC4Stdlib

main
Raziel K. Crowe 2 years ago
parent commit 1c4c363d5c
  1. drivers/videocore4_stdlib/CMakeLists.txt (+60)
  2. drivers/videocore4_stdlib/LICENSE (+21)
  3. drivers/videocore4_stdlib/Readme.md (+4)
  4. drivers/videocore4_stdlib/experimental/CMakeLists.txt (+7)
  5. drivers/videocore4_stdlib/experimental/cbrt.cl (+91)
  6. drivers/videocore4_stdlib/experimental/compare_implementations.cpp (+404)
  7. drivers/videocore4_stdlib/experimental/exp.cl (+364)
  8. drivers/videocore4_stdlib/experimental/fma.cl (+46)
  9. drivers/videocore4_stdlib/experimental/identity.cl (+9)
  10. drivers/videocore4_stdlib/experimental/log.cl (+256)
  11. drivers/videocore4_stdlib/include/VC4CLStdLib.h (+77)
  12. drivers/videocore4_stdlib/include/_async.h (+245)
  13. drivers/videocore4_stdlib/include/_atomics.h (+659)
  14. drivers/videocore4_stdlib/include/_clcxx_mangling.h (+411)
  15. drivers/videocore4_stdlib/include/_common.h (+101)
  16. drivers/videocore4_stdlib/include/_config.h (+30)
  17. drivers/videocore4_stdlib/include/_conversions.h (+1861)
  18. drivers/videocore4_stdlib/include/_extensions.h (+173)
  19. drivers/videocore4_stdlib/include/_float_float.h (+121)
  20. drivers/videocore4_stdlib/include/_geometric.h (+93)
  21. drivers/videocore4_stdlib/include/_images.h (+1016)
  22. drivers/videocore4_stdlib/include/_integer.h (+233)
  23. drivers/videocore4_stdlib/include/_intrinsics.h (+436)
  24. drivers/videocore4_stdlib/include/_math.h (+1666)
  25. drivers/videocore4_stdlib/include/_overloads.h (+819)
  26. drivers/videocore4_stdlib/include/_printf.h (+43)
  27. drivers/videocore4_stdlib/include/_relational.h (+341)
  28. drivers/videocore4_stdlib/include/_spir_mangling.h (+1716)
  29. drivers/videocore4_stdlib/include/_synchronization.h (+24)
  30. drivers/videocore4_stdlib/include/_vector.h (+265)
  31. drivers/videocore4_stdlib/include/_work_items.h (+70)
  32. drivers/videocore4_stdlib/include/defines.h (+105)
  33. drivers/videocore4_stdlib/include/opencl-c.h (+16914)

60
drivers/videocore4_stdlib/CMakeLists.txt

@@ -0,0 +1,60 @@
cmake_minimum_required (VERSION 3.1)
####
# General configuration
####
# Option whether to create deb package
option(BUILD_DEB_PACKAGE "Enables creating .deb package" ON)
# Option whether to cross-compile for the Raspberry Pi (default: ON, for compatibility)
option(CROSS_COMPILE "Cross compile for Raspbian" ON)
option(BUILD_EXPERIMENTAL "Build experimental test program" OFF)
if(NOT BUILD_NUMBER)
set(BUILD_NUMBER 9999)
endif()
project(VC4CLStdLib VERSION 0.4.${BUILD_NUMBER})
#Include headers in the project structure
file( GLOB HDRS "${PROJECT_SOURCE_DIR}/include/*.h")
add_library(VC4CLStdLib STATIC ${HDRS})
set_target_properties(VC4CLStdLib PROPERTIES LINKER_LANGUAGE C)
##
# Installation targets
##
# Adds the public headers to the target, so they are exported
target_include_directories(VC4CLStdLib PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include/vc4cl-stdlib>)
# Creates the install target for the headers
install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/" DESTINATION include/vc4cl-stdlib FILES_MATCHING PATTERN "*.h")
# Adds custom uninstall command
add_custom_target(uninstall "${CMAKE_COMMAND}" -P "cmake_uninstall.cmake")
if (BUILD_EXPERIMENTAL)
add_subdirectory(experimental)
endif (BUILD_EXPERIMENTAL)
####
# Building package
####
if (BUILD_DEB_PACKAGE)
message(STATUS "build deb package...")
set(CPACK_GENERATOR "DEB")
set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
set(CPACK_PACKAGE_NAME "vc4cl-stdlib")
string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%d")
set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}-${BUILD_TIMESTAMP}")
if (CROSS_COMPILE)
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "armhf")
else()
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
endif()
set(CPACK_PACKAGE_VENDOR "doe300")
set(CPACK_PACKAGE_CONTACT "[email protected]")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL C headers for the VC4CL implementation (raspberrypi only)")
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/doe300/VC4CLStdLib")
set(CPACK_PACKAGE_FILE_NAME "vc4cl-stdlib-0.4-Linux")
include(CPack)
endif (BUILD_DEB_PACKAGE)

21
drivers/videocore4_stdlib/LICENSE

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2022
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

4
drivers/videocore4_stdlib/Readme.md

@@ -0,0 +1,4 @@
# VC4CLStdLib
Implementation of the OpenCL standard library, required to build the [VC4C] compiler.

7
drivers/videocore4_stdlib/experimental/CMakeLists.txt

@@ -0,0 +1,7 @@
find_package(OpenCL REQUIRED)
find_package(Threads REQUIRED)
add_executable(compare_implementations compare_implementations.cpp)
target_compile_features(compare_implementations PRIVATE cxx_std_14)
target_compile_options(compare_implementations PRIVATE -g -Og)
target_link_libraries(compare_implementations OpenCL::OpenCL Threads::Threads)

91
drivers/videocore4_stdlib/experimental/cbrt.cl

@@ -0,0 +1,91 @@
#define arg_t float16
#define result_t float16
#define int_t int16
#define uint_t uint16
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
result_t approx_rootn(arg_t x, int_t n)
{
// Divides the exponent by n and emplaces it back into the number
// Adapted from: https://web.archive.org/web/20131227144655/http://metamerist.com/cbrt/cbrt.htm
int_t i = CAT(as_, int_t)(x);
int_t exp = (i - (int_t) (127 << 23)) / n + (int_t) (127 << 23);
return CAT(as_, result_t)((i & (int_t) 0x807FFFFF) | (exp));
}
result_t approx_cbrt(arg_t f)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
uint_t p = CAT(as_, uint_t)(f);
p = p / 3 + 709921077;
return CAT(as_, result_t)(p);
}
result_t cbrt_halley_step(arg_t x, arg_t base)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t x3 = x * x * x;
return x * (x3 + base + base) / (x3 + x3 + base);
}
result_t cbrt_halley(arg_t val)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
arg_t abs = fabs(val);
arg_t approx = approx_rootn(abs, 3);
result_t result = approx;
#pragma loop unroll
for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
{
result = cbrt_halley_step(result, val);
}
return copysign(result, val);
}
__kernel void cbrt_halley_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = cbrt_halley(in[gid]);
}
result_t cbrt_newton_step(arg_t x, arg_t base)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
return x - (1.0f / 3.0f) * (x - base / (x * x));
}
result_t cbrt_newton(arg_t val)
{
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
arg_t abs = fabs(val);
arg_t approx = approx_cbrt(abs);
result_t result = approx;
#pragma loop unroll
for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
{
result = cbrt_newton_step(result, val);
}
return copysign(result, val);
}
__kernel void cbrt_newton_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = cbrt_newton(in[gid]);
}
__kernel void cbrt_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = cbrt(in[gid]);
}
__kernel void cbrt_pow_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = pow(in[gid], 1.0f / 3.0f);
}
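
The initial guess above works because dividing a float's biased exponent bits by n roughly divides log2(x) by n. A minimal scalar C sketch of the same trick plus the Halley refinement, assuming IEEE-754 single-precision floats (helper names are illustrative, not part of this commit):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// bit-level initial guess: divide the biased exponent by n, as in approx_rootn()
static float approx_rootn_scalar(float x, int n)
{
    int32_t i;
    memcpy(&i, &x, sizeof i); // scalar equivalent of as_int(x)
    int32_t exp = (i - (127 << 23)) / n + (127 << 23);
    i = (int32_t) ((i & 0x807FFFFFu) | (uint32_t) exp);
    float r;
    memcpy(&r, &i, sizeof r);
    return r;
}

// one Halley iteration towards cbrt(base): x * (x^3 + 2*base) / (2*x^3 + base)
static float halley_step(float x, float base)
{
    float x3 = x * x * x;
    return x * (x3 + base + base) / (x3 + x3 + base);
}

int main(void)
{
    float v = 27.0f;
    float x = approx_rootn_scalar(v, 3);
    for (int i = 0; i < 4; ++i)
        x = halley_step(x, v);
    printf("cbrt(%g) ~ %g (libm: %g)\n", v, x, cbrtf(v));
    return 0;
}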

404
drivers/videocore4_stdlib/experimental/compare_implementations.cpp

@@ -0,0 +1,404 @@
#define CL_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD 1
#define CL_HPP_ENABLE_EXCEPTIONS 1
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#include <CL/cl.hpp>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unistd.h> // geteuid()
#include <vector>
static constexpr uint32_t DEFAULT_NUM_LINEAR = 12 * 16 * 8;
static constexpr uint32_t DEFAULT_NUM_RANDOM = 12 * 16 * 8;
// VC4CL performance counters
#define CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 10)
#define CL_PROFILING_PERFORMANCE_COUNTER_IDLE_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 11)
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL (CL_PROFILING_COMMAND_END + 12)
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 13)
#define CL_PROFILING_PERFORMANCE_COUNTER_L2_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 14)
struct Range
{
float min;
float max;
};
struct ReferenceFunction
{
ReferenceFunction(float (*func)(float)) : numParameters(1), ptr(reinterpret_cast<void *>(func)) {}
ReferenceFunction(float (*func)(float, float)) : numParameters(2), ptr(reinterpret_cast<void *>(func)) {}
ReferenceFunction(float (*func)(float, float, float)) : numParameters(3), ptr(reinterpret_cast<void *>(func)) {}
float operator()(float val) const
{
if(numParameters != 1)
throw std::runtime_error{"Reference function called with the wrong number of arguments"};
return reinterpret_cast<float (*)(float)>(ptr)(val);
}
float operator()(float val0, float val1) const
{
if(numParameters != 2)
throw std::runtime_error{"Reference function called with the wrong number of arguments"};
return reinterpret_cast<float (*)(float, float)>(ptr)(val0, val1);
}
float operator()(float val0, float val1, float val2) const
{
if(numParameters != 3)
throw std::runtime_error{"Reference function called with the wrong number of arguments"};
return reinterpret_cast<float (*)(float, float, float)>(ptr)(val0, val1, val2);
}
std::vector<float> operator()(const std::vector<std::vector<float>> &inputs) const
{
std::vector<float> out(inputs.front().size());
for(std::size_t i = 0; i < out.size(); ++i)
{
if(numParameters == 1)
out[i] = (*this)(inputs[0][i]);
if(numParameters == 2)
out[i] = (*this)(inputs[0][i], inputs[1][i]);
if(numParameters == 3)
out[i] = (*this)(inputs[0][i], inputs[1][i], inputs[2][i]);
}
return out;
}
uint8_t numParameters;
void *ptr;
};
struct Test
{
std::string name;
ReferenceFunction reference;
uint32_t allowedErrorInUlp;
std::string sourceFile;
std::vector<Range> ranges;
};
static float identity(float val)
{
return val;
}
// XXX The OpenCL-CTS calculates its reference values in double precision and is thus more accurate. Tests that are
// accurate here might therefore still fail in the CTS!
static const std::vector<Test> floatTests = {
Test{"log", logf, 4, "log.cl",
{
{0.5, 1.0}, // reduced range some implementations use
{std::numeric_limits<float>::min(), std::numeric_limits<float>::max()} // full range
}},
Test{"exp", expf, 4, "exp.cl",
{
{0.0, 0.5f * logf(2.0f)}, // reduced range some implementations use
{-87.0f /* everything below e^-87 is subnormal */, 89.0f /* everything above e^89 is Inf */} // full range
}},
Test{"identity", identity, 0, "identity.cl",
{
{-10.0f, 10.0f}, {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
}},
Test{"cbrt", cbrtf, 4, "cbrt.cl",
{
{-1.0, 1.0}, // limited range for precision testing
{std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
}},
Test{"fma", fmaf, 0, "fma.cl",
{
{-100.0f, 100.0f}, // reduced range to not run into NaN/Inf
{std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
}}};
static std::vector<float> generateInputData(const Range &range, uint32_t numLinear, uint32_t numRandom)
{
std::vector<float> data{};
data.reserve(numLinear + numRandom);
auto step = (range.max - range.min) / static_cast<float>(numLinear); // TODO overflows on full ranges
for(float val = range.min; val < range.max; val += step)
data.emplace_back(val);
std::random_device rd{};
std::default_random_engine gen(rd());
std::uniform_real_distribution<> dist{range.min, range.max};
while(data.size() < (numLinear + numRandom))
data.emplace_back(static_cast<float>(dist(gen)));
return data;
}
static std::vector<std::vector<float>> generateInputData(
const Range &range, uint32_t numLinear, uint32_t numRandom, uint8_t numInputs)
{
std::vector<std::vector<float>> data{};
for(uint8_t i = 0; i < numInputs; ++i)
data.emplace_back(generateInputData(range, numLinear, numRandom));
return data;
}
static std::vector<cl::Kernel> createKernels(const cl::Context &context, const Test &test)
{
std::stringstream ss;
{
std::ifstream fis{test.sourceFile};
ss << fis.rdbuf();
}
cl::Program program(context, ss.str(), true);
std::vector<cl::Kernel> kernels;
program.createKernels(&kernels);
return kernels;
}
struct ErrorResult
{
std::vector<float> inputValues;
float expected;
float actual;
uint32_t errorInUlp;
// ordered by "most wrong" first
bool operator<(const ErrorResult &other) const noexcept
{
if(errorInUlp > other.errorInUlp)
return true;
if(errorInUlp < other.errorInUlp)
return false;
return inputValues < other.inputValues;
}
friend std::ostream &operator<<(std::ostream &os, const ErrorResult &error)
{
os << "Error of " << error.errorInUlp << " ULP for ";
if(error.inputValues.size() == 1)
os << std::scientific << error.inputValues.front();
else if(error.inputValues.size() == 2)
os << std::scientific << '{' << error.inputValues.front() << ", " << error.inputValues.back() << '}';
else if(error.inputValues.size() == 3)
os << std::scientific << '{' << error.inputValues[0] << ", " << error.inputValues[1] << ", "
<< error.inputValues[2] << '}';
else
{
os << '{';
for(auto input : error.inputValues)
os << std::scientific << input << ", ";
os << '}';
}
os << ", expected " << error.expected << ", got " << error.actual << std::defaultfloat;
return os;
}
};
template <typename Out, typename In>
static Out bit_cast(In val)
{
union
{
In in;
Out out;
} u;
u.in = val;
return u.out;
}
static uint32_t calculateError(float reference, float result, uint32_t allowedErrorInUlp)
{
if(std::isinf(reference) && std::isinf(result) && std::signbit(reference) == std::signbit(result))
return 0;
if(std::isnan(reference) && std::isnan(result))
return 0;
// auto ulp = std::abs(reference * std::numeric_limits<float>::epsilon());
// float difference = std::abs(result - reference);
// if(difference > static_cast<float>(allowedErrorInUlp))
// return static_cast<uint32_t>(std::ceil(difference / ulp));
// return 0;
return static_cast<uint32_t>(std::abs(bit_cast<int32_t>(reference) - bit_cast<int32_t>(result)));
}
static std::pair<std::vector<ErrorResult>, uint32_t> checkResults(const std::vector<std::vector<float>> &inputs,
const std::vector<float> &reference, const std::vector<float> &result, uint32_t allowedErrorInUlp)
{
std::vector<ErrorResult> errors;
uint32_t maxError = 0;
for(std::size_t i = 0; i < std::min(reference.size(), result.size()); ++i)
{
auto error = calculateError(reference.at(i), result.at(i), allowedErrorInUlp);
maxError = std::max(maxError, error);
if(error > allowedErrorInUlp)
{
std::vector<float> errorInputs;
for(const auto &input : inputs)
errorInputs.push_back(input.at(i));
errors.emplace_back(ErrorResult{std::move(errorInputs), reference.at(i), result.at(i), error});
}
}
std::sort(errors.begin(), errors.end());
return std::make_pair(std::move(errors), maxError);
}
static void runTest(
const cl::Context &context, const cl::CommandQueue &queue, const Test &test, uint32_t numLinear, uint32_t numRandom)
{
std::cout << "Running test " << test.sourceFile << " ..." << std::endl;
std::cout << "\tRunning " << test.ranges.size() << " ranges with " << (numLinear + numRandom) << " values"
<< std::endl;
auto kernels = createKernels(context, test);
std::cout << "\tTesting " << kernels.size() << " implementations " << std::endl;
for(const auto &range : test.ranges)
{
auto inputs = generateInputData(range, numLinear, numRandom, test.reference.numParameters);
auto inputSize = inputs.front().size();
cl::NDRange globalSize(inputSize / 16);
std::vector<float> reference = test.reference(inputs);
std::vector<cl::Buffer> inputBuffers;
for(auto &input : inputs)
inputBuffers.emplace_back(queue, input.begin(), input.end(), true);
cl::Buffer outputBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, inputSize * sizeof(float));
for(auto &kernel : kernels)
{
kernel.setArg(0, outputBuffer);
for(std::size_t i = 0; i < inputBuffers.size(); ++i)
kernel.setArg(1 + i, inputBuffers[i]);
std::cout << "\tRunning kernel '" << kernel.getInfo<CL_KERNEL_FUNCTION_NAME>() << "' with "
<< (inputSize / 16) << " work-items ... " << std::endl;
auto start = std::chrono::steady_clock::now();
cl::Event kernelEvent{};
queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, cl::NullRange, nullptr, &kernelEvent);
kernelEvent.wait();
auto end = std::chrono::steady_clock::now();
std::cout << "\t- Finished in "
<< std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us"
<< std::endl;
std::chrono::nanoseconds deviceDuration{kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()};
std::cout << "\t- Executed for "
<< std::chrono::duration_cast<std::chrono::microseconds>(deviceDuration).count() << " us"
<< std::endl;
if(geteuid() == 0) // TODO only on hardware
{
cl_ulong numInstructions = 0;
kernelEvent.getProfilingInfo(
CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL, &numInstructions);
cl_ulong numCycles = 0;
kernelEvent.getProfilingInfo(CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL, &numCycles);
std::cout << "\t- Executed " << numInstructions << " instructions in " << numCycles << " cycles"
<< std::endl;
}
std::vector<float> result(inputSize);
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, inputSize * sizeof(float), result.data());
auto errors = checkResults(inputs, reference, result, test.allowedErrorInUlp);
std::cout << "\t- Has " << errors.first.size() << " wrong results and a maximum error of " << errors.second
<< " ULP (of allowed " << test.allowedErrorInUlp << " ULP)" << std::endl;
for(std::size_t i = 0; i < std::min(errors.first.size(), std::size_t{8}); ++i)
std::cout << "\t\t" << errors.first[i] << std::endl;
if(errors.first.size() > 8)
std::cout << "\t\t[...]" << std::endl;
}
}
}
static void printHelp()
{
std::cout << "Usage: <program> [<options>] <test> [<test>...]" << std::endl;
std::cout << "Options: " << std::endl;
std::cout << "\t--help Shows this help message" << std::endl;
std::cout << "\t--linear=<num> Specifies the number of linear test values, defaults to " << DEFAULT_NUM_LINEAR
<< std::endl;
std::cout << "\t--random=<num> Specifies the number of random test values, defaults to " << DEFAULT_NUM_RANDOM
<< std::endl;
std::cout << "Available tests: ";
for(const auto &test : floatTests)
std::cout << test.name << ", ";
std::cout << std::endl;
}
int main(int argc, char **argv)
{
uint32_t numLinear = DEFAULT_NUM_LINEAR;
uint32_t numRandom = DEFAULT_NUM_RANDOM;
if(argc < 2)
{
printHelp();
return EXIT_SUCCESS;
}
auto platform = cl::Platform::get();
cl::Device device{};
{
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty())
{
std::cout << "No device found!" << std::endl;
return EXIT_FAILURE;
}
device = devices.front();
}
cl::Context context(device);
cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE);
std::vector<std::reference_wrapper<const Test>> selectedTests;
for(int i = 1; i < argc; ++i)
{
if(argv[i][0] == '-')
{
if(std::string{"--help"} == argv[i])
{
printHelp();
return EXIT_SUCCESS;
}
else if(strstr(argv[i], "--linear=") == argv[i])
numLinear = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--linear=")));
else if(strstr(argv[i], "--random=") == argv[i])
numRandom = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--random=")));
else
{
std::cout << "Unknown option: " << argv[i] << std::endl;
printHelp();
return EXIT_FAILURE;
}
continue; // option handled, do not also interpret it as a test name
}
auto testIt =
std::find_if(floatTests.begin(), floatTests.end(), [&](const Test &test) { return test.name == argv[i]; });
if(testIt != floatTests.end())
selectedTests.emplace_back(std::cref(*testIt));
else
{
std::cout << "No such test '" << argv[i] << "', available tests: ";
for(const auto &test : floatTests)
std::cout << test.name << ", ";
std::cout << std::endl;
return EXIT_FAILURE;
}
}
for(const auto &test : selectedTests)
runTest(context, queue, test.get(), numLinear, numRandom);
return EXIT_SUCCESS;
}
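
For reference, the ULP metric used by calculateError() above relies on a property of the IEEE-754 encoding: for finite floats of the same sign, the difference between the bit patterns (read as integers) equals the number of representable values between them. A standalone C sketch of that metric (hypothetical helper name):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// ULP distance via bit patterns; like the harness above, this is only
// meaningful for finite values of the same sign
static uint32_t ulp_distance(float reference, float result)
{
    int32_t a, b;
    memcpy(&a, &reference, sizeof a);
    memcpy(&b, &result, sizeof b);
    int64_t diff = (int64_t) a - (int64_t) b;
    return (uint32_t) (diff < 0 ? -diff : diff);
}

int main(void)
{
    float x = 1.0f;
    float y = nextafterf(nextafterf(x, 2.0f), 2.0f); // two representable values above 1.0f
    printf("distance: %u ULP\n", ulp_distance(x, y)); // prints 2
    return 0;
}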

364
drivers/videocore4_stdlib/experimental/exp.cl

@@ -0,0 +1,364 @@
#define arg_t float16
#define result_t float16
#define int_t int16
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
// vc4cl_split(double) of M_LN2
#define M_LN2_FF 0xB102E3083F317218
float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));
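// NOTE: The vc4cl_* functions declared above are VC4C compiler intrinsics for
// "float-float" extended-precision arithmetic (see _float_float.h). Each
// 64-bit element of a ulong16 appears to pack a pair of floats: the low 32
// bits hold the head value, the high 32 bits a small correction term. E.g. in
// M_LN2_FF, 0x3F317218 is ln(2) rounded to float and 0xB102E308 the residual
// of that rounding; a zero upper half (as in 0x000000003F800000 == 1.0) marks
// an exactly representable value.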
result_t pow2(int_t val)
{
// y = 2^x = 1.0 [implied] * 2^(x + offset)
int_t tmp = val << 23;
// alternative: tmp = (val + 127) << 23;
tmp += (int_t) 0x3F800000;
return CAT(as_, result_t)(tmp & (int_t) 0x7F800000);
}
int_t powerOfTwo(arg_t val)
{
// Original code, produces Inf for e^(~10^38)
// return CAT(convert_, int_t)(ceil((val / M_LN2_F) - 0.5f));
// Using floor() instead of ceil(),
// - fixes Inf for large exponents
// - slightly reduces accuracy of Chebyshev implementations (by ~4 ULP),
// - greatly reduces accuracy of Taylor (<10 ULP -> >1200 ULP) -> requires more iterations
return CAT(convert_, int_t)(floor((val / M_LN2_F) - 0.5f));
}
/*
* Taylor series with Horner's method and range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-6
*/
result_t exp_taylor(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
arg_t tk = 1.0f;
arg_t tn = 1.0f;
#pragma loop unroll
for(int i = 1; i < 10; i++) // TODO can adjust number of iterations
{
tk *= r / i;
tn += tk;
}
tn = tn * pow2(k);
return val < 0 ? 1 / tn : tn;
}
__kernel void exp_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_taylor(in[gid]);
}
result_t exp_taylor_extended_precision_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
ulong16 tk = 0x000000003F800000; // 1.0
ulong16 tn = 0x000000003F800000; // 1.0
tk = vc4cl_mul(tk, r);
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003F000000)); // 1 / 2
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB22AAAAB3EAAAAAB)); // 1 / 3
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E800000)); // 1 / 4
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB14CCCCD3E4CCCCD)); // 1 / 5
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1AAAAAB3E2AAAAB)); // 1 / 6
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1DB6DB73E124925)); // 1 / 7
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E000000)); // 1 / 8
tn = vc4cl_add(tn, tk);
tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB0638E393DE38E39)); // 1 / 9
tn = vc4cl_add(tn, tk);
// removing any iteration makes the result inaccurate (removing last iteration gives 19 ULP)
result_t result = vc4cl_lossy(tn) * pow2(k);
return val < 0 ? 1.0f / result : result;
}
// __kernel void exp_taylor_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
// uint gid = get_global_id(0);
// out[gid] = exp_taylor_extended_precision_exact(in[gid]);
// }
// TODO Lagrange and Barycentric interpolations from https://www.pseudorandom.com/implementing-exp
/*
* Chebyshev interpolation with range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-18
*/
result_t exp_chebyshev(arg_t val)
{
// XXX could remove unneeded coefficients once we fix precision
const float coefficients[] = {
1.266065877752008335598244625214717537923,
1.130318207984970054415392055219726613610,
0.2714953395340765623657051399899818507081,
0.04433684984866380495257149525979922986386,
0.00547424044209373265027616843118645948703,
0.000542926311913943750362147810307554678760,
0.00004497732295429514665469032811091269841937,
3.198436462401990505863872976602295688795e-6,
1.992124806672795725961064384805589035648e-7,
1.103677172551734432616996091335324170860e-8,
5.50589607967374725047142040200552692791e-10,
2.497956616984982522712010934218766985311e-11,
1.039152230678570050499634672423840849837e-12,
3.991263356414401512887720401532162026594e-14,
};
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
arg_t ti = 1.0f;
arg_t tj = r;
arg_t p = coefficients[0] + (coefficients[1] * r);
#pragma loop unroll
for(int i = 2; i < 8; i++) // TODO can adjust number of iterations
{
arg_t tk = (2 * r * tj) - ti;
p += coefficients[i] * tk;
ti = tj;
tj = tk;
}
p = p * pow2(k);
return val < 0 ? 1 / p : p;
}
__kernel void exp_chebyshev_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_chebyshev(in[gid]);
}
result_t exp_chebyshev_extended_precision_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
ulong16 ti = 0x000000003F800000; // 1.0
ulong16 tj = r;
// 1.266065877752008335598244625214717537923 and 1.130318207984970054415392055219726613610
ulong16 p = vc4cl_add(0x333386C33FA20E72, vc4cl_mul(0x33395E683F90AE44, r));
r = vc4cl_mul(r, 0x0000000040000000); // 2.0
ulong16 tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0xB13AF4A23E8B0170, tk)); // 0.2714953395340765623657051399899818507081
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0xB0FC8DF03D359A8F, tk)); // 0.04433684984866380495257149525979922986386
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0xAEA95A453BB36142, tk)); // 0.00547424044209373265027616843118645948703
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x2B7994663A0E532B, tk)); // 0.000542926311913943750362147810307554678760
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x2BC988B0383CA608, tk)); // 0.00004497732295429514665469032811091269841937
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x29A61EF43656A4B8, tk)); // 3.198436462401990505863872976602295688795e-6
ti = tj;
tj = tk;
tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
p = vc4cl_add(p, vc4cl_mul(0x26B66C3C3455E71C, tk)); // 1.992124806672795725961064384805589035648e-7
ti = tj;
tj = tk;
// removing any iteration makes the result inaccurate (removing last iteration gives 5 ULP)
result_t result = vc4cl_lossy(p) * pow2(k);
return val < 0 ? 1.0f / result : result;
}
// __kernel void exp_chebyshev_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
// uint gid = get_global_id(0);
// out[gid] = exp_chebyshev_extended_precision_exact(in[gid]);
// }
/*
* Chebyshev interpolation with monomial basis and range reduction,
*
* https://www.pseudorandom.com/implementing-exp#section-18
*/
result_t exp_chebyshev_monomial(arg_t val)
{
// XXX could remove unneeded coefficients once we fix precision
// TODO invert order of coefficients and traversal ?!
const float coefficients[] = {
1.000000000000000,
1.000000000000000,
0.500000000000002,
0.166666666666680,
0.041666666666727,
0.008333333333342,
0.001388888888388,
1.984126978734782e-4,
2.480158866546844e-5,
2.755734045527853e-6,
2.755715675968011e-7,
2.504861486483735e-8,
2.088459690899721e-9,
1.632461784798319e-10,
};
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;
arg_t pn = 1.143364767943110e-11;
#pragma loop unroll
for(int i = 0; i < 14; i++)
{
pn = pn * r + coefficients[13 - i];
}
pn = pn * pow2(k);
return val < 0 ? 1 / pn : pn;
}
__kernel void exp_chebyshev_monomial_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_chebyshev_monomial(in[gid]);
}
result_t exp_chebyshev_monomial_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
arg_t kFloat = CAT(convert_, arg_t)(k);
arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));
arg_t pn = 1.143364767943110e-11;
pn = pn * r + 1.632461784798319e-10f;
pn = pn * r + 2.088459690899721e-9f;
pn = pn * r + 2.504861486483735e-8f;
pn = pn * r + 2.755715675968011e-7f;
pn = pn * r + 2.755734045527853e-6f;
pn = pn * r + 2.480158866546844e-5f;
pn = pn * r + 1.984126978734782e-4f;
pn = pn * r + 0.001388888888388f;
pn = pn * r + 0.008333333333342f;
pn = pn * r + 0.041666666666727f;
pn = pn * r + 0.166666666666680f;
pn = pn * r + 0.500000000000002f;
pn = pn * r + 1.000000000000000f;
pn = pn * r + 1.000000000000000f;
pn = pn * pow2(k);
return val < 0 ? 1 / pn : pn;
}
__kernel void exp_chebyshev_monomial_exact_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp_chebyshev_monomial_exact(in[gid]);
}
result_t exp_chebyshev_monomial_extended_precision_exact(arg_t val)
{
arg_t positive = fabs(val);
// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
int_t k = powerOfTwo(positive);
ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));
ulong16 pn = 0x209249252D492492; // 1.143364767943110e-11
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA21249252F337DB7); // 1.632461784798319e-10
pn = vc4cl_add(vc4cl_mul(pn, r), 0x24924925310F8492); // 2.088459690899721e-9
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA65B6DB732D72A7D); // 2.504861486483735e-8
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA85B6DB73493F245); // 2.755715675968011e-7
pn = vc4cl_add(vc4cl_mul(pn, r), 0xA9FDB6DB3638EF27); // 2.755734045527853e-6
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAB60000037D00D02); // 2.480158866546844e-5
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAC65BDB739500D01); // 1.984126978734782e-4
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAE161D323AB60B61); // 0.001388888888388
pn = vc4cl_add(vc4cl_mul(pn, r), 0xAFEEEDB73C088889); // 0.008333333333342
pn = vc4cl_add(vc4cl_mul(pn, r), 0xB0AAA88B3D2AAAAB); // 0.041666666666727
pn = vc4cl_add(vc4cl_mul(pn, r), 0xB1AAAA8D3E2AAAAB); // 0.166666666666680
pn = vc4cl_add(vc4cl_mul(pn, r), 0x271000003F000000); // 0.500000000000002
pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
result_t result = vc4cl_lossy(pn) * pow2(k);
return val < 0 ? 1.0f / result : result;
}
// __kernel void exp_chebyshev_monomial_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
// uint gid = get_global_id(0);
// out[gid] = exp_chebyshev_monomial_extended_precision_exact(in[gid]);
// }
// TODO Remes from www.netlib.org/fdlibm/e_exp.c
// TODO Matters computational (sections 32.2.2.2 and 32.2.3)
// Pade Approximation (16 steps): (1680 + 840x + 180 x^2 + 20 x^3 + x^4) / (1680 - 840 x + 180 x^2 - 20 x^3 + x^4)
// TODO https://math.stackexchange.com/questions/1988901/approximating-the-exponential-function-with-taylor-series?rq=1
// TODO http://www.netlib.org/fdlibm/
__kernel void exp_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = exp(in[gid]);
}
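
All variants above share the same skeleton: reduce to e^x = 2^k * e^r with a small r, approximate e^r by a short polynomial, and scale by 2^k built directly in the exponent bits (pow2() above). A scalar C sketch of that skeleton, assuming IEEE-754 floats:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const float LN2 = 0.69314718f;

// build 2^k by writing k into the exponent field, like pow2() above
static float pow2i(int k)
{
    int32_t bits = (k + 127) << 23;
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

static float exp_reduced(float x)
{
    float p = fabsf(x);
    int k = (int) floorf(p / LN2 - 0.5f); // power-of-two part
    float r = p - (float) k * LN2;        // small remainder
    float tk = 1.0f, tn = 1.0f;           // Taylor series, as in exp_taylor()
    for (int i = 1; i < 10; ++i)
    {
        tk *= r / (float) i;
        tn += tk;
    }
    tn *= pow2i(k);
    return x < 0 ? 1.0f / tn : tn; // e^-x = 1 / e^x
}

int main(void)
{
    for (float v = -5.0f; v <= 5.0f; v += 2.5f)
        printf("exp(%+.2f) ~ %.6f (libm: %.6f)\n", v, exp_reduced(v), expf(v));
    return 0;
}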

46
drivers/videocore4_stdlib/experimental/fma.cl

@@ -0,0 +1,46 @@
#define arg_t float16
#define result_t float16
#define int_t int16
#define uint_t uint16
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));
result_t fma_simple(arg_t in0, arg_t in1, arg_t in2)
{
return in0 * in1 + in2; // a * b + c, with intermediate rounding after the multiplication
}
__kernel void fma_simple_kernel(
__global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
uint gid = get_global_id(0);
out[gid] = fma_simple(in0[gid], in1[gid], in2[gid]);
}
result_t fma_extended_precision(arg_t in0, arg_t in1, arg_t in2)
{
ulong16 mul = vc4cl_mul(in0, in1);
return vc4cl_lossy(vc4cl_add(mul, vc4cl_extend(in2)));
}
__kernel void fma_extended_precision_kernel(
__global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
uint gid = get_global_id(0);
out[gid] = fma_extended_precision(in0[gid], in1[gid], in2[gid]);
}
__kernel void fma_builtin_kernel(
__global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
uint gid = get_global_id(0);
out[gid] = fma(in0[gid], in1[gid], in2[gid]);
}
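
fma_simple() rounds twice (after the multiplication and after the addition), while a true fma rounds the exact result once. A small C example where the two differ, assuming round-to-nearest-even (compile with -ffp-contract=off so the compiler does not fuse the plain expression itself):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 1.0f + 0x1.0p-12f;    // 1 + 2^-12
    float c = -(1.0f + 0x1.0p-11f); // a*a == 1 + 2^-11 + 2^-24 exactly
    // the separate multiply rounds a*a to 1 + 2^-11, so the sum cancels to 0
    printf("unfused: %a\n", a * a + c);
    // fmaf keeps the 2^-24 term of the exact product
    printf("fused:   %a\n", fmaf(a, a, c));
    return 0;
}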

9
drivers/videocore4_stdlib/experimental/identity.cl

@@ -0,0 +1,9 @@
#define arg_t float16
#define result_t float16
#define int_t int16
__kernel void identity_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = in[gid];
}

256
drivers/videocore4_stdlib/experimental/log.cl

@@ -0,0 +1,256 @@
#define arg_t float16
#define result_t float16
#define int_t int16
/*
* Helper, arithmetic-geometric-mean,
*
* https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean
*/
result_t agm(arg_t x, arg_t y)
{
arg_t arithm = x;
arg_t geom = y;
for(unsigned iteration = 0; iteration < 6; ++iteration) // TODO can adjust number of iterations
{
arg_t arithm_new = (arithm + geom) / (arg_t) 2.0;
geom = sqrt(arithm * geom);
arithm = arithm_new;
}
return arithm;
}
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)
#define REDUCE_ARGUMENT_TO_0_1 \
/* log(S * M * 2^E) = log(S * M) + E log(2) */ \
int_t bitcast = CAT(as_, int_t)(val); \
/* deduct exponent offset, we use -126, to go into the range [0.5, 1) */ \
int_t exponent = ((bitcast >> 23) & 0xFF) - 126; \
/* mask off exponent and replace with exponent for range [0.5, 1) */ \
int_t signedMantissaBits = (bitcast & (int_t) 0x807FFFFF) | (int_t) 0x3F000000; \
arg_t mantissa = CAT(as_, result_t)(signedMantissaBits); \
result_t reduced = CAT(convert_, result_t)(exponent) * M_LN2_F;
/*
* Taylor-series,
*
* https://en.wikipedia.org/wiki/Mercator_series
*/
result_t log1p_taylor(arg_t val)
{
// ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
// converges for -1 < x <= 1 (requires argument reduction)
REDUCE_ARGUMENT_TO_0_1
// iteration 1
result_t result = mantissa;
arg_t power = mantissa;
#pragma loop unroll
for(unsigned iteration = 2; iteration <= 26; ++iteration) // TODO can adjust number of iterations
{
power *= mantissa;
arg_t sign = iteration & 1 ? (arg_t) 1.0 : (arg_t) -1.0;
result = result + sign * (arg_t) (1.0 / iteration) * power;
}
return result + reduced;
}
__kernel void log1p_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log1p_taylor(in[gid] - (arg_t) 1.0f);
}
result_t log1p_taylor_unrolled(arg_t val)
{
// ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
// converges for -1 < x <= 1 (requires argument reduction)
REDUCE_ARGUMENT_TO_0_1
// iteration 1
result_t result = mantissa;
arg_t power = mantissa;
// iteration 2
power *= mantissa;
result = result - (arg_t) (1.0 / 2.0) * power;
// iteration 3
power *= mantissa;
result = result + (arg_t) (1.0 / 3.0) * power;
// iteration 4
power *= mantissa;
result = result - (arg_t) (1.0 / 4.0) * power;
// iteration 5
power *= mantissa;
result = result + (arg_t) (1.0 / 5.0) * power;
// iteration 6
power *= mantissa;
result = result - (arg_t) (1.0 / 6.0) * power;
// iteration 7
power *= mantissa;
result = result + (arg_t) (1.0 / 7.0) * power;
// iteration 8
power *= mantissa;
result = result - (arg_t) (1.0 / 8.0) * power;
// iteration 9
power *= mantissa;
result = result + (arg_t) (1.0 / 9.0) * power;
// iteration 10
power *= mantissa;
result = result - (arg_t) (1.0 / 10.0) * power;
// iteration 11
power *= mantissa;
result = result + (arg_t) (1.0 / 11.0) * power;
// iteration 12
power *= mantissa;
result = result - (arg_t) (1.0 / 12.0) * power;
// iteration 13
power *= mantissa;
result = result + (arg_t) (1.0 / 13.0) * power;
// iteration 14
power *= mantissa;
result = result - (arg_t) (1.0 / 14.0) * power;
// iteration 15
power *= mantissa;
result = result + (arg_t) (1.0 / 15.0) * power;
// iteration 16
power *= mantissa;
result = result - (arg_t) (1.0 / 16.0) * power;
// iteration 17
power *= mantissa;
result = result + (arg_t) (1.0 / 17.0) * power;
// iteration 18
power *= mantissa;
result = result - (arg_t) (1.0 / 18.0) * power;
// iteration 19
power *= mantissa;
result = result + (arg_t) (1.0 / 19.0) * power;
// iteration 20
power *= mantissa;
result = result - (arg_t) (1.0 / 20.0) * power;
// iteration 21
power *= mantissa;
result = result + (arg_t) (1.0 / 21.0) * power;
// iteration 22
power *= mantissa;
result = result - (arg_t) (1.0 / 22.0) * power;
// iteration 23
power *= mantissa;
result = result + (arg_t) (1.0 / 23.0) * power;
// iteration 24
power *= mantissa;
result = result - (arg_t) (1.0 / 24.0) * power;
// iteration 25
power *= mantissa;
result = result + (arg_t) (1.0 / 25.0) * power;
// iteration 26
power *= mantissa;
result = result - (arg_t) (1.0 / 26.0) * power;
// TODO can adjust number of iterations
return result + reduced;
}
__kernel void log1p_taylor_unrolled_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log1p_taylor_unrolled(in[gid] - (arg_t) 1.0f);
}
/*
* Taylor series with optimization, requires argument reduction,
*
* https://math.stackexchange.com/a/3383716
*/
result_t log_taylor(arg_t val)
{
REDUCE_ARGUMENT_TO_0_1
result_t result = 0;
// iteration 1
arg_t tmp = (mantissa - (arg_t) 1.0) / (mantissa + (arg_t) 1.0);
arg_t factor = tmp * tmp;
#pragma loop unroll
for(unsigned iteration = 1; iteration <= 26; iteration += 2) // TODO can adjust number of iterations
{
// per the linked formula, the term for odd i is 2 * tmp^i / i
result += 2 * tmp / (arg_t) iteration;
tmp *= factor;
}
return result + reduced;
}
__kernel void log_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log_taylor(in[gid]);
}
/*
* Using the arithmetic-geometric-mean,
*
* https://en.wikipedia.org/wiki/Natural_logarithm#High_precision
*/
result_t log_agm(arg_t val)
{
const unsigned m = 8; // TODO can adjust for precision
arg_t s = val * (arg_t) (1 << m);
arg_t mean = agm((arg_t) 1.0f, (arg_t) 4.0f / s);
return (val * M_PI_F) / (2 * mean) - (arg_t) ((float) m * M_LN2_F);
}
__kernel void log_agm_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log_agm(in[gid]);
}
result_t log_agm_reduced(arg_t val)
{
REDUCE_ARGUMENT_TO_0_1
const unsigned m = 8; // TODO can adjust for precision
arg_t s = mantissa * (arg_t) (1 << m);
arg_t mean = agm((arg_t) 1.0f, (arg_t) 4.0f / s);
return (mantissa * M_PI_F) / (2 * mean) - (arg_t) ((float) m * M_LN2_F) + reduced;
}
__kernel void log_agm_reduced_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log_agm_reduced(in[gid]);
}
__kernel void log_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
uint gid = get_global_id(0);
out[gid] = log(in[gid]);
}
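
The REDUCE_ARGUMENT_TO_0_1 macro splits val = m * 2^e with m in [0.5, 1), so only ln(m) needs to be approximated and e * ln(2) is added back at the end. A scalar C sketch of that reduction combined with the series from log_taylor(), assuming positive normal IEEE-754 inputs:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float log_reduced(float val)
{
    int32_t bits;
    memcpy(&bits, &val, sizeof bits);
    // a bias of 126 instead of 127 puts the mantissa into [0.5, 1)
    int32_t e = ((bits >> 23) & 0xFF) - 126;
    bits = (int32_t) ((bits & 0x807FFFFFu) | 0x3F000000u);
    float m;
    memcpy(&m, &bits, sizeof m);
    // ln(m) = 2 * (u + u^3/3 + u^5/5 + ...) with u = (m-1)/(m+1)
    float u = (m - 1.0f) / (m + 1.0f);
    float u2 = u * u, power = u, result = 0.0f;
    for (int i = 1; i <= 25; i += 2)
    {
        result += power / (float) i;
        power *= u2;
    }
    return 2.0f * result + (float) e * 0.69314718f; // + e * ln(2)
}

int main(void)
{
    for (float v = 0.1f; v < 1000.0f; v *= 10.0f)
        printf("log(%g) ~ %f (libm: %f)\n", v, log_reduced(v), logf(v));
    return 0;
}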

77
drivers/videocore4_stdlib/include/VC4CLStdLib.h

@ -0,0 +1,77 @@
/*
* General header for the VC4CLStdlib implementation, contains all required headers
*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CLSTDLIB_H
#define VC4CLSTDLIB_H
#ifdef __cplusplus
extern "C"
{
#endif
#include "_config.h"
#include "_extensions.h"
#include "_conversions.h"
#include "_common.h"
#include "_math.h"
#include "_integer.h"
#include "_geometric.h"
#include "_relational.h"
#include "_work_items.h"
#include "_vector.h"
#include "_synchronization.h"
#include "_async.h"
#include "_atomics.h"
#include "_images.h"
#include "_printf.h"
#include "_spir_mangling.h"
#include "_clcxx_mangling.h"
#undef ALL_BITS_SET
#undef OVERLOADABLE
#undef CONST
#undef PURE
#undef INLINE
#undef FUNC_1
#undef OVERLOAD_1
#undef OVERLOAD_1_RETURN_SCALAR
#undef FUNC_2
#undef OVERLOAD_2
#undef OVERLOAD_2_SCALAR
#undef OVERLOAD_2_RETURN_SCALAR
#undef OVERLOAD_2_SCALAR_RETURN_SCALAR
#undef FUNC_3
#undef OVERLOAD_3
#undef OVERLOAD_3_SCALAR
#undef FUNC_4
#undef FUNC_5
#undef SIMPLE_1
#undef SIMPLE_1_RETURN_SCALAR
#undef SIMPLE_2
#undef SIMPLE_2_RETURN_SCALAR
#undef SIMPLE_2_SCALAR
#undef SIMPLE_3
#undef SIMPLE_3_SCALAR
#undef SIMPLE_3_TWO_SCALAR
#undef COMPLEX_1
#undef COMPLEX_1_RETURN_SCALAR
#undef COMPLEX_2
#undef COMPLEX_3
#undef COMPLEX_3_SCALAR
#undef OVERLOAD_ALL_IMAGE_TYPES
#undef OVERLOAD_ALL_IMAGE_TYPES_1
#undef OVERLOAD_ALL_IMAGE_TYPES_2
#undef OVERLOAD_ALL_IMAGE_TYPES_3
#undef OVERLOAD_ALL_IMAGE_TYPES_4
#ifdef __cplusplus
}
#endif
#endif /* VC4CLSTDLIB_H */

245
drivers/videocore4_stdlib/include/_async.h

@@ -0,0 +1,245 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_ASYNC_H
#define VC4CL_ASYNC_H
#include "_config.h"
#include "_overloads.h"
/*
* This is a synchronous/blocking implementation.
* The copy is "performed by all work-items in a work-group", so each work-item would only have to copy a part of the area.
* But since DMA transfers on different QPUs block each other anyway, we simply execute the whole copy on the first work-item
* (index 0, 0, 0). Idea taken from PoCL.
*/
#define ASYNC_COPY_INTERNAL \
if(vc4cl_local_id(0) == 0) \
{ \
vc4cl_mutex_lock(); \
vc4cl_dma_copy(dst, src, num_elements); \
vc4cl_mutex_unlock(); \
}
#define ASYNC_COPY(type) \
INLINE event_t async_work_group_copy(__local type * dst, const __global type * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__global type * dst, const __local type * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
{ \
ASYNC_COPY_INTERNAL \
return vc4cl_set_event(event); \
}
#define ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
for (size_t i = 0; i < num_elements; ++i) \
dst[i] = src[i * src_stride];
//TODO better way, e.g. via vc4cl_dma_copy and stride-parameter?
#define ASYNC_STRIDED_DEST_COPY_INTERNAL \
for (size_t i = 0; i < num_elements; ++i) \
dst[i * dst_stride] = src[i];
#define ASYNC_STRIDED_COPY(type) \
INLINE event_t async_work_group_strided_copy(__local type * dst, const __global type * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__global type * dst, const __local type * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_DEST_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_DEST_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_DEST_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_DEST_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_DEST_COPY_INTERNAL \
return vc4cl_set_event(event); \
} \
INLINE event_t async_work_group_strided_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
{ \
ASYNC_STRIDED_DEST_COPY_INTERNAL \
return vc4cl_set_event(event); \
}
#define PREFETCH(type) \
INLINE void prefetch(const __global type * ptr, size_t num_entries) OVERLOADABLE \
{ \
vc4cl_prefetch(ptr, num_entries); \
} \
INLINE void prefetch(const __global type##2 * ptr, size_t num_entries) OVERLOADABLE \
{ \
vc4cl_prefetch(ptr, num_entries); \
} \
INLINE void prefetch(const __global type##3 * ptr, size_t num_entries) OVERLOADABLE \
{ \
vc4cl_prefetch(ptr, num_entries); \
} \
INLINE void prefetch(const __global type##4 * ptr, size_t num_entries) OVERLOADABLE \
{ \
vc4cl_prefetch(ptr, num_entries); \
} \
INLINE void prefetch(const __global type##8 * ptr, size_t num_entries) OVERLOADABLE \
{ \
vc4cl_prefetch(ptr, num_entries); \
} \
INLINE void prefetch(const __global type##16 * ptr, size_t num_entries) OVERLOADABLE \
{ \
vc4cl_prefetch(ptr, num_entries); \
}
/*
* OpenCL 1.2, page 278:
* "Perform an async copy of num_gentypes gentype elements from src to dst.
* The async copy is performed by all work-items in a work-group and this built-in
* function must therefore be encountered by all work-items in a work-group executing the kernel with the same argument values."
*/
ASYNC_COPY(uchar)
ASYNC_COPY(char)
ASYNC_COPY(ushort)
ASYNC_COPY(short)
ASYNC_COPY(uint)
ASYNC_COPY(int)
ASYNC_COPY(float)
ASYNC_STRIDED_COPY(uchar)
ASYNC_STRIDED_COPY(char)
ASYNC_STRIDED_COPY(ushort)
ASYNC_STRIDED_COPY(short)
ASYNC_STRIDED_COPY(uint)
ASYNC_STRIDED_COPY(int)
ASYNC_STRIDED_COPY(float)
/*
* OpenCL 1.2, page 279:
* "Wait for events that identify the async_work_group_copy operations to complete.
* The event objects specified in event_list will be released after the wait is performed."
*/
INLINE void wait_group_events(int num_events, event_t* event_list) OVERLOADABLE
{
// async_work_group_copy is implemented synchronously, so there is no asynchronous operation left to wait for.
// But since the copy is only performed by the first work-item, all other work-items have to wait for it to finish, hence the barrier.
barrier(CLK_GLOBAL_MEM_FENCE);
}
/*
* OpenCL 1.2, page 280:
* "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache.
* The prefetch instruction is applied to a work-item in a work-group and does not affect the functional behavior of the kernel."
*
* -> Since it doesn't affect the functional behavior, the implementation is a no-op
*/
PREFETCH(uchar)
PREFETCH(char)
PREFETCH(ushort)
PREFETCH(short)
PREFETCH(uint)
PREFETCH(int)
PREFETCH(float)
#undef ASYNC_COPY_INTERNAL
#undef ASYNC_COPY
#undef ASYNC_STRIDED_SOURCE_COPY_INTERNAL
#undef ASYNC_STRIDED_DEST_COPY_INTERNAL
#undef ASYNC_STRIDED_COPY
#undef PREFETCH
#endif /* VC4CL_ASYNC_H */
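
A hypothetical usage sketch (standard OpenCL 1.2 built-ins, not part of this commit) showing the intended pattern: stage a tile into local memory, operate on it, then copy it back. Every work-item must encounter each call with the same argument values:

__kernel void scale_tile(__global float *out, const __global float *in, __local float *tile)
{
    size_t offset = get_group_id(0) * get_local_size(0);
    event_t e = async_work_group_copy(tile, in + offset, get_local_size(0), 0);
    wait_group_events(1, &e);
    tile[get_local_id(0)] *= 2.0f;
    barrier(CLK_LOCAL_MEM_FENCE); // make the updated tile visible before copying back
    e = async_work_group_copy(out + offset, tile, get_local_size(0), 0);
    wait_group_events(1, &e);
}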

659
drivers/videocore4_stdlib/include/_atomics.h

@@ -0,0 +1,659 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_ATOMICS_H
#define VC4CL_ATOMICS_H
#include "_config.h"
#include "_overloads.h"
#include "_intrinsics.h"
INLINE int atomic_add(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_add(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_add(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_add(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_add(volatile __global int *ptr, int val) OVERLOADABLE
{
return atomic_add(ptr, val);
}
INLINE unsigned int atom_add(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
return atomic_add(ptr, val);
}
INLINE int atom_add(volatile __local int *ptr, int val) OVERLOADABLE
{
return atomic_add(ptr, val);
}
INLINE unsigned int atom_add(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
return atomic_add(ptr, val);
}
INLINE int atomic_sub(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_sub(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_sub(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_sub(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_sub(volatile __global int *ptr, int val) OVERLOADABLE
{
return atomic_sub(ptr, val);
}
INLINE unsigned int atom_sub(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
return atomic_sub(ptr, val);
}
INLINE int atom_sub(volatile __local int *ptr, int val) OVERLOADABLE
{
return atomic_sub(ptr, val);
}
INLINE unsigned int atom_sub(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
return atomic_sub(ptr, val);
}
INLINE int atomic_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, val);
vc4cl_mutex_unlock();
return old;
}
INLINE float atomic_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
vc4cl_mutex_lock();
float old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, val);
vc4cl_mutex_unlock();
return old;
}
INLINE float atomic_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
vc4cl_mutex_lock();
float old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
return atomic_xchg(ptr, val);
}
INLINE unsigned int atom_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_xchg(ptr, val);
}
INLINE float atom_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
return atomic_xchg(ptr, val);
}
INLINE int atom_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
return atomic_xchg(ptr, val);
}
INLINE unsigned int atom_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_xchg(ptr, val);
}
INLINE float atom_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
return atomic_xchg(ptr, val);
}
INLINE int atomic_inc(volatile __global int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + 1);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_inc(volatile __global unsigned int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + 1);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_inc(volatile __local int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + 1);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_inc(volatile __local unsigned int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old + 1);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_inc(volatile __global int * ptr) OVERLOADABLE
{
return atomic_inc(ptr);
}
INLINE unsigned int atom_inc(volatile __global unsigned int * ptr) OVERLOADABLE
{
return atomic_inc(ptr);
}
INLINE int atom_inc(volatile __local int * ptr) OVERLOADABLE
{
return atomic_inc(ptr);
}
INLINE unsigned int atom_inc(volatile __local unsigned int * ptr) OVERLOADABLE
{
return atomic_inc(ptr);
}
INLINE int atomic_dec(volatile __global int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - 1);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_dec(volatile __global unsigned int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - 1);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_dec(volatile __local int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - 1);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_dec(volatile __local unsigned int * ptr) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old - 1);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_dec(volatile __global int * ptr) OVERLOADABLE
{
return atomic_dec(ptr);
}
INLINE unsigned int atom_dec(volatile __global unsigned int * ptr) OVERLOADABLE
{
return atomic_dec(ptr);
}
INLINE int atom_dec(volatile __local int * ptr) OVERLOADABLE
{
return atomic_dec(ptr);
}
INLINE unsigned int atom_dec(volatile __local unsigned int * ptr) OVERLOADABLE
{
return atomic_dec(ptr);
}
INLINE int atomic_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, (old == compare) ? val : old);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, (old == compare) ? val : old);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, (old == compare) ? val : old);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, (old == compare) ? val : old);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE
{
return atomic_cmpxchg(ptr, compare, val);
}
INLINE unsigned int atom_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
return atomic_cmpxchg(ptr, compare, val);
}
INLINE int atom_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE
{
return atomic_cmpxchg(ptr, compare, val);
}
INLINE unsigned int atom_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
return atomic_cmpxchg(ptr, compare, val);
}
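/*
 * Usage sketch (illustrative only, not part of the library): atomic_cmpxchg
 * returns the previous value, so the swap succeeded exactly when the return
 * value equals `compare`. This allows arbitrary read-modify-write loops, e.g.
 * a saturating unsigned add on a __global counter:
 *
 *   uint old, desired;
 *   do {
 *       old = *ptr;                                    // racy snapshot
 *       desired = (old > UINT_MAX - val) ? UINT_MAX : old + val;
 *   } while (atomic_cmpxchg(ptr, old, desired) != old);
 */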
INLINE int atomic_min(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, min(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, min(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_min(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, min(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, min(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_min(volatile __global int * ptr, int val) OVERLOADABLE
{
return atomic_min(ptr, val);
}
INLINE unsigned int atom_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_min(ptr, val);
}
INLINE int atom_min(volatile __local int * ptr, int val) OVERLOADABLE
{
return atomic_min(ptr, val);
}
INLINE unsigned int atom_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_min(ptr, val);
}
INLINE int atomic_max(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, max(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, max(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_max(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, max(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, max(old, val));
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_max(volatile __global int * ptr, int val) OVERLOADABLE
{
return atomic_max(ptr, val);
}
INLINE unsigned int atom_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_max(ptr, val);
}
INLINE int atom_max(volatile __local int * ptr, int val) OVERLOADABLE
{
return atomic_max(ptr, val);
}
INLINE unsigned int atom_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_max(ptr, val);
}
INLINE int atomic_and(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old & val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old & val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_and(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old & val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old & val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_and(volatile __global int * ptr, int val) OVERLOADABLE
{
return atomic_and(ptr, val);
}
INLINE unsigned int atom_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_and(ptr, val);
}
INLINE int atom_and(volatile __local int * ptr, int val) OVERLOADABLE
{
return atomic_and(ptr, val);
}
INLINE unsigned int atom_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_and(ptr, val);
}
INLINE int atomic_or(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old | val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old | val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_or(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old | val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old | val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_or(volatile __global int * ptr, int val) OVERLOADABLE
{
return atomic_or(ptr, val);
}
INLINE unsigned int atom_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_or(ptr, val);
}
INLINE int atom_or(volatile __local int * ptr, int val) OVERLOADABLE
{
return atomic_or(ptr, val);
}
INLINE unsigned int atom_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_or(ptr, val);
}
INLINE int atomic_xor(volatile __global int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old ^ val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old ^ val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atomic_xor(volatile __local int * ptr, int val) OVERLOADABLE
{
vc4cl_mutex_lock();
int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old ^ val);
vc4cl_mutex_unlock();
return old;
}
INLINE unsigned int atomic_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
vc4cl_mutex_lock();
unsigned int old = vc4cl_dma_read(ptr);
vc4cl_dma_write(ptr, old ^ val);
vc4cl_mutex_unlock();
return old;
}
INLINE int atom_xor(volatile __global int * ptr, int val) OVERLOADABLE
{
return atomic_xor(ptr, val);
}
INLINE unsigned int atom_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_xor(ptr, val);
}
INLINE int atom_xor(volatile __local int * ptr, int val) OVERLOADABLE
{
return atomic_xor(ptr, val);
}
INLINE unsigned int atom_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
return atomic_xor(ptr, val);
}
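/*
 * Usage sketch (illustrative only, not part of the library): all of the
 * atomics above serialize through the single vc4cl mutex, so contended
 * read-modify-writes are correct but slow. Typical use:
 *
 *   __kernel void histogram(__global const uchar *in, __global uint *bins)
 *   {
 *       size_t gid = get_global_id(0);
 *       atomic_inc(&bins[in[gid]]);  // lock, DMA read, +1, DMA write, unlock
 *   }
 */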
#endif /* VC4CL_ATOMICS_H */

411
drivers/videocore4_stdlib/include/_clcxx_mangling.h

@ -0,0 +1,411 @@
/*
* OpenCL 2.0 introduces the __generic address space, which is also used by C++ for OpenCL.
*
* Since we do not actually care about address spaces (so far), we can simply map these functions to one of the existing address spaces.
*
* Base list of affected functions generated with:
* llvm-dis -o /dev/stdout ../VC4CLStdLib/include/VC4CLStdLib.bc | grep -oE 'spir_func .?* \S*AS1.*?\)' | sort
*
* This header contains wrappers forwarding the SPIR-mangled functions to the real implementations.
*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_GENERIC_MANGLING
#define VC4CL_GENERIC_MANGLING
#include "_config.h"
float _Z4modffPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z4modffPU3AS1f")));
float _Z5fractfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z5fractfPU3AS1f")));
float _Z5frexpfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z5frexpfPU3AS1i")));
float _Z6remquoffPU3AS4i(float, float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6remquoffPU3AS1i")));
float _Z6sincosfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6sincosfPU3AS1f")));
float _Z8lgamma_rfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8lgamma_rfPU3AS1i")));
float2 _Z4modfDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z4modfDv2_fPU3AS1S_")));
float2 _Z5fractDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z5fractDv2_fPU3AS1S_")));
float2 _Z5frexpDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z5frexpDv2_fPU3AS1Dv2_i")));
float2 _Z6remquoDv2_fS_PU3AS4Dv2_i(float2, float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z6remquoDv2_fS_PU3AS1Dv2_i")));
float2 _Z6sincosDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z6sincosDv2_fPU3AS1S_")));
float2 _Z8lgamma_rDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z8lgamma_rDv2_fPU3AS1Dv2_i")));
float3 _Z4modfDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z4modfDv3_fPU3AS1S_")));
float3 _Z5fractDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z5fractDv3_fPU3AS1S_")));
float3 _Z5frexpDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z5frexpDv3_fPU3AS1Dv3_i")));
float3 _Z6remquoDv3_fS_PU3AS4Dv3_i(float3, float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z6remquoDv3_fS_PU3AS1Dv3_i")));
float3 _Z6sincosDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z6sincosDv3_fPU3AS1S_")));
float3 _Z8lgamma_rDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z8lgamma_rDv3_fPU3AS1Dv3_i")));
float4 _Z4modfDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z4modfDv4_fPU3AS1S_")));
float4 _Z5fractDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z5fractDv4_fPU3AS1S_")));
float4 _Z5frexpDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z5frexpDv4_fPU3AS1Dv4_i")));
float4 _Z6remquoDv4_fS_PU3AS4Dv4_i(float4, float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z6remquoDv4_fS_PU3AS1Dv4_i")));
float4 _Z6sincosDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z6sincosDv4_fPU3AS1S_")));
float4 _Z8lgamma_rDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z8lgamma_rDv4_fPU3AS1Dv4_i")));
float8 _Z4modfDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z4modfDv8_fPU3AS1S_")));
float8 _Z5fractDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z5fractDv8_fPU3AS1S_")));
float8 _Z5frexpDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z5frexpDv8_fPU3AS1Dv8_i")));
float8 _Z6remquoDv8_fS_PU3AS4Dv8_i(float8, float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z6remquoDv8_fS_PU3AS1Dv8_i")));
float8 _Z6sincosDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z6sincosDv8_fPU3AS1S_")));
float8 _Z8lgamma_rDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z8lgamma_rDv8_fPU3AS1Dv8_i")));
float16 _Z4modfDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z4modfDv16_fPU3AS1S_")));
float16 _Z5fractDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z5fractDv16_fPU3AS1S_")));
float16 _Z5frexpDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z5frexpDv16_fPU3AS1Dv16_i")));
float16 _Z6remquoDv16_fS_PU3AS4Dv16_i(float16, float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z6remquoDv16_fS_PU3AS1Dv16_i")));
float16 _Z6sincosDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z6sincosDv16_fPU3AS1S_")));
float16 _Z8lgamma_rDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z8lgamma_rDv16_fPU3AS1Dv16_i")));
char2 _Z6vload2jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kc")));
uchar2 _Z6vload2jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kh")));
short2 _Z6vload2jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ks")));
ushort2 _Z6vload2jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kt")));
int2 _Z6vload2jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ki")));
uint2 _Z6vload2jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kj")));
long2 _Z6vload2jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kl")));
ulong2 _Z6vload2jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload2jPU3AS1Km")));
float2 _Z6vload2jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kf")));
char3 _Z6vload3jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kc")));
uchar3 _Z6vload3jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kh")));
short3 _Z6vload3jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ks")));
ushort3 _Z6vload3jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kt")));
int3 _Z6vload3jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ki")));
uint3 _Z6vload3jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kj")));
long3 _Z6vload3jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kl")));
ulong3 _Z6vload3jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload3jPU3AS1Km")));
float3 _Z6vload3jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kf")));
char4 _Z6vload4jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kc")));
uchar4 _Z6vload4jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kh")));
short4 _Z6vload4jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ks")));
ushort4 _Z6vload4jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kt")));
int4 _Z6vload4jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ki")));
uint4 _Z6vload4jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kj")));
long4 _Z6vload4jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kl")));
ulong4 _Z6vload4jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload4jPU3AS1Km")));
float4 _Z6vload4jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kf")));
char8 _Z6vload8jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kc")));
uchar8 _Z6vload8jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kh")));
short8 _Z6vload8jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ks")));
ushort8 _Z6vload8jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kt")));
int8 _Z6vload8jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ki")));
uint8 _Z6vload8jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kj")));
long8 _Z6vload8jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kl")));
ulong8 _Z6vload8jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload8jPU3AS1Km")));
float8 _Z6vload8jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kf")));
char16 _Z7vload16jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kc")));
uchar16 _Z7vload16jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kh")));
short16 _Z7vload16jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ks")));
ushort16 _Z7vload16jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kt")));
int16 _Z7vload16jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ki")));
uint16 _Z7vload16jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kj")));
long16 _Z7vload16jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kl")));
ulong16 _Z7vload16jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vload16jPU3AS1Km")));
float16 _Z7vload16jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kf")));
void _Z7vstore2Dv2_cjPU3AS4c(char2, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore2Dv2_cjPU3AS1c")));
void _Z7vstore2Dv2_hjPU3AS4h(uchar2, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore2Dv2_hjPU3AS1h")));
void _Z7vstore2Dv2_sjPU3AS4s(short2, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore2Dv2_sjPU3AS1s")));
void _Z7vstore2Dv2_tjPU3AS4t(ushort2, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore2Dv2_tjPU3AS1t")));
void _Z7vstore2Dv2_ijPU3AS4i(int2, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore2Dv2_ijPU3AS1i")));
void _Z7vstore2Dv2_jjPU3AS4j(uint2, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore2Dv2_jjPU3AS1j")));
void _Z7vstore2Dv2_ljPU3AS4l(long2, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore2Dv2_ljPU3AS1l")));
void _Z7vstore2Dv2_mjPU3AS4m(ulong2, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore2Dv2_mjPU3AS1m")));
void _Z7vstore2Dv2_fjPU3AS4f(float2, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore2Dv2_fjPU3AS1f")));
void _Z7vstore3Dv3_cjPU3AS4c(char3, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore3Dv3_cjPU3AS1c")));
void _Z7vstore3Dv3_hjPU3AS4h(uchar3, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore3Dv3_hjPU3AS1h")));
void _Z7vstore3Dv3_sjPU3AS4s(short3, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore3Dv3_sjPU3AS1s")));
void _Z7vstore3Dv3_tjPU3AS4t(ushort3, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore3Dv3_tjPU3AS1t")));
void _Z7vstore3Dv3_ijPU3AS4i(int3, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore3Dv3_ijPU3AS1i")));
void _Z7vstore3Dv3_jjPU3AS4j(uint3, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore3Dv3_jjPU3AS1j")));
void _Z7vstore3Dv3_ljPU3AS4l(long3, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore3Dv3_ljPU3AS1l")));
void _Z7vstore3Dv3_mjPU3AS4m(ulong3, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore3Dv3_mjPU3AS1m")));
void _Z7vstore3Dv3_fjPU3AS4f(float3, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore3Dv3_fjPU3AS1f")));
void _Z7vstore4Dv4_cjPU3AS4c(char4, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore4Dv4_cjPU3AS1c")));
void _Z7vstore4Dv4_hjPU3AS4h(uchar4, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore4Dv4_hjPU3AS1h")));
void _Z7vstore4Dv4_sjPU3AS4s(short4, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore4Dv4_sjPU3AS1s")));
void _Z7vstore4Dv4_tjPU3AS4t(ushort4, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore4Dv4_tjPU3AS1t")));
void _Z7vstore4Dv4_ijPU3AS4i(int4, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore4Dv4_ijPU3AS1i")));
void _Z7vstore4Dv4_jjPU3AS4j(uint4, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore4Dv4_jjPU3AS1j")));
void _Z7vstore4Dv4_ljPU3AS4l(long4, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore4Dv4_ljPU3AS1l")));
void _Z7vstore4Dv4_mjPU3AS4m(ulong4, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore4Dv4_mjPU3AS1m")));
void _Z7vstore4Dv4_fjPU3AS4f(float4, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore4Dv4_fjPU3AS1f")));
void _Z7vstore8Dv8_cjPU3AS4c(char8, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore8Dv8_cjPU3AS1c")));
void _Z7vstore8Dv8_hjPU3AS4h(uchar8, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore8Dv8_hjPU3AS1h")));
void _Z7vstore8Dv8_sjPU3AS4s(short8, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore8Dv8_sjPU3AS1s")));
void _Z7vstore8Dv8_tjPU3AS4t(ushort8, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore8Dv8_tjPU3AS1t")));
void _Z7vstore8Dv8_ijPU3AS4i(int8, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore8Dv8_ijPU3AS1i")));
void _Z7vstore8Dv8_jjPU3AS4j(uint8, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore8Dv8_jjPU3AS1j")));
void _Z7vstore8Dv8_ljPU3AS4l(long8, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore8Dv8_ljPU3AS1l")));
void _Z7vstore8Dv8_mjPU3AS4m(ulong8, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore8Dv8_mjPU3AS1m")));
void _Z7vstore8Dv8_fjPU3AS4f(float8, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore8Dv8_fjPU3AS1f")));
void _Z8vstore16Dv16_cjPU3AS4c(char16, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z8vstore16Dv16_cjPU3AS1c")));
void _Z8vstore16Dv16_hjPU3AS4h(uchar16, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z8vstore16Dv16_hjPU3AS1h")));
void _Z8vstore16Dv16_sjPU3AS4s(short16, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z8vstore16Dv16_sjPU3AS1s")));
void _Z8vstore16Dv16_tjPU3AS4t(ushort16, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z8vstore16Dv16_tjPU3AS1t")));
void _Z8vstore16Dv16_ijPU3AS4i(int16, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8vstore16Dv16_ijPU3AS1i")));
void _Z8vstore16Dv16_jjPU3AS4j(uint16, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8vstore16Dv16_jjPU3AS1j")));
void _Z8vstore16Dv16_ljPU3AS4l(long16, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z8vstore16Dv16_ljPU3AS1l")));
void _Z8vstore16Dv16_mjPU3AS4m(ulong16, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z8vstore16Dv16_mjPU3AS1m")));
void _Z8vstore16Dv16_fjPU3AS4f(float16, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z8vstore16Dv16_fjPU3AS1f")));
int _Z10atomic_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vii")));
uint _Z10atomic_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vjj")));
int _Z8atom_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_andPU3AS1Vii")));
uint _Z8atom_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_andPU3AS1Vjj")));
int _Z9atomic_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vii")));
uint _Z9atomic_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vjj")));
int _Z7atom_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z7atom_orPU3AS1Vii")));
uint _Z7atom_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z7atom_orPU3AS1Vjj")));
int _Z10atomic_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vii")));
uint _Z10atomic_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vjj")));
int _Z8atom_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vii")));
uint _Z8atom_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vjj")));
int _Z10atomic_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vi")));
uint _Z10atomic_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vj")));
int _Z8atom_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vi")));
uint _Z8atom_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vj")));
int _Z10atomic_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vi")));
uint _Z10atomic_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vj")));
int _Z8atom_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vi")));
uint _Z8atom_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vj")));
int _Z10atomic_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vii")));
uint _Z10atomic_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vjj")));
int _Z8atom_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vii")));
uint _Z8atom_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vjj")));
int _Z10atomic_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vii")));
uint _Z10atomic_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vjj")));
int _Z8atom_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_minPU3AS1Vii")));
uint _Z8atom_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_minPU3AS1Vjj")));
int _Z10atomic_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vii")));
uint _Z10atomic_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vjj")));
int _Z8atom_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_addPU3AS1Vii")));
uint _Z8atom_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_addPU3AS1Vjj")));
int _Z10atomic_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vii")));
uint _Z10atomic_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vjj")));
int _Z8atom_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_subPU3AS1Vii")));
uint _Z8atom_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_subPU3AS1Vjj")));
int _Z11atomic_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vii")));
uint _Z11atomic_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vjj")));
float _Z11atomic_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vff")));
int _Z9atom_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vii")));
uint _Z9atom_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vjj")));
float _Z9atom_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vff")));
int _Z12atom_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Viii")));
uint _Z12atom_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Vjjj")));
int _Z14atomic_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Viii")));
uint _Z14atomic_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Vjjj")));
/*
%opencl.event_t* _Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event")));
%opencl.event_t* _Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event")));
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event")));
TODO missing wait_group_events function(s)
void _Z8prefetchPU3AS1Kcj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kcj")));
void _Z8prefetchPU3AS1KDv16_cj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_cj")));
void _Z8prefetchPU3AS1KDv16_fj(__attribute__((address_space(4))) float16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_fj")));
void _Z8prefetchPU3AS1KDv16_hj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_hj")));
void _Z8prefetchPU3AS1KDv16_ij(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_ij")));
void _Z8prefetchPU3AS1KDv16_jj(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_jj")));
void _Z8prefetchPU3AS1KDv16_sj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_sj")));
void _Z8prefetchPU3AS1KDv16_tj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_tj")));
void _Z8prefetchPU3AS1KDv2_cj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_cj")));
void _Z8prefetchPU3AS1KDv2_fj(__attribute__((address_space(4))) float2*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_fj")));
void _Z8prefetchPU3AS1KDv2_hj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_hj")));
void _Z8prefetchPU3AS1KDv2_ij(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_ij")));
void _Z8prefetchPU3AS1KDv2_jj(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_jj")));
void _Z8prefetchPU3AS1KDv2_sj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_sj")));
void _Z8prefetchPU3AS1KDv2_tj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_tj")));
void _Z8prefetchPU3AS1KDv3_cj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_cj")));
void _Z8prefetchPU3AS1KDv3_fj(__attribute__((address_space(4))) float3*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_fj")));
void _Z8prefetchPU3AS1KDv3_hj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_hj")));
void _Z8prefetchPU3AS1KDv3_ij(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_ij")));
void _Z8prefetchPU3AS1KDv3_jj(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_jj")));
void _Z8prefetchPU3AS1KDv3_sj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_sj")));
void _Z8prefetchPU3AS1KDv3_tj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_tj")));
void _Z8prefetchPU3AS1KDv4_cj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_cj")));
void _Z8prefetchPU3AS1KDv4_fj(__attribute__((address_space(4))) float4*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_fj")));
void _Z8prefetchPU3AS1KDv4_hj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_hj")));
void _Z8prefetchPU3AS1KDv4_ij(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_ij")));
void _Z8prefetchPU3AS1KDv4_jj(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_jj")));
void _Z8prefetchPU3AS1KDv4_sj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_sj")));
void _Z8prefetchPU3AS1KDv4_tj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_tj")));
void _Z8prefetchPU3AS1KDv8_cj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_cj")));
void _Z8prefetchPU3AS1KDv8_fj(__attribute__((address_space(4))) float8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_fj")));
void _Z8prefetchPU3AS1KDv8_hj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_hj")));
void _Z8prefetchPU3AS1KDv8_ij(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_ij")));
void _Z8prefetchPU3AS1KDv8_jj(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_jj")));
void _Z8prefetchPU3AS1KDv8_sj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_sj")));
void _Z8prefetchPU3AS1KDv8_tj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_tj")));
void _Z8prefetchPU3AS1Kfj(__attribute__((address_space(4))) float*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kfj")));
void _Z8prefetchPU3AS1Khj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Khj")));
void _Z8prefetchPU3AS1Kij(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kij")));
void _Z8prefetchPU3AS1Kjj(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kjj")));
void _Z8prefetchPU3AS1Ksj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ksj")));
void _Z8prefetchPU3AS1Ktj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ktj")));
*/
#endif /* VC4CL_GENERIC_MANGLING */

101
drivers/videocore4_stdlib/include/_common.h

@ -0,0 +1,101 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_COMMON_H
#define VC4CL_COMMON_H
#include "_overloads.h"
#include "_intrinsics.h"
/*
* Common functions
*
* Some functions have no maximum error specified in the OpenCL specification, see here: https://github.com/KhronosGroup/OpenCL-Docs/issues/33
*
* degrees -> 2 ULP
* radians -> 2 ULP
* mix -> "implementation defined"
* smoothstep -> "implementation defined"
* clamp, min, max, step, sign -> 0 ULP
*/
SIMPLE_3(float, clamp, float, x, float, minval, float, maxval, fmin(fmax(x, minval), maxval))
//TODO version with limits as scalar
// NOTE: using 0x1.ca5dc2p+5 (= 180/M_PI_F + 1 ULP) is slightly more accurate than using 0x1.ca5dcp+5 (180 / M_PI_F),
// but both are accurate enough for 2 ULP maximum error
SIMPLE_1(float, degrees, float, radians, 0x1.ca5dc2p+5 * radians)
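// Illustrative arithmetic for the note above: 180/pi = 57.295779513...;
// 0x1.ca5dcp+5 = 57.2957764 lies ~0.82 ULP below that value, while
// 0x1.ca5dc2p+5 = 57.2957802 lies only ~0.18 ULP above it (1 ULP = 2^-18 ~ 3.8e-6
// at this magnitude), i.e. the chosen constant is the float nearest to 180/pi.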
// Results are undefined if one of the inputs is NaN or Inf,
// so we can call the intrinsic directly and do not need to handle these inputs explicitly
SIMPLE_2(float, max, float, x, float, y, vc4cl_fmax(x, y))
SIMPLE_2_SCALAR(float, max, float, x, float, y, vc4cl_fmax(x, y))
SIMPLE_2(float, min, float, x, float, y, vc4cl_fmin(x, y))
SIMPLE_2_SCALAR(float, min, float, x, float, y, vc4cl_fmin(x, y))
//" Returns the linear blend of x and y implemented as:
// x + (y - x) * a
// a must be a value in the range 0.0 ... 1.0. If a is not in the range 0.0 ... 1.0, the return values are undefined. "
SIMPLE_3(float, mix, float, x, float, y, float, a, x + (y - x) * a)
SIMPLE_3_SCALAR(float, mix, float, x, float, y, float, a, x + (y - x) * a)
SIMPLE_1(float, radians, float, degrees, (M_PI_F / 180) * degrees)
SIMPLE_2(float, step, float, edge, float, val, val < edge ? 0.0f : 1.0f)
INLINE float2 step(float edge, float2 val) OVERLOADABLE
{
return step((float2)edge, val);
}
INLINE float3 step(float edge, float3 val) OVERLOADABLE
{
return step((float3)edge, val);
}
INLINE float4 step(float edge, float4 val) OVERLOADABLE
{
return step((float4)edge, val);
}
INLINE float8 step(float edge, float8 val) OVERLOADABLE
{
return step((float8)edge, val);
}
INLINE float16 step(float edge, float16 val) OVERLOADABLE
{
return step((float16)edge, val);
}
COMPLEX_3(float, smoothstep, float, edge0, float, edge1, float, val,
{
result_t tmp = clamp((result_t) (val - edge0) / (edge1 - edge0), (result_t)0.0f, (result_t)1.0f);
return tmp * tmp * (3 - 2 * tmp);
})
INLINE float2 smoothstep(float edge0, float edge1, float2 val) OVERLOADABLE
{
return smoothstep((float2)edge0, (float2)edge1, val);
}
INLINE float3 smoothstep(float edge0, float edge1, float3 val) OVERLOADABLE
{
return smoothstep((float3)edge0, (float3)edge1, val);
}
INLINE float4 smoothstep(float edge0, float edge1, float4 val) OVERLOADABLE
{
return smoothstep((float4)edge0, (float4)edge1, val);
}
INLINE float8 smoothstep(float edge0, float edge1, float8 val) OVERLOADABLE
{
return smoothstep((float8)edge0, (float8)edge1, val);
}
INLINE float16 smoothstep(float edge0, float edge1, float16 val) OVERLOADABLE
{
return smoothstep((float16)edge0, (float16)edge1, val);
}
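// Worked example (illustrative only): smoothstep(0.0f, 1.0f, 0.25f) clamps
// t = (0.25 - 0.0) / (1.0 - 0.0) = 0.25 and returns t * t * (3 - 2 * t)
// = 0.0625f * 2.5f = 0.15625f; the edges yield exactly 0.0f and 1.0f.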
SIMPLE_1(float, sign, float, val, val > 0.0f ? 1.0f : val < 0.0f ? -1.0f : 0.0f)
#endif /* VC4CL_COMMON_H */

30
drivers/videocore4_stdlib/include/_config.h

@ -0,0 +1,30 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_CONFIG_H
#define VC4CL_CONFIG_H
#include "defines.h"
#include "opencl-c.h"
#ifndef NULL
#define NULL ((void *)0)
#endif
/*
* Math constants
*/
#define M_LOG210 3.01029995663981195214f /* log_2(10) */
#undef NAN
#define NAN 0x7fffffffU /* same as defined in OpenCL C, but as integer */
#undef INF
#define INF 0x7f800000U
#define ALL_BITS_SET 0xFFFFFFFFU
#endif /* VC4CL_CONFIG_H */

1861
drivers/videocore4_stdlib/include/_conversions.h

File diff suppressed because it is too large

173
drivers/videocore4_stdlib/include/_extensions.h

@ -0,0 +1,173 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_EXTENSIONS_H
#define VC4CL_EXTENSIONS_H
#include "_config.h"
#include "_overloads.h"
#include "_intrinsics.h"
/*
* Loop unroll pragma extension
*
* Defines "#pragma unroll <factor>"
*
* Clang supports this natively, so we do not need to do anything
*
* See https://www.khronos.org/registry/OpenCL/extensions/nv/cl_nv_pragma_unroll.txt
* See https://clang.llvm.org/docs/AttributeReference.html#pragma-unroll-pragma-nounroll
*/
#ifndef cl_nv_pragma_unroll
#define cl_nv_pragma_unroll 1
#endif
/*
* ARM core-ID extension
*
* Adds function
* uint arm_get_core_id( void )
* which returns the ID of the OpenCL compute unit, which is always zero here, since the VideoCore IV exposes a single compute unit
*
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_get_core_id.txt
*/
#ifndef cl_arm_core_id
#define cl_arm_core_id 1
#endif
uint arm_get_core_id(void); //prototype, prevents warning
uint arm_get_core_id(void)
{
return 0;
}
/*
* 32-bit atomic counters
*
* Adds type
* counter32_t
* which is a 32-bit type for atomic counters. counter32_t can only be passed as a kernel parameter and cannot be read/assigned.
*
* Adds functions
* uint atomic_inc(counter32_t counter)
* uint atomic_dec(counter32_t counter)
* increments/decrements the given counter32_t value atomically.
*
* NOTE: Since the syntax and semantics are exactly the same as for the uint version of the standard atomic_inc/atomic_dec functions, counter32_t is defined as a typedef for a uint pointer.
*
* See https://www.khronos.org/registry/OpenCL/extensions/ext/cl_ext_atomic_counters_32.txt
*/
#ifndef cl_ext_atomic_counters_32
#define cl_ext_atomic_counters_32 1
#endif
typedef volatile __global uint* counter32_t;
//just the prototypes, the implementations reside in _atomics.h
uint atomic_inc(counter32_t counter) OVERLOADABLE;
uint atomic_dec(counter32_t counter) OVERLOADABLE;
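// Usage sketch (illustrative only, not part of this header; kernel and parameter names are hypothetical):
// __kernel void count_matches(__global const int* data, counter32_t counter, int needle)
// {
//     if (data[get_global_id(0)] == needle)
//         atomic_inc(counter);
// }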
/*
* Integer dot products
*
* Adds functions
* int arm_dot(char4 a, char4 b)
* uint arm_dot(uchar4 a, uchar4 b)
* int arm_dot_acc(char4 a, char4 b, int acc)
* uint arm_dot_acc(uchar4 a, uchar4 b, uint acc)
* int arm_dot_acc(short2 a, short2 b, int acc)
* uint arm_dot_acc(ushort2 a, ushort2 b, uint acc)
* int arm_dot_acc_sat(char4 a, char4 b, int acc)
* uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc)
* calculate the integer dot product (the _acc variants additionally add the scalar accumulator).
* For the _sat functions, the final addition is saturating.
*
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_integer_dot_product.txt
*/
#ifndef cl_arm_integer_dot_product_int8
#define cl_arm_integer_dot_product_int8 1
#endif
#ifndef cl_arm_integer_dot_product_accumulate_int8
#define cl_arm_integer_dot_product_accumulate_int8 1
#endif
#ifndef cl_arm_integer_dot_product_accumulate_int16
#define cl_arm_integer_dot_product_accumulate_int16 1
#endif
#ifndef cl_arm_integer_dot_product_accumulate_saturate_int8
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1
#endif
// prototypes to prevent warnings
int arm_dot(char4 a, char4 b) OVERLOADABLE;
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE;
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE;
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE;
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE;
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE;
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE;
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE;
/**
* (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w)
*/
int arm_dot(char4 a, char4 b) OVERLOADABLE CONST
{
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
return tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
}
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE CONST
{
uint4 tmp = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
return tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
}
/**
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
*/
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE CONST
{
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
return acc + tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
}
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
uint4 tmp = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
return acc + tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
}
/**
* acc + [ (a.x * b.x) + (a.y * b.y) ]
*/
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE CONST
{
int2 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
return acc + tmp.s0 + tmp.s1;
}
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE CONST
{
uint2 tmp = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
return acc + tmp.s0 + tmp.s1;
}
/**
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
*
* The final accumulation is saturating.
*/
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE CONST
{
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
return add_sat(acc, tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3);
}
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
uint4 tmp = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
return add_sat(acc, tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3);
}
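// Worked example (illustrative only): arm_dot((char4)(1, 2, 3, 4), (char4)(5, 6, 7, 8))
// = 1*5 + 2*6 + 3*7 + 4*8 = 70, while arm_dot_acc_sat(a, b, INT_MAX) with a positive
// dot product clamps the final addition to INT_MAX instead of wrapping around.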
#endif /* VC4CL_EXTENSIONS_H */

121
drivers/videocore4_stdlib/include/_float_float.h

@ -0,0 +1,121 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
/*
* Implements a float-float floating point type providing improved accuracy over float32.
*
* Algorithms and ideas taken from:
* - Guillaume Da Graça, David Defour. Implementation of float-float operators on graphics hardware. Real Numbers and
* Computers 7, Jul 2006, Nancy, France. pp.23-32. hal-00021443
* https://hal.archives-ouvertes.fr/hal-00021443 (https://hal.archives-ouvertes.fr/hal-00021443/document)
* - https://andrewthall.org/papers/df64_qf128.pdf
*/
#ifndef VC4CL_FLOAT_FLOAT_H
#define VC4CL_FLOAT_FLOAT_H
#include "_intrinsics.h"
/**
* Type for extended precision floating point values.
*
* Combining two 32-bit floats greatly increases the precision; the value range is not increased!
*
* The "real" value is the sum of the UPPER and LOWER parts.
*
* Using a native 64-bit type implicitly provides vector versions (and proper handling by the compiler)
*/
typedef ulong FloatFloat;
typedef ulong2 FloatFloat2;
typedef ulong3 FloatFloat3;
typedef ulong4 FloatFloat4;
typedef ulong8 FloatFloat8;
typedef ulong16 FloatFloat16;
SIMPLE_1(float, vc4cl_upper, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val)))
SIMPLE_1(float, vc4cl_lower, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val >> 32)))
SIMPLE_1(float, vc4cl_lossy, FloatFloat, val, vc4cl_upper(val) + vc4cl_lower(val))
COMPLEX_2(FloatFloat, vc4cl_combine, float, upper, float, lower, {
result_t upper_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(upper));
result_t lower_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(lower));
return upper_extended | (lower_extended << 32);
})
// faster version of vc4cl_combine(val, 0)
SIMPLE_1(FloatFloat, vc4cl_extend, float, val, vc4cl_int_to_ulong(vc4cl_bitcast_uint(val)))
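// Illustrative example: pi stored as a FloatFloat has upper = 0x1.921fb6p+1f (the float
// nearest to pi) and lower = pi - upper (~ -8.742278e-8f), so the pair carries roughly
// twice the mantissa bits of a single float; vc4cl_lossy() folds it back into one float.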
// TODO avoid using this, since it can overflow to Inf due to calculating val * 2^15
COMPLEX_1(FloatFloat, vc4cl_split, float, val, {
// 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
const float split = (float) (1u << 15); // TODO can be modified for precision
arg_t c = (split + 1) * val;
arg_t high = c - (c - val);
arg_t low = val - high;
return vc4cl_combine(high, low);
})
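// Illustrative note: this is Dekker's split with factor 2^15 + 1; the rounding of
// c = (2^15 + 1) * val cancels the low-order mantissa bits, so high keeps roughly the
// upper half of the mantissa, low holds the remainder, and high + low == val exactly.
// The initial multiplication is also what overflows to Inf for large val (see TODO above).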
// COMPLEX_1(FloatFloat, vc4cl_split, double, val, {
// // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
// const double split = (double) (1u << 29); // TODO can be modified for precision
// arg_t c = (split + 1) * val;
// arg_t high = c - (c - val);
// arg_t low = val - high;
// return vc4cl_combine(high, low);
// })
COMPLEX_2(FloatFloat, vc4cl_add, float, a, float, b, {
float_t s = a + b;
float_t v = s - a;
float_t e = (a - (s - v)) + (b - v);
return vc4cl_combine(s, e);
})
COMPLEX_2(FloatFloat, vc4cl_add, FloatFloat, a, FloatFloat, b, {
float_t r = vc4cl_upper(a) + vc4cl_upper(b);
float_t s0 = (((vc4cl_upper(a) - r) + vc4cl_upper(b)) + vc4cl_lower(b)) + vc4cl_lower(a);
float_t s1 = (((vc4cl_upper(b) - r) + vc4cl_upper(a)) + vc4cl_lower(a)) + vc4cl_lower(b);
float_t s = fabs(vc4cl_upper(a)) >= fabs(vc4cl_upper(b)) ? s0 : s1;
return vc4cl_add(r, s);
})
SIMPLE_2(FloatFloat, vc4cl_sub, FloatFloat, a, FloatFloat, b, vc4cl_add(a, vc4cl_combine(-vc4cl_upper(b), -vc4cl_lower(b))))
COMPLEX_2(FloatFloat, vc4cl_mul, float, a, float, b, {
float_t x = a * b;
result_t a_split = vc4cl_split(a);
result_t b_split = vc4cl_split(b);
float_t error1 = x - (vc4cl_upper(a_split) * vc4cl_upper(b_split));
float_t error2 = error1 - (vc4cl_lower(a_split) * vc4cl_upper(b_split));
float_t error3 = error2 - (vc4cl_upper(a_split) * vc4cl_lower(b_split));
float_t y = vc4cl_lower(a_split) * vc4cl_lower(b_split) - error3;
return vc4cl_combine(x, y);
})
COMPLEX_2(FloatFloat, vc4cl_mul, FloatFloat, a, FloatFloat, b, {
result_t t = vc4cl_mul(vc4cl_upper(a), vc4cl_upper(b));
float_t t1 = vc4cl_upper(a) * vc4cl_lower(b) + vc4cl_lower(a) * vc4cl_upper(b) + vc4cl_lower(t);
return vc4cl_add(vc4cl_upper(t), t1);
})
COMPLEX_2(FloatFloat, vc4cl_div, FloatFloat, a, FloatFloat, b, {
float_t xn = 1.0f / vc4cl_upper(b);
float_t yn = vc4cl_upper(a) * xn;
result_t y = vc4cl_extend(yn);
float_t diff = vc4cl_upper(vc4cl_sub(a, vc4cl_mul(b, y)));
result_t prod = vc4cl_mul(xn, diff);
return vc4cl_add(y, prod);
})
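// Illustrative note on the division above: yn = upper(a) * (1 / upper(b)) is a plain
// float estimate of the quotient; the residual a - b * y then drives one Newton-style
// correction y + (1 / upper(b)) * (a - b * y), roughly doubling the number of valid bits.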
COMPLEX_1(FloatFloat, vc4cl_sqrt, FloatFloat, a, {
float_t xn = rsqrt(vc4cl_upper(a));
float_t yn = vc4cl_upper(a) * xn;
result_t y = vc4cl_extend(yn);
result_t ynsqr = vc4cl_mul(y, y); // yn^2
float_t diff = vc4cl_upper(vc4cl_sub(a, ynsqr));
result_t prod = vc4cl_mul(xn, diff / 2);
return vc4cl_add(y, prod);
})
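// Illustrative note on the square root above: yn = upper(a) * rsqrt(upper(a)) is a plain
// float estimate of sqrt(a); one Heron-style correction y + rsqrt(upper(a)) * (a - y^2) / 2
// refines it towards full FloatFloat precision.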
#endif /* VC4CL_FLOAT_FLOAT_H */

93
drivers/videocore4_stdlib/include/_geometric.h

@ -0,0 +1,93 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_GEOMETRY_H
#define VC4CL_GEOMETRY_H
#include "_config.h"
#include "_overloads.h"
/*   ( a0 )   ( b0 )   ( a1 * b2 - a2 * b1 )
 *   ( a1 ) x ( b1 ) = ( a2 * b0 - a0 * b2 )
 *   ( a2 )   ( b2 )   ( a0 * b1 - a1 * b0 )
 */
INLINE float3 cross(float3 p0, float3 p1) OVERLOADABLE CONST
{
return (float3) (p0.y * p1.z - p0.z * p1.y, p0.z * p1.x - p0.x * p1.z, p0.x * p1.y - p0.y * p1.x);
}
INLINE float4 cross(float4 p0, float4 p1) OVERLOADABLE CONST
{
return (float4) (p0.y * p1.z - p0.z * p1.y, p0.z * p1.x - p0.x * p1.z, p0.x * p1.y - p0.y * p1.x, 0.0f);
}
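// Worked example (illustrative only): cross((float3)(1, 0, 0), (float3)(0, 1, 0))
// = (float3)(0, 0, 1), the right-handed normal of the x/y plane.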
/*   ( a0 )   ( b0 )
 *   ( a1 ) * ( b1 ) = a0 * b0 + a1 * b1 + a2 * b2
 *   ( a2 )   ( b2 )
 */
INLINE float dot(float p0, float p1) OVERLOADABLE CONST
{
return p0 * p1;
}
INLINE float dot(float2 p0, float2 p1) OVERLOADABLE CONST
{
const float2 tmp = p0 * p1;
return tmp.x + tmp.y;
}
INLINE float dot(float3 p0, float3 p1) OVERLOADABLE CONST
{
const float3 tmp = p0 * p1;
return tmp.x + tmp.y + tmp.z;
}
INLINE float dot(float4 p0, float4 p1) OVERLOADABLE CONST
{
const float4 tmp = p0 * p1;
return tmp.x + tmp.y + tmp.z + tmp.w;
}
float dot(float8 p0, float8 p1) OVERLOADABLE CONST;
float dot(float16 p0, float16 p1) OVERLOADABLE CONST;
COMPLEX_1_RETURN_SCALAR(float, length, float, p, {
float tmp = dot(p, p);
// To mitigate overflow errors for edge cases, scale large values down and small values up; this is taken from LLVM's libclc.
// E.g. since dot(x, x) calculates element-wise x^2, every exponent >= 64 overflows to Infinity and every exponent <= -64 underflows to zero!
float inputFactor = 1.0f;
float outputFactor = 1.0f;
outputFactor = tmp == INFINITY ? 0x1.0p+65f : outputFactor;
inputFactor = tmp == INFINITY ? 0x1.0p-65f : inputFactor;
outputFactor = vc4cl_is_zero(tmp) ? 0x1.0p-86f : outputFactor;
inputFactor = vc4cl_is_zero(tmp) ? 0x1.0p+86f : inputFactor;
return sqrt(dot(p * inputFactor, p * inputFactor)) * outputFactor;
})
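// Worked example (illustrative only): for p = (float3)(0x1.0p+70f, 0.0f, 0.0f), dot(p, p)
// overflows to Infinity, so the input is pre-scaled: sqrt(dot(p * 0x1.0p-65f, p * 0x1.0p-65f))
// * 0x1.0p+65f = 0x1.0p+5f * 0x1.0p+65f = 0x1.0p+70f, the exact length.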
//"Returns the distance between p0 and p1.
// This is calculated as length(p0 - p1).
SIMPLE_2_RETURN_SCALAR(float, distance, float, p0, float, p1, length(p0 - p1))
/**
* Expected behavior:
*
* normalize(v) = v if all elements of v are 0
* normalize(v) = vector of NaNs if all elements of v are NaN
* TODO special case for Inf elements
*/
SIMPLE_1(float, normalize, float, p, p / length(p))
SIMPLE_1_RETURN_SCALAR(float, fast_length, float, p, half_sqrt(dot(p, p)))
SIMPLE_2_RETURN_SCALAR(float, fast_distance, float, p0, float, p1, fast_length(p0 - p1))
SIMPLE_1(float, fast_normalize, float, p, p * half_rsqrt(dot(p, p)))
#endif /* VC4CL_GEOMETRY_H */

1016
drivers/videocore4_stdlib/include/_images.h

File diff suppressed because it is too large

233
drivers/videocore4_stdlib/include/_integer.h

@ -0,0 +1,233 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_INTEGER_H
#define VC4CL_INTEGER_H
#include "_config.h"
#include "_intrinsics.h"
#define SIMPLE_INTEGER_2(func, argName0, argName1, content) \
SIMPLE_2(uchar, func, uchar, argName0, uchar, argName1, content) \
SIMPLE_2(char, func, char, argName0, char, argName1, content) \
SIMPLE_2(ushort, func, ushort, argName0, ushort, argName1, content) \
SIMPLE_2(short, func, short, argName0, short, argName1, content) \
SIMPLE_2(uint, func, uint, argName0, uint, argName1, content) \
SIMPLE_2(int, func, int, argName0, int, argName1, content)
#define SIMPLE_INTEGER_3(func, argName0, argName1, argName2, content) \
SIMPLE_3(uchar, func, uchar, argName0, uchar, argName1, uchar, argName2, content) \
SIMPLE_3(char, func, char, argName0, char, argName1, char, argName2, content) \
SIMPLE_3(ushort, func, ushort, argName0, ushort, argName1, ushort, argName2, content) \
SIMPLE_3(short, func, short, argName0, short, argName1, short, argName2, content) \
SIMPLE_3(uint, func, uint, argName0, uint, argName1, uint, argName2, content) \
SIMPLE_3(int, func, int, argName0, int, argName1, int, argName2, content)
SIMPLE_1(uchar, abs, char, val, vc4cl_bitcast_uchar(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(uchar, abs, uchar, val, val)
SIMPLE_1(ushort, abs, short, val, vc4cl_bitcast_ushort(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(ushort, abs, ushort, val, val)
SIMPLE_1(uint, abs, int, val, vc4cl_bitcast_uint(max(val, -val)))
SIMPLE_1(uint, abs, uint, val, val)
SIMPLE_1(ulong, abs, long, val, vc4cl_bitcast_ulong(max(val, -val)))
SIMPLE_1(ulong, abs, ulong, val, val)
//based on pocl (pocl/lib/kernel/abs_diff.cl)
SIMPLE_2(uchar, abs_diff, uchar, x, uchar, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(uchar, abs_diff, char, x, char, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
result_t noflow = (result_t)abs(x - y);
result_t flow = abs(x) + abs(y);
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ushort, abs_diff, ushort, x, ushort, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(ushort, abs_diff, short, x, short, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
result_t noflow = (result_t)abs(x - y);
result_t flow = abs(x) + abs(y);
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(uint, abs_diff, uint, x, uint, y, abs(x > y ? x - y : y - x))
COMPLEX_2(uint, abs_diff, int, x, int, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
result_t noflow = abs(x - y);
result_t flow = abs(x) + abs(y);
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ulong, abs_diff, ulong, x, ulong, y, abs(x > y ? x - y : y - x))
COMPLEX_2(ulong, abs_diff, long, x, long, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
result_t noflow = abs(x - y);
result_t flow = abs(x) + abs(y);
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
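// Worked example (illustrative only): abs_diff((char) 127, (char) -128) must yield
// (uchar) 255; since the signs differ, the overflow-safe path abs(x) + abs(y)
// = 127 + 128 = 255 is selected, evaluated in the unsigned result type.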
SIMPLE_2(uchar, add_sat, uchar, x, uchar, y, vc4cl_v8adds(x, y))
SIMPLE_2(char, add_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) + vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, add_sat, ushort, x, ushort, y, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) + vc4cl_extend(y), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_2(short, add_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) + vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
//based on pocl (pocl/lib/kernel/add_sat.cl)
SIMPLE_2(uint, add_sat, uint, x, uint, y, x > ((result_t)UINT_MAX) - y ? UINT_MAX : x + y)
SIMPLE_2(int, add_sat, int, x, int, y, vc4cl_saturated_add(x, y))
//"Returns (x + y) >> 1. The intermediate sum does not modulo overflow."
SIMPLE_2(uchar, hadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(char, hadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
SIMPLE_2(ushort, hadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(short, hadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
//based on pocl (pocl/lib/kernel/hadd.cl)
SIMPLE_2(uint, hadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(int, hadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(ulong, hadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(long, hadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
//"Returns (x + y + 1) >> 1. The intermediate sum does not modulo overflow."
SIMPLE_2(uchar, rhadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(char, rhadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
SIMPLE_2(ushort, rhadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(short, rhadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
//based on pocl (pocl/lib/kernel/rhadd.cl)
SIMPLE_2(uint, rhadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(int, rhadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(ulong, rhadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(long, rhadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
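// Worked example (illustrative only): hadd(UINT_MAX, UINT_MAX) computed naively as
// (x + y) >> 1 would wrap; (x >> 1) + (y >> 1) + (x & y & 1) = 0x7FFFFFFF + 0x7FFFFFFF + 1
// = UINT_MAX without overflow. rhadd adds ((x | y) & 1) instead, rounding the half-sum up.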
SIMPLE_INTEGER_3(clamp, val, minval, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uchar, clamp, uchar, val, uchar, minval, uchar, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(char, clamp, char, val, char, minval, char, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ushort, clamp, ushort, val, ushort, minval, ushort, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(short, clamp, short, val, short, minval, short, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uint, clamp, uint, val, uint, minval, uint, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(int, clamp, int, val, int, minval, int, maxval, min(max(val, minval), maxval))
SIMPLE_3(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
SIMPLE_1(uchar, clz, uchar, x, vc4cl_bitcast_uchar(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(char, clz, char, x, vc4cl_bitcast_char(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(ushort, clz, ushort, x, vc4cl_bitcast_ushort(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(short, clz, short, x, vc4cl_bitcast_short(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(uint, clz, uint, x, vc4cl_bitcast_uint(vc4cl_clz(x)))
SIMPLE_1(int, clz, int, x, vc4cl_bitcast_int(vc4cl_clz(x)))
SIMPLE_INTEGER_3(mad_hi, x, y, z, mul_hi(x, y) + z)
SIMPLE_3(uchar, mad_sat, uchar, x, uchar, y, uchar, z, vc4cl_bitcast_uchar(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) UCHAR_MAX)))
SIMPLE_3(char, mad_sat, char, x, char, y, char, z, vc4cl_bitcast_char(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) CHAR_MIN, (int) CHAR_MAX)))
SIMPLE_3(ushort, mad_sat, ushort, x, ushort, y, ushort, z, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_3(short, mad_sat, short, x, short, y, short, z, vc4cl_bitcast_short(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) SHRT_MIN, (int) SHRT_MAX)))
SIMPLE_3(uint, mad_sat, uint, x, uint, y, uint, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_UNSIGNED) + vc4cl_int_to_ulong(z), VC4CL_UNSIGNED))
SIMPLE_3(int, mad_sat, int, x, int, y, int, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_SIGNED) + vc4cl_int_to_long(z), VC4CL_SIGNED))
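/*
 * Worked example (illustrative): uchar mad_sat(200, 2, 100) extends to 32 bits,
 * computes 200 * 2 + 100 = 500, clamps to [0, UCHAR_MAX] = 255 and truncates the
 * result back to uchar.
 */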
SIMPLE_2(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2_SCALAR(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2_SCALAR(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
COMPLEX_2(ulong, max, ulong, x, ulong, y,
{
uint_t upX = vc4cl_long_to_int(x >> 32);
uint_t upY = vc4cl_long_to_int(y >> 32);
uint_t lowX = vc4cl_long_to_int(x);
uint_t lowY = vc4cl_long_to_int(y);
/* the condition can't be used directly in the return value, since for the ?: operator the condition and the result values need to have the same type */
int_t selection = upX > upY ? 0 : (upX < upY ? 1 : (lowX > lowY ? 0 : 1));
return vc4cl_int_to_long(selection) == 0 ? x : y;
})
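/*
 * Worked example (illustrative): for x = 0x0000000100000000 and y = 0x00000000FFFFFFFF
 * the upper halves compare as 1 > 0, so selection = 0 and x is returned. The lower
 * halves only decide the result when the upper halves are equal.
 */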
SIMPLE_2_SCALAR(ulong, max, ulong, x, ulong, y, max(x, (arg0_t) y))
SIMPLE_2(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2_SCALAR(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2_SCALAR(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
COMPLEX_2(ulong, min, ulong, x, ulong, y,
{
uint_t upX = vc4cl_long_to_int(x >> 32);
uint_t upY = vc4cl_long_to_int(y >> 32);
uint_t lowX = vc4cl_long_to_int(x);
uint_t lowY = vc4cl_long_to_int(y);
/* the condition can't be used directly in the return value, since for the ?: operator the condition and the result values need to have the same type */
int_t selection = upX < upY ? 0 : (upX > upY ? 1 : (lowX < lowY ? 0 : 1));
return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, min, ulong, x, ulong, y, min(x, (arg0_t) y))
SIMPLE_2(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2(uchar, mul_hi, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 8))
SIMPLE_2(char, mul_hi, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_mul24(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED), 8)))
SIMPLE_2(ushort, mul_hi, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 16))
SIMPLE_2(short, mul_hi, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_sign_extend(x) * vc4cl_sign_extend(y), 16)))
SIMPLE_2(uint, mul_hi, uint, x, uint, y, vc4cl_mul_hi(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul_hi, int, x, int, y, vc4cl_mul_hi(x, y, VC4CL_SIGNED))
//Since the rotation is over all 32 bits, for smaller types we need to replicate the value, rotate it, and truncate/sign-extend the result afterwards
SIMPLE_2(uchar, rotate, uchar, x, uchar, y, vc4cl_pack_lsb(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(char, rotate, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_extend(y)), 24)))
SIMPLE_2(ushort, rotate, ushort, x, ushort, y, vc4cl_pack_truncate(vc4cl_ror(vc4cl_zero_extend(x) | (vc4cl_zero_extend(x) << 16), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(short, rotate, short, x, short, y, vc4cl_bitcast_short(vc4cl_extend(vc4cl_bitcast_short(vc4cl_ror((vc4cl_sign_extend(x) & (int) 0xFFFF) | (vc4cl_sign_extend(x) << 16), -vc4cl_sign_extend(y))))))
SIMPLE_2(uint, rotate, uint, x, uint, y, vc4cl_bitcast_uint(vc4cl_ror(x, -vc4cl_bitcast_int(y))))
SIMPLE_2(int, rotate, int, x, int, y, vc4cl_bitcast_int(vc4cl_ror(x, -y)))
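/*
 * Worked example (illustrative): rotate((uchar) 0x81, (uchar) 1) replicates the byte
 * to 0x81818181, rotates left by 1 (vc4cl_ror with the negated offset) to 0x03030303
 * and packs the LSB back to 0x03, the expected 8-bit rotation of 0x81 by 1.
 */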
SIMPLE_2(uchar, sub_sat, uchar, x, uchar, y, vc4cl_v8subs(x, y))
SIMPLE_2(char, sub_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) - vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, sub_sat, ushort, x, ushort, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(short, sub_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) - vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
//based on pocl (pocl/lib/kernel/sub_sat.cl)
SIMPLE_2(uint, sub_sat, uint, x, uint, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(int, sub_sat, int, x, int, y, vc4cl_saturated_sub(x, y))
SIMPLE_2(short, upsample, char, hi, uchar, lo, vc4cl_bitcast_short((vc4cl_sign_extend(hi) << 8) | vc4cl_bitcast_int(vc4cl_zero_extend(lo))))
SIMPLE_2(ushort, upsample, uchar, hi, uchar, lo, vc4cl_bitcast_ushort((vc4cl_zero_extend(hi) << 8) | vc4cl_zero_extend(lo)))
SIMPLE_2(int, upsample, short, hi, ushort, lo, (vc4cl_sign_extend(hi) << 16) | vc4cl_bitcast_int(vc4cl_zero_extend(lo)))
SIMPLE_2(uint, upsample, ushort, hi, ushort, lo, (vc4cl_zero_extend(hi) << 16) | vc4cl_zero_extend(lo))
SIMPLE_2(long, upsample, int, hi, uint, lo, (vc4cl_int_to_long(hi) << 32) | vc4cl_bitcast_long(vc4cl_int_to_ulong(lo)))
SIMPLE_2(ulong, upsample, uint, hi, uint, lo, (vc4cl_int_to_ulong(hi) << 32) | vc4cl_int_to_ulong(lo))
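/*
 * Worked example (illustrative): upsample((uchar) 0x12, (uchar) 0x34) == (ushort) 0x1234
 * and upsample((ushort) 0x1234, (ushort) 0x5678) == 0x12345678u: hi is shifted into
 * the upper half, lo is zero-extended into the lower half.
 */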
//" Returns the number of non-zero bits in x. "
SIMPLE_1(uchar, popcount, uchar, val, vc4cl_popcount(val))
SIMPLE_1(char, popcount, char, val, vc4cl_popcount(val))
SIMPLE_1(ushort, popcount, ushort, val, vc4cl_popcount(val))
SIMPLE_1(short, popcount, short, val, vc4cl_popcount(val))
SIMPLE_1(uint, popcount, uint, val, vc4cl_popcount(val))
SIMPLE_1(int, popcount, int, val, vc4cl_popcount(val))
SIMPLE_1(ulong, popcount, ulong, val, vc4cl_popcount(val))
SIMPLE_1(long, popcount, long, val, vc4cl_popcount(val))
SIMPLE_2(uchar, mul24, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(char, mul24, char, x, char, y, vc4cl_bitcast_char(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(ushort, mul24, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(short, mul24, short, x, short, y, vc4cl_bitcast_short(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(uint, mul24, uint, x, uint, y, vc4cl_mul24(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul24, int, x, int, y, vc4cl_mul24(x, y, VC4CL_SIGNED))
SIMPLE_INTEGER_3(mad24, a, b, c, mul24(a, b) + c)
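/*
 * Note (illustrative): mul24/mad24 are only exact if the operands fit into 24 bits,
 * e.g. mul24(300, 1000) = 300000 is exact, while operands outside [0, 2^24 - 1]
 * (unsigned) or [-2^23, 2^23 - 1] (signed) may be truncated by the hardware multiplier.
 */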
#undef SIMPLE_INTEGER_2
#undef SIMPLE_INTEGER_3
#endif /* VC4CL_INTEGER_H */

436
drivers/videocore4_stdlib/include/_intrinsics.h

@@ -0,0 +1,436 @@
/* Declares interfaces for all intrinsic functions
*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_INTRINSICS_H
#define VC4CL_INTRINSICS_H
#include "_overloads.h"
#define VC4CL_SIGNED 0
#define VC4CL_UNSIGNED 1
/*
* ALU operations
*
* NOTE: These operations directly map to the machine instructions and do not
* heed other data-types (e.g. vc4cl_clz will always return the leading zeroes to
* full 32-bit width)
*/
OVERLOAD_2(float, vc4cl_fmax, float, x, float, y)
OVERLOAD_2(float, vc4cl_fmin, float, x, float, y)
OVERLOAD_2(float, vc4cl_fmaxabs, float, x, float, y)
OVERLOAD_2(float, vc4cl_fminabs, float, x, float, y)
OVERLOAD_1(int, vc4cl_ftoi, float, val)
OVERLOAD_1(float, vc4cl_itof, int, val)
OVERLOAD_2(int, vc4cl_asr, uint, val, int, offset)
OVERLOAD_2(int, vc4cl_asr, int, val, int, offset)
OVERLOAD_2(uint, vc4cl_ror, uint, val, int, offset)
OVERLOAD_2(int, vc4cl_ror, int, val, int, offset)
OVERLOAD_3_SCALAR(int, vc4cl_min, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_max, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(long, vc4cl_min, long, x, long, y, uchar, sign)
OVERLOAD_3_SCALAR(long, vc4cl_max, long, x, long, y, uchar, sign)
OVERLOAD_2(uint, vc4cl_and, uchar, x, uchar, y)
OVERLOAD_2(int, vc4cl_and, char, x, char, y)
OVERLOAD_2(uint, vc4cl_and, ushort, x, ushort, y)
OVERLOAD_2(int, vc4cl_and, short, x, short, y)
SIMPLE_2(uint, vc4cl_and, uint, x, uint, y, x & y)
SIMPLE_2(int, vc4cl_and, int, x, int, y, x & y)
OVERLOAD_1(uint, vc4cl_clz, uint, val)
OVERLOAD_1(int, vc4cl_clz, int, val)
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uchar, x, uchar, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_mul24, char, x, char, y, uchar, sign)
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, ushort, x, ushort, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_mul24, short, x, short, y, uchar, sign)
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uint, x, uint, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_mul24, int, x, int, y, uchar, sign)
OVERLOAD_2(uchar, vc4cl_v8adds, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8adds, uint, x, uint, y)
OVERLOAD_2(uchar, vc4cl_v8subs, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8subs, uint, x, uint, y)
OVERLOAD_2(uchar, vc4cl_v8min, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8min, uint, x, uint, y)
OVERLOAD_2(uchar, vc4cl_v8max, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8max, uint, x, uint, y)
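/*
 * Worked example (illustrative): the v8 operations treat a 32-bit value as four
 * unsigned bytes. vc4cl_v8adds(0xFF01FE02, 0x02030405) saturates per byte:
 * FF+02 -> FF, 01+03 -> 04, FE+04 -> FF, 02+05 -> 07, giving 0xFF04FF07.
 */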
/*
* Pack/unpack modes
*/
//TODO ALU needs to consume float for this to work
//unpacks half to float (UNPACK 1: 16a -> 32)
//OVERLOAD_1(float, vc4cl_unpack_half, half, val)
//sign-extends short to int (UNPACK 1: 16a -> 32)
OVERLOAD_1(int, vc4cl_unpack_sext, short, val)
//unpacks first byte [0, 1] to float (UNPACK 4: 8a -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte0, uchar, val)
//unpacks second byte [0, 1] to float (UNPACK 5: 8b -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte1, uchar, val)
//unpacks third byte [0, 1] to float (UNPACK 6: 8c -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte2, uchar, val)
//unpacks fourth byte [0, 1] to float (UNPACK 7: 8d -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte3, uchar, val)
//zero-extend first byte to uint (UNPACK 4: 8a -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte0, uchar, val)
//zero-extend second byte to uint (UNPACK 5: 8b -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte1, uchar, val)
//zero-extend third byte to uint (UNPACK 6: 8c -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte2, uchar, val)
//zero-extend fourth byte to uint (UNPACK 7: 8d -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte3, uchar, val)
//TODO ALU needs to consume float for this to work
//packs float into half (PACK 1: 32 -> 16a)
//OVERLOAD_1(half, vc4cl_pack_half, float, val)
//converts to unsigned 16-bit integer, truncates the result (PACK 1: 32 -> 16a)
OVERLOAD_1(ushort, vc4cl_pack_truncate, int, val)
OVERLOAD_1(ushort, vc4cl_pack_truncate, uint, val)
//replicates the LSB into all four bytes (PACK 3: 32 -> 8888)
OVERLOAD_1(uint, vc4cl_replicate_lsb, char, val)
OVERLOAD_1(uint, vc4cl_replicate_lsb, uchar, val)
OVERLOAD_1(uint, vc4cl_replicate_lsb, uint, val)
//takes the LSB and writes it into LSB (PACK 4: 32 -> 8a)
OVERLOAD_1(uchar, vc4cl_pack_lsb, char, val)
OVERLOAD_1(uchar, vc4cl_pack_lsb, uchar, val)
OVERLOAD_1(uchar, vc4cl_pack_lsb, uint, val)
//calculates the addition and saturates the result afterwards (depending on signed integer over-/underflow of the addition) (uses PACK 8: 32 -> 32)
OVERLOAD_2(int, vc4cl_saturated_add, int, x, int, y)
//NOTE: Since the 32 -> 32 saturation pack mode works differently for sub, the intrinsic is implemented differently than saturated_add
OVERLOAD_2(int, vc4cl_saturated_sub, int, x, int, y)
//saturates to unsigned byte (PACK 12: 32 -> 8a)
OVERLOAD_1(uchar, vc4cl_saturate_lsb, uint, val)
/*
* SFU calls
*/
OVERLOAD_1(float, vc4cl_sfu_recip, float, val)
OVERLOAD_1(float, vc4cl_sfu_rsqrt, float, val)
OVERLOAD_1(float, vc4cl_sfu_log2, float, val)
OVERLOAD_1(float, vc4cl_sfu_exp2, float, val)
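/*
 * Note (assumption based on the VideoCore IV documentation): the SFU only returns
 * reduced-precision estimates. A sketch of the usual refinement for the reciprocal,
 * one Newton-Raphson step:
 *
 *   float r = vc4cl_sfu_recip(x);
 *   r = r * (2.0f - x * r); // each step roughly doubles the number of correct bits
 */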
/*
* Periphery access
*/
void vc4cl_mutex_lock(void);
void vc4cl_mutex_unlock(void);
//read DMA without locking the mutex
OVERLOAD_1(int, vc4cl_dma_read, volatile __global int, * ptr)
OVERLOAD_1(uint, vc4cl_dma_read, volatile __global uint, * ptr)
OVERLOAD_1(float, vc4cl_dma_read, volatile __global float, * ptr)
OVERLOAD_1(int, vc4cl_dma_read, volatile __local int, * ptr)
OVERLOAD_1(uint, vc4cl_dma_read, volatile __local uint, * ptr)
OVERLOAD_1(float, vc4cl_dma_read, volatile __local float, * ptr)
//write DMA without locking the mutex
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global int, * ptr, int, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global uint, * ptr, uint, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global float, * ptr, float, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local int, * ptr, int, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local uint, * ptr, uint, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local float, * ptr, float, val)
//copy DMA without locking the mutex
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uchar, *dest, const __local uchar, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global char, *dest, const __local char, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global ushort, *dest, const __local ushort, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global short, *dest, const __local short, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uint, *dest, const __local uint, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global int, *dest, const __local int, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global float, *dest, const __local float, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uchar, *dest, const __global uchar, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local char, *dest, const __global char, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local ushort, *dest, const __global ushort, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local short, *dest, const __global short, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uint, *dest, const __global uint, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local int, *dest, const __global int, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local float, *dest, const __global float, *src, size_t, num_elements)
//load into VPM without locking the mutex
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uchar, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global char, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global ushort, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global short, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uint, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global int, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global float, *ptr, size_t, num_elements)
// special handling of 3-element load/store, since LLVM (compliant with the OpenCL standard) by default generates 4-element load/store
char3 vc4cl_vload3(const __global char* ptr) OVERLOADABLE;
char3 vc4cl_vload3(const __local char* ptr) OVERLOADABLE;
char3 vc4cl_vload3(const __private char* ptr) OVERLOADABLE;
char3 vc4cl_vload3(const __constant char* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __global uchar* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __local uchar* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __private uchar* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __constant uchar* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __global short* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __local short* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __private short* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __constant short* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __global ushort* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __local ushort* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __private ushort* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __constant ushort* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __global int* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __local int* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __private int* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __constant int* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __global uint* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __local uint* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __private uint* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __constant uint* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __global float* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __local float* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __private float* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __constant float* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __global long* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __local long* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __private long* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __constant long* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __global ulong* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __local ulong* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __private ulong* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __constant ulong* ptr) OVERLOADABLE;
void vc4cl_vstore3(__global char* ptr, char3 val) OVERLOADABLE;
void vc4cl_vstore3(__local char* ptr, char3 val) OVERLOADABLE;
void vc4cl_vstore3(__private char* ptr, char3 val) OVERLOADABLE;
void vc4cl_vstore3(__global uchar* ptr, uchar3 val) OVERLOADABLE;
void vc4cl_vstore3(__local uchar* ptr, uchar3 val) OVERLOADABLE;
void vc4cl_vstore3(__private uchar* ptr, uchar3 val) OVERLOADABLE;
void vc4cl_vstore3(__global short* ptr, short3 val) OVERLOADABLE;
void vc4cl_vstore3(__local short* ptr, short3 val) OVERLOADABLE;
void vc4cl_vstore3(__private short* ptr, short3 val) OVERLOADABLE;
void vc4cl_vstore3(__global ushort* ptr, ushort3 val) OVERLOADABLE;
void vc4cl_vstore3(__local ushort* ptr, ushort3 val) OVERLOADABLE;
void vc4cl_vstore3(__private ushort* ptr, ushort3 val) OVERLOADABLE;
void vc4cl_vstore3(__global int* ptr, int3 val) OVERLOADABLE;
void vc4cl_vstore3(__local int* ptr, int3 val) OVERLOADABLE;
void vc4cl_vstore3(__private int* ptr, int3 val) OVERLOADABLE;
void vc4cl_vstore3(__global uint* ptr, uint3 val) OVERLOADABLE;
void vc4cl_vstore3(__local uint* ptr, uint3 val) OVERLOADABLE;
void vc4cl_vstore3(__private uint* ptr, uint3 val) OVERLOADABLE;
void vc4cl_vstore3(__global float* ptr, float3 val) OVERLOADABLE;
void vc4cl_vstore3(__local float* ptr, float3 val) OVERLOADABLE;
void vc4cl_vstore3(__private float* ptr, float3 val) OVERLOADABLE;
void vc4cl_vstore3(__global long* ptr, long3 val) OVERLOADABLE;
void vc4cl_vstore3(__local long* ptr, long3 val) OVERLOADABLE;
void vc4cl_vstore3(__private long* ptr, long3 val) OVERLOADABLE;
void vc4cl_vstore3(__global ulong* ptr, ulong3 val) OVERLOADABLE;
void vc4cl_vstore3(__local ulong* ptr, ulong3 val) OVERLOADABLE;
void vc4cl_vstore3(__private ulong* ptr, ulong3 val) OVERLOADABLE;
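/*
 * Usage sketch (illustrative): the compiler lowers 3-element vector accesses to these
 * helpers so that only the three elements are actually touched, e.g.
 *
 *   float3 v = vc4cl_vload3(ptr);  // reads exactly ptr[0..2]
 *   vc4cl_vstore3(ptr, v);         // writes exactly ptr[0..2]
 *
 * whereas a widened 4-element load/store could read or clobber the adjacent element.
 */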
/*
* Work-item functions
* Mapped to UNIFORM reads
*
* local values are stored in a UNIFORM in this fashion:
* | 0 | dim2 | dim1 | dim0 |
* -> to read value of dimension x, calculate: (UNIFORM >> (dim * 8)) & 0xFF
*
* The values can be packed this way because the maximum local size is 12, so each local ID and size fits into a single byte
*/
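/*
 * Worked example (illustrative, with a hypothetical packed word): for
 * UNIFORM = 0x000C0301 the layout | 0 | dim2 | dim1 | dim0 | decodes as
 *   (0x000C0301 >> (0 * 8)) & 0xFF == 0x01 (dimension 0)
 *   (0x000C0301 >> (1 * 8)) & 0xFF == 0x03 (dimension 1)
 *   (0x000C0301 >> (2 * 8)) & 0xFF == 0x0C (dimension 2)
 */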
PURE uchar vc4cl_work_dimensions(void);
PURE uchar vc4cl_local_size(uint dim);
PURE uchar vc4cl_local_id(uint dim);
PURE uint vc4cl_num_groups(uint dim);
PURE uint vc4cl_group_id(uint dim);
PURE uint vc4cl_global_offset(uint dim);
PURE uint vc4cl_global_size(uint dim);
PURE uint vc4cl_global_id(uint dim);
PURE uchar vc4cl_local_linear_id(void);
PURE uint vc4cl_global_linear_id(void);
/*
* Image functions
* In Clang, read_only and write_only image types are separate types.
* Also in Clang, OpenCL image types are built-in opaque types.
*/
#ifdef __IMAGE_SUPPORT__
/*
* Texture Config Parameter 0
* Broadcom specification, table 15
*
* 0 - 3 | 4 bits | Number of mipmap levels minus 1
* 4 - 7 | 4 bits | texture data type (high bit is on config parameter 1)
* 8 | 1 bit | flip texture Y axis
* 9 | 1 bit | cube map mode
* 10 - 11 | 2 bits | cache swizzle
* 12 - 31 | 20 bits | texture base pointer (multiple of 4KB)
*/
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_basic_setup)
/*
* Texture Config Parameter 1
* Broadcom specification, table 16
*
* 0 - 1 | 2 bits | S (x-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border)
* 2 - 3 | 2 bits | T (y-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border)
* 4 - 6 | 3 bits | minification filter (interpolation)
* 7 | 1 bit | magnification filter
* 8 - 18 | 11 bits | image width (0 = 2048)
* 19 | 1 bit | flip ETC Y (per block)
* 20 - 30 | 11 bits | image height (0 = 2048)
* 31 | 1 bit | high bit of texture type
*/
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_access_setup)
/*
* Texture Config Parameters 2 and 3
* Broadcom specification, table 17
*
* Cube map stride:
* 0 | 1 bit | disable automatic LOD, use bias only
* 12 - 29 | 18 bits | cube map stride (in multiples of 4KB)
* 30 - 31 | 2 bits | value 1 for cube map stride
*
* Child image dimensions:
* 0 - 10 | 11 bits | child image width (0 = 2048, does not work, see errata HW-2753)
* 12 - 22 | 11 bits | child image height (0 = 2048, does not work, see errata HW-2753)
* 30 - 31 | 2 bits | value 2 for child image dimensions
*
* Child image offsets:
* 0 - 10 | 11 bits | child image X offset
* 12 - 22 | 11 bits | child image Y offset
* 30 - 31 | 2 bits | value 3 for child image offsets
*/
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_extended_setup)
/*
* To apply a sampler to an image, we need to override the image-access setup UNIFORM before a read, setting the magnification/minification filters and wrap modes to use
*/
OVERLOAD_ALL_IMAGE_TYPES_1(void, vc4cl_set_image_access_setup, uint, val)
CONST uint vc4cl_sampler_get_normalized_coords(sampler_t sampler);
CONST uint vc4cl_sampler_get_addressing_mode(sampler_t sampler);
CONST uint vc4cl_sampler_get_filter_mode(sampler_t sampler);
/*
* Image read functions
*
* The coordinates need to be floating-point values in the range [0, 1] and are scaled to the width/height of the image.
* The returned data is not necessarily <4 x int32>, but up to 4 components with up to 32 bits each, loaded according to the byte sizes and number of components given by channel_type_size and channel_order_size.
*
* So these functions return the data in its native format (as stored in the image buffer), but correctly distributed across the 4 components.
*/
int4 vc4cl_image_read(read_only image1d_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image1d_buffer_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image1d_array_t image, float coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image2d_t image, float2 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image2d_array_t image, float2 coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image3d_t image, float4 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
#endif
/*
* Type conversions
*/
// TODO use __builtin_convertvector ?? https://clang.llvm.org/docs/LanguageExtensions.html#builtin-convertvector
// check available on all compiler versions, generated LLVM IR code!
//component-wise bitcasts
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, uint, val)
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, int, val)
OVERLOAD_1(char, vc4cl_bitcast_char, uint, val)
OVERLOAD_1(char, vc4cl_bitcast_char, int, val)
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, uint, val)
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, int, val)
OVERLOAD_1(short, vc4cl_bitcast_short, uint, val)
OVERLOAD_1(short, vc4cl_bitcast_short, int, val)
SIMPLE_1(uint, vc4cl_bitcast_uint, uint, val, val)
OVERLOAD_1(uint, vc4cl_bitcast_uint, int, val)
OVERLOAD_1(int, vc4cl_bitcast_int, uint, val)
SIMPLE_1(int, vc4cl_bitcast_int, int, val, val)
OVERLOAD_1(uint, vc4cl_bitcast_uint, float, val)
OVERLOAD_1(float, vc4cl_bitcast_float, uint, val)
OVERLOAD_1(int, vc4cl_bitcast_int, float, val)
OVERLOAD_1(float, vc4cl_bitcast_float, int, val)
SIMPLE_1(int, vc4cl_sign_extend, char, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFF) << 24, 24))
//SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFFFF) << 16, 16))
SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_unpack_sext(val))
SIMPLE_1(uint, vc4cl_zero_extend, uchar, val, vc4cl_and(val, (arg_t) (0xFFU)))
SIMPLE_1(uint, vc4cl_zero_extend, ushort, val, vc4cl_and(val, (arg_t) (0xFFFFU)))
SIMPLE_1(uint, vc4cl_extend, uchar, val, vc4cl_zero_extend(val))
SIMPLE_1(int, vc4cl_extend, char, val, vc4cl_sign_extend(val))
SIMPLE_1(uint, vc4cl_extend, ushort, val, vc4cl_zero_extend(val))
SIMPLE_1(int, vc4cl_extend, short, val, vc4cl_sign_extend(val))
SIMPLE_1(uint, vc4cl_extend, uint, val, val)
SIMPLE_1(int, vc4cl_extend, int, val, val)
SIMPLE_1(ulong, vc4cl_extend, ulong, val, val)
SIMPLE_1(long, vc4cl_extend, long, val, val)
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, long, val)
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, ulong, val)
OVERLOAD_1(long, vc4cl_bitcast_long, ulong, val)
OVERLOAD_1(long, vc4cl_bitcast_long, long, val)
OVERLOAD_1(uint, vc4cl_long_to_int, ulong, val)
OVERLOAD_1(int, vc4cl_long_to_int, long, val)
OVERLOAD_1(ulong, vc4cl_int_to_ulong, uint, val)
OVERLOAD_1(long, vc4cl_int_to_long, int, val)
SIMPLE_1(ulong, vc4cl_extend_to_long, uint, val, vc4cl_int_to_ulong(val))
SIMPLE_1(long, vc4cl_extend_to_long, int, val, vc4cl_int_to_long(val))
OVERLOAD_2_SCALAR(int, vc4cl_long_to_int_sat, long, val, uchar, sign)
OVERLOAD_2_SCALAR(uint, vc4cl_long_to_int_sat, ulong, val, uchar, sign)
OVERLOAD_1(float, vc4cl_long_to_float, long, val)
OVERLOAD_1(float, vc4cl_ulong_to_float, ulong, val)
/*
* Other functions
*/
SIMPLE_1(uchar, vc4cl_msb_set, uchar, val, vc4cl_bitcast_uchar(vc4cl_extend(val >> 7 == (arg_t)1)))
SIMPLE_1(char, vc4cl_msb_set, char, val, vc4cl_bitcast_char(vc4cl_and((arg_t)(val >> 7), (arg_t)1)) == (arg_t)1)
SIMPLE_1(ushort, vc4cl_msb_set, ushort, val, vc4cl_bitcast_ushort(vc4cl_extend(val >> 15 == (arg_t)1)))
SIMPLE_1(short, vc4cl_msb_set, short, val, vc4cl_bitcast_short(vc4cl_and((arg_t)(val >> 15), (arg_t)1)) == (arg_t)1)
SIMPLE_1(uint, vc4cl_msb_set, uint, val, vc4cl_bitcast_uint(val >> 31 == 1))
SIMPLE_1(int, vc4cl_msb_set, int, val, (val < (arg_t)0))
SIMPLE_1(long, vc4cl_msb_set, ulong, val, (val >> 63 == 1))
SIMPLE_1(long, vc4cl_msb_set, long, val, (val < (arg_t)0))
OVERLOAD_1(int, vc4cl_is_nan, float, val)
OVERLOAD_1(int, vc4cl_is_inf_nan, float, val)
OVERLOAD_1(int, vc4cl_is_zero, float, val)
OVERLOAD_3_SCALAR(int, vc4cl_mul_hi, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(uint, vc4cl_mul_hi, uint, x, uint, y, uchar, sign)
OVERLOAD_3_SCALAR(long, vc4cl_mul_full, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(ulong, vc4cl_mul_full, uint, x, uint, y, uchar, sign)
OVERLOAD_1(uchar, vc4cl_popcount, uchar, val)
OVERLOAD_1(char, vc4cl_popcount, char, val)
OVERLOAD_1(ushort, vc4cl_popcount, ushort, val)
OVERLOAD_1(short, vc4cl_popcount, short, val)
OVERLOAD_1(uint, vc4cl_popcount, uint, val)
OVERLOAD_1(int, vc4cl_popcount, int, val)
OVERLOAD_1(ulong, vc4cl_popcount, ulong, val)
OVERLOAD_1(long, vc4cl_popcount, long, val)
event_t vc4cl_set_event(event_t ev) CONST;
void vc4cl_barrier(cl_mem_fence_flags);
/*
* Vector functions
*/
//Rotates the vector elements according to the offset (-15 .. +15)
//an offset of 5 means rotate up 5 positions (e.g. x.s0 -> y.s5, x.s10 -> y.s15, x.s12 -> y.s1)
//NOTE: the rotation is always across all 16 elements! Functions with a vector size of less than 16 MUST NOT use the positions shifted in from the remaining vector elements
OVERLOAD_2_SCALAR(uchar, vc4cl_vector_rotate, uchar, val, char, offset)
OVERLOAD_2_SCALAR(char, vc4cl_vector_rotate, char, val, char, offset)
OVERLOAD_2_SCALAR(ushort, vc4cl_vector_rotate, ushort, val, char, offset)
OVERLOAD_2_SCALAR(short, vc4cl_vector_rotate, short, val, char, offset)
OVERLOAD_2_SCALAR(uint, vc4cl_vector_rotate, uint, val, char, offset)
OVERLOAD_2_SCALAR(int, vc4cl_vector_rotate, int, val, char, offset)
OVERLOAD_2_SCALAR(ulong, vc4cl_vector_rotate, ulong, val, char, offset)
OVERLOAD_2_SCALAR(long, vc4cl_vector_rotate, long, val, char, offset)
OVERLOAD_2_SCALAR(float, vc4cl_vector_rotate, float, val, char, offset)
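/*
 * Worked example (illustrative): for a uchar16 x, y = vc4cl_vector_rotate(x, 5) gives
 * y.s5 == x.s0 and y.s0 == x.s11 (the rotation is modulo 16). For vectors with fewer
 * than 16 elements, the lanes rotated in from the unused upper positions are
 * undefined and must be ignored.
 */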
/*
* For debugging purposes
*/
//The vector element number (0 .. 15)
CONST uchar16 vc4cl_element_number(void);
//the ID of the QPU (the processor)
CONST uchar vc4cl_qpu_number(void);
#endif /* VC4CL_INTRINSICS_H */

1666
drivers/videocore4_stdlib/include/_math.h

File diff suppressed because it is too large

819
drivers/videocore4_stdlib/include/_overloads.h

@@ -0,0 +1,819 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_OVERLOADS_H
#define VC4CL_OVERLOADS_H
#include "_config.h"
#ifndef OVERLOADABLE
#define OVERLOADABLE __attribute__((overloadable))
#endif
/*
* "__attribute__((const)) function attribute
* Many functions examine only the arguments passed to them, and have no effects except for the return value.
* This is a much stricter class than __attribute__((pure)), because a function is not permitted to read global memory.
* If a function is known to operate only on its arguments then it can be subject to common sub-expression elimination and loop optimizations."
*
* http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/Cacgigch.html
*/
#ifndef CONST
#define CONST __attribute__((const)) //tells the compiler that the result won't change
#endif
/*
* "__attribute__((pure)) function attribute
* Many functions have no effects except to return a value, and their return value depends only on the parameters and global variables.
* Functions of this kind can be subject to data flow analysis and might be eliminated."
*
* http://infocenter.arm.com/help/topic/com.arm.doc.dui0491c/Cacigdac.html
*/
#define PURE __attribute__((pure))
#define INLINE __attribute__((always_inline)) __attribute__((flatten)) inline //flatten inlines all calls within this function
#define FUNC_1(ret, func, argType, argName) ret func(argType argName) OVERLOADABLE
#ifndef OVERLOAD_1
#define OVERLOAD_1(ret, func, argType, argName) \
FUNC_1(ret##16, func, argType##16, argName); \
FUNC_1(ret##8, func, argType##8, argName); \
FUNC_1(ret##4, func, argType##4, argName); \
FUNC_1(ret##3, func, argType##3, argName); \
FUNC_1(ret##2, func, argType##2, argName); \
FUNC_1(ret, func, argType, argName);
#endif
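/*
 * For illustration (a sketch of the generated declarations, not an addition to the
 * API): OVERLOAD_1(int, vc4cl_ftoi, float, val) expands to
 *
 *   int16 vc4cl_ftoi(float16 val) OVERLOADABLE;
 *   ...down through the 8-, 4-, 3- and 2-element variants...
 *   int vc4cl_ftoi(float val) OVERLOADABLE;
 */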
#ifndef OVERLOAD_1_RETURN_SCALAR
#define OVERLOAD_1_RETURN_SCALAR(ret, func, argType, argName) \
FUNC_1(ret, func, argType##16, argName); \
FUNC_1(ret, func, argType##8, argName); \
FUNC_1(ret, func, argType##4, argName); \
FUNC_1(ret, func, argType##3, argName); \
FUNC_1(ret, func, argType##2, argName); \
FUNC_1(ret, func, argType, argName);
#endif
#define FUNC_2(ret, func, argType0, argName0, argType1, argName1) ret func(argType0 argName0, argType1 argName1) OVERLOADABLE
#ifndef OVERLOAD_2
#define OVERLOAD_2(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1); \
FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1); \
FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1); \
FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1); \
FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif
#ifndef OVERLOAD_2_SCALAR
#define OVERLOAD_2_SCALAR(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1); \
FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1); \
FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1); \
FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1); \
FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif
#ifndef OVERLOAD_2_RETURN_SCALAR
#define OVERLOAD_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1); \
FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1); \
FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1); \
FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1); \
FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif
#ifndef OVERLOAD_2_SCALAR_RETURN_SCALAR
#define OVERLOAD_2_SCALAR_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret, func, argType0##16, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##8, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##4, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##3, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##2, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif
#define FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) ret func(argType0 argName0, argType1 argName1, argType2 argName2) OVERLOADABLE
#ifndef OVERLOAD_3
#define OVERLOAD_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2); \
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2); \
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2); \
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2); \
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2); \
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2);
#endif
#ifndef OVERLOAD_3_SCALAR
#define OVERLOAD_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2); \
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2); \
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2); \
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2); \
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2); \
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2);
#endif
#define FUNC_4(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3) OVERLOADABLE
#define FUNC_5(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3, arg4Type, arg4Name) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3, arg4Type arg4Name) OVERLOADABLE
#ifndef SIMPLE_1
#define SIMPLE_1(ret, func, argType, argName, content) \
INLINE FUNC_1(ret##16, func, argType##16, argName) \
{ \
typedef argType##16 arg_t;\
typedef ret##16 result_t;\
return content; \
} \
INLINE FUNC_1(ret##8, func, argType##8, argName) \
{ \
typedef argType##8 arg_t;\
typedef ret##8 result_t;\
return content; \
} \
INLINE FUNC_1(ret##4, func, argType##4, argName) \
{ \
typedef argType##4 arg_t;\
typedef ret##4 result_t;\
return content; \
} \
INLINE FUNC_1(ret##3, func, argType##3, argName) \
{ \
typedef argType##3 arg_t;\
typedef ret##3 result_t;\
return content; \
} \
INLINE FUNC_1(ret##2, func, argType##2, argName) \
{ \
typedef argType##2 arg_t;\
typedef ret##2 result_t;\
return content; \
} \
INLINE FUNC_1(ret, func, argType, argName) \
{ \
typedef argType arg_t;\
typedef ret result_t;\
return content; \
}
#endif
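/*
 * Usage sketch (illustrative): SIMPLE_1 states the body once and re-types it per
 * vector width via the arg_t/result_t typedefs. E.g. the declaration
 *
 *   SIMPLE_1(uint, vc4cl_zero_extend, uchar, val, vc4cl_and(val, (arg_t) (0xFFU)))
 *
 * defines inline overloads from uchar16 -> uint16 down to uchar -> uint, with arg_t
 * and result_t bound to the matching width inside each generated function body.
 */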
#ifndef SIMPLE_1_RETURN_SCALAR
#define SIMPLE_1_RETURN_SCALAR(ret, func, argType, argName, content) \
INLINE FUNC_1(ret, func, argType##16, argName) \
{ \
typedef argType##16 arg_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_1(ret, func, argType##8, argName) \
{ \
typedef argType##8 arg_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_1(ret, func, argType##4, argName) \
{ \
typedef argType##4 arg_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_1(ret, func, argType##3, argName) \
{ \
typedef argType##3 arg_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_1(ret, func, argType##2, argName) \
{ \
typedef argType##2 arg_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_1(ret, func, argType, argName) \
{ \
typedef argType arg_t;\
typedef ret result_t;\
return content; \
}
#endif
#ifndef SIMPLE_2
#define SIMPLE_2(ret, func, argType0, argName0, argType1, argName1, content) \
INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef ret##16 result_t;\
return content; \
} \
INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef ret##8 result_t;\
return content; \
} \
INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef ret##4 result_t;\
return content; \
} \
INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef ret##3 result_t;\
return content; \
} \
INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef ret##2 result_t;\
return content; \
} \
INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
{ \
typedef argType0 arg0_t;\
typedef argType1 arg1_t;\
typedef ret result_t;\
return content; \
}
#endif
#ifndef SIMPLE_2_RETURN_SCALAR
#define SIMPLE_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
INLINE FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef ret result_t;\
return content; \
} \
INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
{ \
typedef argType0 arg0_t;\
typedef argType1 arg1_t;\
typedef ret result_t;\
return content; \
}
#endif
#ifndef SIMPLE_2_SCALAR
#define SIMPLE_2_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1 arg1_t;\
typedef ret##16 result_t;\
return content; \
} \
INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1 arg1_t;\
typedef ret##8 result_t;\
return content; \
} \
INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1 arg1_t;\
typedef ret##4 result_t;\
return content; \
} \
INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1 arg1_t;\
typedef ret##3 result_t;\
return content; \
} \
INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1 arg1_t;\
typedef ret##2 result_t;\
return content; \
} \
//the scalar version is skipped, since it is often already defined elsewhere, e.g. by a variant taking two vectors
#endif
#ifndef SIMPLE_3
#define SIMPLE_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef argType2##16 arg2_t;\
typedef ret##16 result_t;\
return content; \
} \
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef argType2##8 arg2_t;\
typedef ret##8 result_t;\
return content; \
} \
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef argType2##4 arg2_t;\
typedef ret##4 result_t;\
return content; \
} \
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef argType2##3 arg2_t;\
typedef ret##3 result_t;\
return content; \
} \
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef argType2##2 arg2_t;\
typedef ret##2 result_t;\
return content; \
} \
INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret result_t;\
return content; \
}
#endif
#ifndef SIMPLE_3_SCALAR
#define SIMPLE_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##16 result_t;\
return content; \
} \
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##8 result_t;\
return content; \
} \
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##4 result_t;\
return content; \
} \
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##3 result_t;\
return content; \
} \
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##2 result_t;\
return content; \
} \
//the scalar version is skipped, since it is already defined by the vector-vector-vector variant with a "vector" of 1 element
#endif
#ifndef SIMPLE_3_TWO_SCALAR
#define SIMPLE_3_TWO_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##16 result_t;\
return content; \
} \
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##8 result_t;\
return content; \
} \
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##4 result_t;\
return content; \
} \
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##3 result_t;\
return content; \
} \
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##2 result_t;\
return content; \
} \
//the scalar version is skipped, since it is already defined by the vector-vector-vector variant with a "vector" of 1 element
#endif
#ifndef COMPLEX_1
#define COMPLEX_1(ret, func, argType, argName, content) \
INLINE FUNC_1(ret##16, func, argType##16, argName) \
{ \
typedef argType##16 arg_t;\
typedef ret##16 result_t;\
typedef int##16 int_t; \
typedef float##16 float_t; \
content \
} \
INLINE FUNC_1(ret##8, func, argType##8, argName) \
{ \
typedef argType##8 arg_t;\
typedef ret##8 result_t;\
typedef int##8 int_t; \
typedef float##8 float_t; \
content \
} \
INLINE FUNC_1(ret##4, func, argType##4, argName) \
{ \
typedef argType##4 arg_t;\
typedef ret##4 result_t;\
typedef int##4 int_t; \
typedef float##4 float_t; \
content \
} \
INLINE FUNC_1(ret##3, func, argType##3, argName) \
{ \
typedef argType##3 arg_t;\
typedef ret##3 result_t;\
typedef int##3 int_t; \
typedef float##3 float_t; \
content \
} \
INLINE FUNC_1(ret##2, func, argType##2, argName) \
{ \
typedef argType##2 arg_t;\
typedef ret##2 result_t;\
typedef int##2 int_t; \
typedef float##2 float_t; \
content \
} \
INLINE FUNC_1(ret, func, argType, argName) \
{ \
typedef argType arg_t;\
typedef ret result_t;\
typedef int int_t; \
typedef float float_t; \
content \
}
#endif
#ifndef COMPLEX_1_RETURN_SCALAR
#define COMPLEX_1_RETURN_SCALAR(ret, func, argType, argName, content) \
INLINE FUNC_1(ret, func, argType##16, argName) \
{ \
typedef argType##16 arg_t;\
typedef int##16 int_t; \
content \
} \
INLINE FUNC_1(ret, func, argType##8, argName) \
{ \
typedef argType##8 arg_t;\
typedef int##8 int_t; \
content \
} \
INLINE FUNC_1(ret, func, argType##4, argName) \
{ \
typedef argType##4 arg_t;\
typedef int##4 int_t; \
content \
} \
INLINE FUNC_1(ret, func, argType##3, argName) \
{ \
typedef argType##3 arg_t;\
typedef int##3 int_t; \
content \
} \
INLINE FUNC_1(ret, func, argType##2, argName) \
{ \
typedef argType##2 arg_t;\
typedef int##2 int_t; \
content \
} \
INLINE FUNC_1(ret, func, argType, argName) \
{ \
typedef argType arg_t;\
typedef int int_t; \
content \
}
#endif
#ifndef COMPLEX_2
#define COMPLEX_2(ret, func, argType0, argName0, argType1, argName1, content) \
INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef ret##16 result_t;\
typedef int##16 int_t; \
typedef uint##16 uint_t; \
typedef float##16 float_t; \
content \
} \
INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef ret##8 result_t;\
typedef int##8 int_t; \
typedef uint##8 uint_t; \
typedef float##8 float_t; \
content \
} \
INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef ret##4 result_t;\
typedef int##4 int_t; \
typedef uint##4 uint_t; \
typedef float##4 float_t; \
content \
} \
INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef ret##3 result_t;\
typedef int##3 int_t; \
typedef uint##3 uint_t; \
typedef float##3 float_t; \
content \
} \
INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef ret##2 result_t;\
typedef int##2 int_t; \
typedef uint##2 uint_t; \
typedef float##2 float_t; \
content \
} \
INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
{ \
typedef argType0 arg0_t;\
typedef argType1 arg1_t;\
typedef ret result_t;\
typedef int int_t; \
typedef uint uint_t; \
typedef float float_t; \
content \
}
#endif
#ifndef COMPLEX_3
#define COMPLEX_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef argType2##16 arg2_t;\
typedef ret##16 result_t;\
typedef int##16 int_t; \
content \
} \
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef argType2##8 arg2_t;\
typedef ret##8 result_t;\
typedef int##8 int_t; \
content \
} \
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef argType2##4 arg2_t;\
typedef ret##4 result_t;\
typedef int##4 int_t; \
content \
} \
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef argType2##3 arg2_t;\
typedef ret##3 result_t;\
typedef int##3 int_t; \
content \
} \
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef argType2##2 arg2_t;\
typedef ret##2 result_t;\
typedef int##2 int_t; \
content \
} \
INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret result_t;\
typedef int int_t; \
content \
}
#endif
#ifndef COMPLEX_3_SCALAR
#define COMPLEX_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
{ \
typedef argType0##16 arg0_t;\
typedef argType1##16 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##16 result_t;\
typedef int##16 int_t; \
content \
} \
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
{ \
typedef argType0##8 arg0_t;\
typedef argType1##8 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##8 result_t;\
typedef int##8 int_t; \
content \
} \
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
{ \
typedef argType0##4 arg0_t;\
typedef argType1##4 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##4 result_t;\
typedef int##4 int_t; \
content \
} \
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
{ \
typedef argType0##3 arg0_t;\
typedef argType1##3 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##3 result_t;\
typedef int##3 int_t; \
content \
} \
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
{ \
typedef argType0##2 arg0_t;\
typedef argType1##2 arg1_t;\
typedef argType2 arg2_t;\
typedef ret##2 result_t;\
typedef int##2 int_t; \
content \
} \
INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
{ \
typedef argType0 arg0_t;\
typedef argType1 arg1_t;\
typedef argType2 arg2_t;\
typedef ret result_t;\
typedef int int_t; \
content \
}
#endif
#define OVERLOAD_ALL_IMAGE_TYPES(ret, func) \
CONST FUNC_1(ret, func, read_only image1d_t, image); \
CONST FUNC_1(ret, func, write_only image1d_t, image); \
CONST FUNC_1(ret, func, read_only image2d_t, image); \
CONST FUNC_1(ret, func, write_only image2d_t, image); \
CONST FUNC_1(ret, func, read_only image3d_t, image); \
/* XXX CONST FUNC_1(ret, func, write_only image3d_t, image); */ \
CONST FUNC_1(ret, func, read_only image1d_buffer_t, image); \
CONST FUNC_1(ret, func, write_only image1d_buffer_t, image); \
CONST FUNC_1(ret, func, read_only image1d_array_t, image); \
CONST FUNC_1(ret, func, write_only image1d_array_t, image); \
CONST FUNC_1(ret, func, read_only image2d_array_t, image); \
CONST FUNC_1(ret, func, write_only image2d_array_t, image);
#define OVERLOAD_ALL_IMAGE_TYPES_1(ret, func, argType, argName) \
FUNC_2(ret, func, read_only image1d_t, image, argType, argName); \
FUNC_2(ret, func, write_only image1d_t, image, argType, argName); \
FUNC_2(ret, func, read_only image2d_t, image, argType, argName); \
FUNC_2(ret, func, write_only image2d_t, image, argType, argName); \
FUNC_2(ret, func, read_only image3d_t, image, argType, argName); \
/* XXX FUNC_2(ret, func, write_only image3d_t, image, argType, argName); */ \
FUNC_2(ret, func, read_only image1d_buffer_t, image, argType, argName); \
FUNC_2(ret, func, write_only image1d_buffer_t, image, argType, argName); \
FUNC_2(ret, func, read_only image1d_array_t, image, argType, argName); \
FUNC_2(ret, func, write_only image1d_array_t, image, argType, argName); \
FUNC_2(ret, func, read_only image2d_array_t, image, argType, argName); \
FUNC_2(ret, func, write_only image2d_array_t, image, argType, argName);
#define OVERLOAD_ALL_IMAGE_TYPES_2(ret, func, arg0Type, arg0Name, arg1Type, arg1Name) \
FUNC_3(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
/* XXX FUNC_3(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); */ \
FUNC_3(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
FUNC_3(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name);
#define OVERLOAD_ALL_IMAGE_TYPES_3(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name) \
FUNC_4(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
/* XXX FUNC_4(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); */ \
FUNC_4(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
FUNC_4(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name);
#define OVERLOAD_ALL_IMAGE_TYPES_4(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name) \
FUNC_5(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
/* XXX FUNC_5(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); */ \
FUNC_5(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
FUNC_5(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name);
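/*
 * Illustrative use (hypothetical call, mirroring how the image query functions
 * are presumably declared): a single line such as
 *   OVERLOAD_ALL_IMAGE_TYPES(int, get_image_width)
 * expands to one CONST FUNC_1 prototype per supported image type, so each
 * query needs to be written only once instead of once per image type.
 */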
#endif /* VC4CL_OVERLOADS_H */

43
drivers/videocore4_stdlib/include/_printf.h

@ -0,0 +1,43 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_PRINTF
#define VC4CL_PRINTF
#include "_config.h"
//void* vc4cl_get_param(uint);
//void vc4cl_print_char(char);
//
//INLINE int printf(__constant const char * restrict format, ...)
//{
//	__constant const char* formatPtr = format;
//	uint paramIndex = 1;
//	while(*formatPtr != '\0')
//	{
//		if(*formatPtr == '%')
//		{
//			++formatPtr;
//			switch(*formatPtr)
//			{
//			case '%':
//				vc4cl_print_char('%');
//				break;
//			case 'c':
//				vc4cl_print_char(*(const char *) vc4cl_get_param(paramIndex));
//				++paramIndex;
//				break;
//			case 's':
//				//TODO print the string parameter character by character
//				break;
//			}
//		}
//		else
//			vc4cl_print_char(*formatPtr);
//		++formatPtr;
//	}
//	//TODO handle the remaining conversion specifiers
//	return -1;
//}
#endif /* VC4CL_PRINTF */

341
drivers/videocore4_stdlib/include/_relational.h

@ -0,0 +1,341 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_RELATIONAL_H
#define VC4CL_RELATIONAL_H
#include "_config.h"
#include "_overloads.h"
#include "_intrinsics.h"
#ifndef COMPARISON_1
#define COMPARISON_1(func, content) \
INLINE FUNC_1(int##16, func, float##16, val) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_1(int##8, func, float##8, val) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_1(int##4, func, float##4, val) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_1(int##3, func, float##3, val) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_1(int##2, func, float##2, val) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_1(int, func, float, val) CONST \
{ /* 1 instead of -1 here on purpose! */ \
return (content) ? 1 : 0; \
}
#endif
#ifndef COMPARISON_2
#define COMPARISON_2(func, content) \
INLINE FUNC_2(int##16, func, float##16, x, float##16, y) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_2(int##8, func, float##8, x, float##8, y) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_2(int##4, func, float##4, x, float##4, y) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_2(int##3, func, float##3, x, float##3, y) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_2(int##2, func, float##2, x, float##2, y) CONST \
{ \
return (content) ? -1 : 0; \
} \
INLINE FUNC_2(int, func, float, x, float, y) CONST \
{ /* 1 instead of -1 here on purpose! */ \
return (content) ? 1 : 0; \
}
#endif
#ifndef FOR_ALL_ELEMENTS
#define FOR_ALL_ELEMENTS(func, type, op, conv) \
INLINE FUNC_1(int, func, type##16, x) CONST \
{ \
/* (s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf) */ \
type##16 val0 = conv(x); \
/* (s0 op s1, s1 op s2, s2 op s3, s3 op s4, s4 op s5, s5 op s6, s6 op s7, s7 op s8, s8 op s9, s9 op sa, sa op sb, sb op sc, sc op sd, sd op se, se op sf, sf op s0) */ \
val0 = val0 op vc4cl_vector_rotate(val0, -1); \
/* (s0 op s1 op s2 op s3, s1 op s2 op s3 op s4, s2 op s3 op s4 op s5, s3 op s4 op s5 op s6, s4 op s5 op s6 op s7, s5 op s6 op s7 op s8, s6 op s7 op s8 op s9, s7 op s8 op s9 op sa, s8 op s9 op sa op sb, s9 op sa op sb op sc, sa op sb op sc op sd, sb op sc op sd op se, sc op sd op se op sf, ...) */ \
const type##16 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
/* (s0 op s1 op s2 op s3 op s4 op s5 op s6 op s7, ..., s8 op s9 op sa op sb op sc op sd op se op sf, ...) */ \
const type##16 val2 = val1 op vc4cl_vector_rotate(val1, -4); \
/* s0 op s1 op s2 op s3 op s4 op s5 op s6 op s7 op s8 op s9 op sa op sb op sc op sd op se op sf */ \
return (val2 op val1 op vc4cl_vector_rotate(val2, -8)).x != 0; \
} \
INLINE FUNC_1(int, func, type##8, x) CONST \
{ \
/* (s0, s1, s2, s3, s4, s5, s6, s7) */ \
type##8 val0 = conv(x); \
/* (s0 op s1, s1 op s2, s2 op s3, s3 op s4, s4 op s5, s5 op s6, s6 op s7, s7 op ?) */ \
val0 = val0 op vc4cl_vector_rotate(val0, -1); \
/* (s0 op s1 op s2 op s3, s1 op s2 op s3 op s4, s2 op s3 op s4 op s5, s3 op s4 op s5 op s6, s4 op s5 op s6 op s7, s5 op s6 op s7 op ?, s6 op s7 op ? op ?, s7 op ? op ? op ?) */ \
const type##8 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
/* s0 op s1 op s2 op s3 op s4 op s5 op s6 op s7 */ \
return (val1 op vc4cl_vector_rotate(val1, -4)).x != 0; \
} \
INLINE FUNC_1(int, func, type##4, x) CONST \
{ \
/* (x, y, z, w) */ \
type##4 val0 = conv(x); \
/* (x op y, y op z, z op w, w op ?) */ \
val0 = val0 op vc4cl_vector_rotate(val0, -1); \
/* (z op w, w op ?, ? op ?, ? op ?) */ \
const type##4 val1 = vc4cl_vector_rotate(val0, -2); \
/* (x op y op z op w, ...) */ \
return (val0 op val1).x != 0; \
} \
INLINE FUNC_1(int, func, type##3, x) CONST \
{ \
type##3 val = conv(x); \
return (val.x op val.y op val.z) != 0; \
} \
INLINE FUNC_1(int, func, type##2, x) CONST \
{ \
type##2 val = conv(x); \
return (val.x op val.y) != 0; \
} \
INLINE FUNC_1(int, func, type, x) CONST \
{ \
type val = conv(x); \
return val != 0; \
}
#endif
#ifndef SELECT_SCALAR
#define SELECT_SCALAR(type, maskType, content) \
INLINE FUNC_3(type, select, type, a, type, b, maskType, c) CONST \
{ \
return content; \
}
#endif
#ifndef SELECT_VECTOR
#define SELECT_VECTOR(type, maskType, content) \
INLINE FUNC_3(type##2, select, type##2, a, type##2, b, maskType##2, c) CONST \
{ \
typedef int##2 int_t; \
content \
} \
INLINE FUNC_3(type##3, select, type##3, a, type##3, b, maskType##3, c) CONST \
{ \
typedef int##3 int_t; \
content \
} \
INLINE FUNC_3(type##4, select, type##4, a, type##4, b, maskType##4, c) CONST \
{ \
typedef int##4 int_t; \
content \
} \
INLINE FUNC_3(type##8, select, type##8, a, type##8, b, maskType##8, c) CONST \
{ \
typedef int##8 int_t; \
content \
} \
INLINE FUNC_3(type##16, select, type##16, a, type##16, b, maskType##16, c) CONST \
{ \
typedef int##16 int_t; \
content \
}
#endif
/*
 * The NaN checks required by the specification are performed in the intrinsics of the comparison operators:
*
* "The relational functions isequal, isgreater, isgreaterequal, isless, islessequal, and islessgreater
* always return 0 if either argument is not a number (NaN). isnotequal returns 1 if one or both
* arguments are not a number (NaN) and the argument type is a scalar [...]"
* - OpenCL 1.2, section 6.12.6 Relational Functions
*/
COMPARISON_2(isequal, x == y)
COMPARISON_2(isnotequal, x != y)
COMPARISON_2(isgreater, x > y)
COMPARISON_2(isgreaterequal, x >= y)
COMPARISON_2(isless, x < y)
COMPARISON_2(islessequal, x <= y)
COMPARISON_2(islessgreater, (x < y) || (x > y))
// From <cmath>: "A finite value is any floating-point value that is neither infinite nor NaN (Not-A-Number)."
COMPARISON_1(isfinite, !vc4cl_is_inf_nan(val))
COMPARISON_1(isinf, (vc4cl_bitcast_uint(val) & NAN) == INF)
COMPARISON_1(isnan, vc4cl_is_nan(val))
// From <cmath>: "Returns whether x is a normal value: i.e., whether it is neither infinity, NaN, zero or subnormal."
COMPARISON_1(isnormal, !isinf(val) && !isnan(val) && ((vc4cl_bitcast_uint(val) & 0x7F800000) != 0) /* neither zero nor denormal */)
COMPARISON_2(isordered, isequal(x, x) && isequal(y, y))
COMPARISON_2(isunordered, isnan(x) || isnan(y))
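/*
 * Illustrative values, following the OpenCL 1.2 return conventions implemented
 * above (scalar versions return 1, vector versions return -1 per element):
 *   isless(1.0f, 2.0f)                                  -> 1
 *   isless((float2)(1.0f, 3.0f), (float2)(2.0f, 2.0f))  -> (-1, 0)
 */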
// For vectors, directly use asr; for scalars, shr. This is far more efficient than any alternative (a single instruction).
INLINE FUNC_1(int16, signbit, float16, val) CONST
{
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int8, signbit, float8, val) CONST
{
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int4, signbit, float4, val) CONST
{
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int3, signbit, float3, val) CONST
{
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int2, signbit, float2, val) CONST
{
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int, signbit, float, val) CONST
{
return vc4cl_bitcast_uint(val) >> 31;
}
FOR_ALL_ELEMENTS(any, char, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, short, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, int, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, long, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, char, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, short, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, int, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, long, &, vc4cl_msb_set)
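/*
 * Illustrative semantics of the reductions instantiated above: any() tests
 * whether the MSB (sign bit) is set in any element, all() in all elements:
 *   any((int4)(0, -1, 0, 0))     -> 1
 *   all((int4)(0, -1, 0, 0))     -> 0
 *   all((int4)(-1, -1, -1, -1))  -> 1
 */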
//"Each bit of the result is the corresponding bit of a if the corresponding bit of c is 0.
// Otherwise it is the corresponding bit of b."
// Based on pocl (pocl/lib/kernel/bitselect.cl)
SIMPLE_3(uchar, bitselect, uchar, a, uchar, b, uchar, c, (~c & a) | (c & b))
SIMPLE_3(char, bitselect, char, a, char, b, char, c, (~c & a) | (c & b))
SIMPLE_3(ushort, bitselect, ushort, a, ushort, b, ushort, c, (~c & a) | (c & b))
SIMPLE_3(short, bitselect, short, a, short, b, short, c, (~c & a) | (c & b))
SIMPLE_3(uint, bitselect, uint, a, uint, b, uint, c, (~c & a) | (c & b))
SIMPLE_3(int, bitselect, int, a, int, b, int, c, (~c & a) | (c & b))
SIMPLE_3(ulong, bitselect, ulong, a, ulong, b, ulong, c, (~c & a) | (c & b))
SIMPLE_3(long, bitselect, long, a, long, b, long, c, (~c & a) | (c & b))
SIMPLE_3(float, bitselect, float, a, float, b, float, c, vc4cl_bitcast_float((~vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(a)) | (vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(b))))
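/*
 * Worked example of the bit-wise selection above:
 *   bitselect(0x12, 0x34, 0x0F) = (~0x0F & 0x12) | (0x0F & 0x34)
 *                               = 0x10 | 0x04
 *                               = 0x14 (low nibble from b, high nibble from a)
 */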
//"For a scalar type, result = c ? b : a."
SELECT_SCALAR(uchar, uchar, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uchar, char, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(char, uchar, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(char, char, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ushort, ushort, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ushort, short, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(short, ushort, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(short, short, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uint, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uint, int, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(int, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(int, int, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ulong, ulong, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ulong, long, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(long, ulong, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(long, long, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(float, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(float, int, vc4cl_extend(c) ? b : a)
//"For each component of a vector type, result[i] = if MSB of c[i] is set ? b[i] : a[i]"
SELECT_VECTOR(uchar, uchar,
{
int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
return vc4cl_bitcast_uchar((mask & vc4cl_bitcast_int(vc4cl_extend(b))) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(uchar, char,
{
int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
return vc4cl_bitcast_uchar((mask & vc4cl_bitcast_int(vc4cl_extend(b))) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(char, char,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(char, uchar,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ushort, ushort,
{
int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
return vc4cl_bitcast_ushort((mask & vc4cl_bitcast_int(vc4cl_extend(b))) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(ushort, short,
{
int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
return vc4cl_bitcast_ushort((mask & vc4cl_bitcast_int(vc4cl_extend(b))) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(short, short,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(short, ushort,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(uint, uint,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(uint, int,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(int, int,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(int, uint,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ulong, ulong,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ulong, long,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(long, long,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(long, ulong,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(float, uint,
{
return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(float, int,
{
return vc4cl_msb_set(c) ? b : a;
})
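/*
 * Illustrative values for the vector MSB rule implemented above (per the
 * OpenCL 1.2 definition, not a measured result):
 *   select((int2)(10, 20), (int2)(30, 40), (int2)(0, -1)) -> (10, 40)
 * since only the second mask element has its most significant bit set.
 */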
#undef COMPARISON_1
#undef COMPARISON_2
#undef FOR_ALL_ELEMENTS
#undef SELECT_SCALAR
#undef SELECT_VECTOR
#endif /* VC4CL_RELATIONAL_H */

1716
drivers/videocore4_stdlib/include/_spir_mangling.h

File diff suppressed because it is too large

24
drivers/videocore4_stdlib/include/_synchronization.h

@ -0,0 +1,24 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_SYNCHRONIZATION_H
#define VC4CL_SYNCHRONIZATION_H
#include "_config.h"
#include "_work_items.h"
INLINE void barrier(cl_mem_fence_flags flags) OVERLOADABLE
{
vc4cl_barrier(flags);
}
/*
 * We do not declare read_mem_fence() and write_mem_fence(), since:
 * - the SPIRV-LLVM-Translator (in older versions, e.g. 7.0) cannot handle them, passing a non-constant flags argument on to the mem_fence() function
 * - we handle mem_fence(), read_mem_fence() and write_mem_fence() the exact same way in both front-ends anyway
*/
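/*
 * Minimal usage sketch (hypothetical kernel, not part of this header),
 * assuming all work-items of the work-group execute the barrier:
 *
 * __kernel void copy_via_local(__global const float *in, __global float *out, __local float *tmp)
 * {
 *     size_t lid = get_local_id(0);
 *     tmp[lid] = in[get_global_id(0)];
 *     // make all writes to tmp visible to the whole work-group
 *     barrier(CLK_LOCAL_MEM_FENCE);
 *     out[get_global_id(0)] = tmp[get_local_size(0) - 1 - lid];
 * }
 */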
#endif /* VC4CL_SYNCHRONIZATION_H */

265
drivers/videocore4_stdlib/include/_vector.h

@ -0,0 +1,265 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_VECTOR_H
#define VC4CL_VECTOR_H
#include "_config.h"
#include "_overloads.h"
#ifndef VECTOR_LOAD
#define VECTOR_LOAD(type) \
INLINE type##2 vload2(size_t offset, const __global type * ptr) OVERLOADABLE \
{ \
return *((const __global type##2 *)(ptr + offset * 2)); \
} \
INLINE type##3 vload3(size_t offset, const __global type * ptr) OVERLOADABLE \
{ \
return vc4cl_vload3(ptr + offset * 3); \
} \
INLINE type##4 vload4(size_t offset, const __global type * ptr) OVERLOADABLE \
{ \
return *((const __global type##4 *)(ptr + offset * 4)); \
} \
INLINE type##8 vload8(size_t offset, const __global type * ptr) OVERLOADABLE \
{ \
return *((const __global type##8 *)(ptr + offset * 8)); \
} \
INLINE type##16 vload16(size_t offset, const __global type * ptr) OVERLOADABLE \
{ \
return *((const __global type##16 *)(ptr + offset * 16)); \
} \
INLINE type##2 vload2(size_t offset, const __local type * ptr) OVERLOADABLE \
{ \
return *((const __local type##2 *)(ptr + offset * 2)); \
} \
INLINE type##3 vload3(size_t offset, const __local type * ptr) OVERLOADABLE \
{ \
return vc4cl_vload3(ptr + offset * 3); \
} \
INLINE type##4 vload4(size_t offset, const __local type * ptr) OVERLOADABLE \
{ \
return *((const __local type##4 *)(ptr + offset * 4)); \
} \
INLINE type##8 vload8(size_t offset, const __local type * ptr) OVERLOADABLE \
{ \
return *((const __local type##8 *)(ptr + offset * 8)); \
} \
INLINE type##16 vload16(size_t offset, const __local type * ptr) OVERLOADABLE \
{ \
return *((const __local type##16 *)(ptr + offset * 16)); \
} \
INLINE type##2 vload2(size_t offset, const __constant type * ptr) OVERLOADABLE \
{ \
return *((const __constant type##2 *)(ptr + offset * 2)); \
} \
INLINE type##3 vload3(size_t offset, const __constant type * ptr) OVERLOADABLE \
{ \
return vc4cl_vload3(ptr + offset * 3); \
} \
INLINE type##4 vload4(size_t offset, const __constant type * ptr) OVERLOADABLE \
{ \
return *((const __constant type##4 *)(ptr + offset * 4)); \
} \
INLINE type##8 vload8(size_t offset, const __constant type * ptr) OVERLOADABLE \
{ \
return *((const __constant type##8 *)(ptr + offset * 8)); \
} \
INLINE type##16 vload16(size_t offset, const __constant type * ptr) OVERLOADABLE \
{ \
return *((const __constant type##16 *)(ptr + offset * 16)); \
} \
INLINE type##2 vload2(size_t offset, const __private type * ptr) OVERLOADABLE \
{ \
return *((const __private type##2 *)(ptr + offset * 2)); \
} \
INLINE type##3 vload3(size_t offset, const __private type * ptr) OVERLOADABLE \
{ \
return vc4cl_vload3(ptr + offset * 3); \
} \
INLINE type##4 vload4(size_t offset, const __private type * ptr) OVERLOADABLE \
{ \
return *((const __private type##4 *)(ptr + offset * 4)); \
} \
INLINE type##8 vload8(size_t offset, const __private type * ptr) OVERLOADABLE \
{ \
return *((const __private type##8 *)(ptr + offset * 8)); \
} \
INLINE type##16 vload16(size_t offset, const __private type * ptr) OVERLOADABLE \
{ \
return *((const __private type##16 *)(ptr + offset * 16)); \
}
#endif
#ifndef VECTOR_STORE
#define VECTOR_STORE(type) \
INLINE void vstore2(type##2 data, size_t offset, __global type * ptr) OVERLOADABLE \
{ \
*((__global type##2 *)(ptr + offset * 2)) = data; \
} \
INLINE void vstore3(type##3 data, size_t offset, __global type * ptr) OVERLOADABLE \
{ \
vc4cl_vstore3(ptr + offset * 3, data); \
} \
INLINE void vstore4(type##4 data, size_t offset, __global type * ptr) OVERLOADABLE \
{ \
*((__global type##4 *)(ptr + offset * 4)) = data; \
} \
INLINE void vstore8(type##8 data, size_t offset, __global type * ptr) OVERLOADABLE \
{ \
*((__global type##8 *)(ptr + offset * 8)) = data; \
} \
INLINE void vstore16(type##16 data, size_t offset, __global type * ptr) OVERLOADABLE \
{ \
*((__global type##16 *)(ptr + offset * 16)) = data; \
} \
INLINE void vstore2(type##2 data, size_t offset, __local type * ptr) OVERLOADABLE \
{ \
*((__local type##2 *)(ptr + offset * 2)) = data; \
} \
INLINE void vstore3(type##3 data, size_t offset, __local type * ptr) OVERLOADABLE \
{ \
vc4cl_vstore3(ptr + offset * 3, data); \
} \
INLINE void vstore4(type##4 data, size_t offset, __local type * ptr) OVERLOADABLE \
{ \
*((__local type##4 *)(ptr + offset * 4)) = data; \
} \
INLINE void vstore8(type##8 data, size_t offset, __local type * ptr) OVERLOADABLE \
{ \
*((__local type##8 *)(ptr + offset * 8)) = data; \
} \
INLINE void vstore16(type##16 data, size_t offset, __local type * ptr) OVERLOADABLE \
{ \
*((__local type##16 *)(ptr + offset * 16)) = data; \
} \
INLINE void vstore2(type##2 data, size_t offset, __private type * ptr) OVERLOADABLE \
{ \
*((__private type##2 *)(ptr + offset * 2)) = data; \
} \
INLINE void vstore3(type##3 data, size_t offset, __private type * ptr) OVERLOADABLE \
{ \
vc4cl_vstore3(ptr + offset * 3, data); \
} \
INLINE void vstore4(type##4 data, size_t offset, __private type * ptr) OVERLOADABLE \
{ \
*((__private type##4 *)(ptr + offset * 4)) = data; \
} \
INLINE void vstore8(type##8 data, size_t offset, __private type * ptr) OVERLOADABLE \
{ \
*((__private type##8 *)(ptr + offset * 8)) = data; \
} \
INLINE void vstore16(type##16 data, size_t offset, __private type * ptr) OVERLOADABLE \
{ \
*((__private type##16 *)(ptr + offset * 16)) = data; \
}
#endif
#ifndef VECTOR_SHUFFLE_2
#define VECTOR_SHUFFLE_2_INTERNAL(type, maskType, num) \
INLINE type##2 shuffle2(type##num x, type##num y, maskType##2 mask) OVERLOADABLE \
{ \
return __builtin_shufflevector(x, y, mask.x, mask.y); \
} \
INLINE type##4 shuffle2(type##num x, type##num y, maskType##4 mask) OVERLOADABLE \
{ \
return __builtin_shufflevector(x, y, mask.x, mask.y, mask.z, mask.w); \
} \
INLINE type##8 shuffle2(type##num x, type##num y, maskType##8 mask) OVERLOADABLE \
{ \
return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7); \
} \
INLINE type##16 shuffle2(type##num x, type##num y, maskType##16 mask) OVERLOADABLE \
{ \
return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7, mask.s8, mask.s9, mask.sa, mask.sb, mask.sc, mask.sd, mask.se, mask.sf); \
}
#define VECTOR_SHUFFLE_2(type, maskType) \
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 2) \
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 4) \
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 8) \
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 16)
#endif
#ifndef VECTOR_SHUFFLE
#define VECTOR_SHUFFLE_INTERNAL(type, maskType, num) \
INLINE type##2 shuffle(type##num val, maskType##2 mask) OVERLOADABLE \
{ \
return shuffle2(val, val, mask); \
} \
INLINE type##4 shuffle(type##num val, maskType##4 mask) OVERLOADABLE \
{ \
return shuffle2(val, val, mask); \
} \
INLINE type##8 shuffle(type##num val, maskType##8 mask) OVERLOADABLE \
{ \
return shuffle2(val, val, mask); \
} \
INLINE type##16 shuffle(type##num val, maskType##16 mask) OVERLOADABLE \
{ \
return shuffle2(val, val, mask); \
}
#define VECTOR_SHUFFLE(type, maskType) \
VECTOR_SHUFFLE_INTERNAL(type, maskType, 2) \
VECTOR_SHUFFLE_INTERNAL(type, maskType, 4) \
VECTOR_SHUFFLE_INTERNAL(type, maskType, 8) \
VECTOR_SHUFFLE_INTERNAL(type, maskType, 16)
#endif
VECTOR_LOAD(uchar)
VECTOR_LOAD(char)
VECTOR_LOAD(ushort)
VECTOR_LOAD(short)
VECTOR_LOAD(uint)
VECTOR_LOAD(int)
VECTOR_LOAD(float)
VECTOR_LOAD(ulong)
VECTOR_LOAD(long)
VECTOR_STORE(uchar)
VECTOR_STORE(char)
VECTOR_STORE(ushort)
VECTOR_STORE(short)
VECTOR_STORE(uint)
VECTOR_STORE(int)
VECTOR_STORE(float)
VECTOR_STORE(ulong)
VECTOR_STORE(long)
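/*
 * Minimal usage sketch (hypothetical kernel, not part of this header): the
 * vloadN/vstoreN overloads above address memory in units of N elements,
 * i.e. they access ptr + offset * N.
 *
 * __kernel void scale4(__global float *data, float factor)
 * {
 *     size_t gid = get_global_id(0);
 *     float4 v = vload4(gid, data);   // reads data[gid * 4] ... data[gid * 4 + 3]
 *     vstore4(v * factor, gid, data); // writes the same four elements back
 * }
 */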
//TODO vload(a)_half, vload(a)_halfn (+rounding) (load half and return converted to float, possible with unpack-modes)
//TODO vstore(a)_half, vstore(a)_halfn (+rounding) (store float as half in memory, possible with pack modes)
/*
* TODO shuffle2, but LLVM fails, since the indices for the __builtin intrinsic need to be constant integers!
VECTOR_SHUFFLE_2(uchar, uchar)
VECTOR_SHUFFLE_2(char, uchar)
VECTOR_SHUFFLE_2(ushort, ushort)
VECTOR_SHUFFLE_2(short, ushort)
VECTOR_SHUFFLE_2(uint, uint)
VECTOR_SHUFFLE_2(int, uint)
VECTOR_SHUFFLE_2(float, uint)
VECTOR_SHUFFLE(uchar, uchar)
VECTOR_SHUFFLE(char, uchar)
VECTOR_SHUFFLE(ushort, ushort)
VECTOR_SHUFFLE(short, ushort)
VECTOR_SHUFFLE(uint, uint)
VECTOR_SHUFFLE(int, uint)
VECTOR_SHUFFLE(float, uint)
*/
//shuffle/shuffle2 are handled via intrinsifying the OpenCL function
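/*
 * Illustrative shuffle semantics (per the OpenCL 1.2 definition, handled by
 * the compiler as noted above):
 *   shuffle((uint4)(10, 20, 30, 40), (uint2)(3, 1)) -> (uint2)(40, 20)
 * shuffle2() indexes into the concatenation of both input vectors instead.
 */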
#undef VECTOR_LOAD
#undef VECTOR_STORE
#undef VECTOR_SHUFFLE_2_INTERNAL
#undef VECTOR_SHUFFLE_2
#undef VECTOR_SHUFFLE_INTERNAL
#undef VECTOR_SHUFFLE
#endif /* VC4CL_VECTOR_H */

70
drivers/videocore4_stdlib/include/_work_items.h

@ -0,0 +1,70 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_WORK_ITEMS_H
#define VC4CL_WORK_ITEMS_H
#include "_intrinsics.h"
#include "_overloads.h"
INLINE uint get_work_dim(void) OVERLOADABLE CONST
{
return vc4cl_work_dimensions();
}
INLINE size_t get_global_size(uint dim) OVERLOADABLE CONST
{
return vc4cl_global_size(dim);
}
INLINE size_t get_global_id(uint dim) OVERLOADABLE CONST
{
return vc4cl_global_id(dim);
}
INLINE size_t get_local_size(uint dim) OVERLOADABLE CONST
{
return vc4cl_local_size(dim);
}
INLINE size_t get_enqueued_local_size(uint dimindx) OVERLOADABLE CONST
{
// "Returns the same value as that returned by get_local_size(dimindx) if the kernel is executed with a uniform
// work-group size."
return vc4cl_local_size(dimindx);
}
INLINE size_t get_local_id(uint dim) OVERLOADABLE CONST
{
return vc4cl_local_id(dim);
}
INLINE size_t get_num_groups(uint dim) OVERLOADABLE CONST
{
return vc4cl_num_groups(dim);
}
INLINE size_t get_group_id(uint dim) OVERLOADABLE CONST
{
return vc4cl_group_id(dim);
}
INLINE size_t get_global_offset(uint dim) OVERLOADABLE CONST
{
return vc4cl_global_offset(dim);
}
INLINE size_t get_global_linear_id() OVERLOADABLE CONST
{
return vc4cl_global_linear_id();
}
INLINE size_t get_local_linear_id() OVERLOADABLE CONST
{
return vc4cl_local_linear_id();
}
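/*
 * For reference, the specification formula these linear IDs are expected to
 * match for a 3-dimensional kernel:
 *   get_global_linear_id() ==
 *       (get_global_id(2) - get_global_offset(2)) * get_global_size(1) * get_global_size(0)
 *     + (get_global_id(1) - get_global_offset(1)) * get_global_size(0)
 *     + (get_global_id(0) - get_global_offset(0))
 */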
#endif /* VC4CL_WORK_ITEMS_H */

105
drivers/videocore4_stdlib/include/defines.h

@ -0,0 +1,105 @@
/*
* Author: doe300
*
* See the file "LICENSE" for the full license governing this code.
*/
#ifndef VC4CL_DEFINES_H
#define VC4CL_DEFINES_H
#ifndef CL_VERSION_1_0
#define CL_VERSION_1_0 100
#endif
#ifndef CL_VERSION_1_1
#define CL_VERSION_1_1 110
#endif
#ifndef CL_VERSION_1_2
#define CL_VERSION_1_2 120
#endif
#ifndef CL_VERSION_2_0
#define CL_VERSION_2_0 200
#endif
#ifndef CL_VERSION_2_1
#define CL_VERSION_2_1 210
#endif
#ifndef CL_VERSION_2_2
#define CL_VERSION_2_2 220
#endif
#undef __OPENCL_VERSION__
#define __OPENCL_VERSION__ CL_VERSION_1_2
#undef __OPENCL_C_VERSION__
#define __OPENCL_C_VERSION__ CL_VERSION_1_2
#ifndef __ENDIAN_LITTLE__
#define __ENDIAN_LITTLE__ 1
#endif
#ifndef __EMBEDDED_PROFILE__
#define __EMBEDDED_PROFILE__ 1
#endif
//#ifndef __IMAGE_SUPPORT__
//#define __IMAGE_SUPPORT__ 1
//#endif
#undef __IMAGE_SUPPORT__
#ifndef cl_khr_global_int32_base_atomics
#define cl_khr_global_int32_base_atomics
#endif
#ifndef cl_khr_local_int32_base_atomics
#define cl_khr_local_int32_base_atomics
#endif
#ifndef cl_khr_global_int32_extended_atomics
#define cl_khr_global_int32_extended_atomics
#endif
#ifndef cl_khr_local_int32_extended_atomics
#define cl_khr_local_int32_extended_atomics
#endif
#ifndef cl_khr_byte_addressable_store
#define cl_khr_byte_addressable_store
#endif
#ifndef cl_khr_initialize_memory
#define cl_khr_initialize_memory
#endif
#ifdef __IMAGE_SUPPORT__
#ifndef cl_khr_3d_image_writes
#define cl_khr_3d_image_writes
#endif
#ifndef cl_intel_packed_yuv
#define cl_intel_packed_yuv
#endif
#else
#undef cl_khr_3d_image_writes
#undef cl_intel_packed_yuv
#endif
// additionally supported extensions (the flags need to be set here, since the module is loaded too late)
#define cl_nv_pragma_unroll 1
#define cl_arm_core_id 1
#define cl_ext_atomic_counters_32 1
#define cl_arm_integer_dot_product_int8 1
#define cl_arm_integer_dot_product_accumulate_int8 1
#define cl_arm_integer_dot_product_accumulate_int16 1
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1
// unsupported extensions or optional core features
#undef cl_khr_fp16
#undef cl_khr_fp64
#undef cl_khr_int64_base_atomics
#undef cl_khr_int64_extended_atomics
#undef cl_khr_depth_images
#undef cl_khr_gl_depth_images
#undef cl_khr_gl_msaa_sharing
#undef cl_amd_media_ops
#undef cl_amd_media_ops2
// unsupported host-only extensions (disable for safety)
#undef cl_khr_gl_sharing
#undef cl_khr_gl_event
#undef cl_khr_d3d10_sharing
#undef cl_khr_dx9_media_sharing
#undef cl_khr_d3d11_sharing
#undef cl_khr_image2d_from_buffer
#undef cl_khr_terminate_context
#undef cl_khr_egl_image
#undef cl_khr_egl_event
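/*
 * Minimal usage sketch (hypothetical kernel code, not part of this header):
 * kernel sources can test the macros defined above before relying on a feature.
 *
 * #ifdef cl_arm_core_id
 * // the extension's core-ID query may be used here
 * #endif
 * #if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
 * // OpenCL C 1.2 features may be used unconditionally
 * #endif
 */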
#endif /* VC4CL_DEFINES_H */

16914
drivers/videocore4_stdlib/include/opencl-c.h

File diff suppressed because it is too large