mirror of https://github.com/Qortal/Brooklyn
Raziel K. Crowe
2 years ago
33 changed files with 28681 additions and 0 deletions
@ -0,0 +1,60 @@
|
||||
# Minimum required for the features used; the range form lets newer CMake
# apply its newer (non-deprecated) policy behavior.
cmake_minimum_required(VERSION 3.1...3.27)

####
# General configuration
####
# Option whether to create deb package
option(BUILD_DEB_PACKAGE "Enables creating .deb package" ON)
# Option whether to compile for raspberry-pi (default: ON, for the compatibility)
option(CROSS_COMPILE "Cross compile for Raspbian" ON)
option(BUILD_EXPERIMENTAL "Build experimental test program" OFF)

# BUILD_NUMBER is expected to be injected by CI; fall back to a sentinel.
if(NOT BUILD_NUMBER)
  set(BUILD_NUMBER 9999)
endif()

project(VC4CLStdLib VERSION 0.4.${BUILD_NUMBER})

# Include headers in the project structure.
# NOTE: file(GLOB) does not pick up newly added headers until the next manual
# re-configure; acceptable here since the list only feeds IDE grouping.
file(GLOB HDRS "${PROJECT_SOURCE_DIR}/include/*.h")
add_library(VC4CLStdLib STATIC ${HDRS})
set_target_properties(VC4CLStdLib PROPERTIES LINKER_LANGUAGE C)

##
# Installation targets
##
# Adds the public headers to the target, so they are exported
target_include_directories(VC4CLStdLib PUBLIC
  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:include/vc4cl-stdlib>)
# Creates the install target for the headers
install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/" DESTINATION include/vc4cl-stdlib FILES_MATCHING PATTERN "*.h")
# Adds custom uninstall command.
# NOTE(review): the script path is resolved relative to the build directory at
# run time -- confirm cmake_uninstall.cmake is generated/present there.
add_custom_target(uninstall "${CMAKE_COMMAND}" -P "cmake_uninstall.cmake")

if(BUILD_EXPERIMENTAL)
  add_subdirectory(experimental)
endif()

####
# Building package
####
if(BUILD_DEB_PACKAGE)
  message(STATUS "build deb package...")

  set(CPACK_GENERATOR "DEB")
  set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
  set(CPACK_PACKAGE_NAME "vc4cl-stdlib")
  string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%d")
  set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}-${BUILD_TIMESTAMP}")
  if(CROSS_COMPILE)
    set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "armhf")
  else()
    set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
  endif()
  set(CPACK_PACKAGE_VENDOR "doe300")
  set(CPACK_PACKAGE_CONTACT "[email protected]")
  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL C headers for the VC4CL implementation (raspberrypi only)")
  set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/doe300/VC4CLStdLib")
  # Derive the artifact name from the configured version instead of the
  # previously hard-coded "0.4", so version bumps propagate automatically.
  set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}-Linux")
  include(CPack)
endif()
@ -0,0 +1,21 @@
|
||||
MIT License |
||||
|
||||
Copyright (c) 2022 |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
@ -0,0 +1,4 @@
|
||||
# VC4CLStdLib |
||||
|
||||
Implementation of the OpenCL standard library, which is required to build the [VC4C] compiler.
||||
|
@ -0,0 +1,7 @@
|
||||
find_package(OpenCL REQUIRED)
find_package(Threads REQUIRED)

# Host-side benchmark comparing different kernel implementations for accuracy
# and speed.
add_executable(compare_implementations compare_implementations.cpp)
target_compile_features(compare_implementations PRIVATE cxx_std_14)
# Keep debug symbols with light optimization for profiling the harness itself.
target_compile_options(compare_implementations PRIVATE -g -Og)
# PRIVATE: the dependencies are only needed to build the executable itself
# (keyword added -- the keyword-less signature has legacy semantics).
target_link_libraries(compare_implementations PRIVATE OpenCL::OpenCL Threads::Threads)
@ -0,0 +1,91 @@
|
||||
#define arg_t float16
#define result_t float16
#define int_t int16
#define uint_t uint16

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// Rough n-th root: divides the (biased) IEEE 754 exponent by n and re-inserts
// it, keeping sign and mantissa bits.
// Adapted from: https://web.archive.org/web/20131227144655/http://metamerist.com/cbrt/cbrt.htm
result_t approx_rootn(arg_t x, int_t n)
{
	int_t bits = CAT(as_, int_t)(x);
	int_t scaledExp = (bits - (int_t) (127 << 23)) / n + (int_t) (127 << 23);
	return CAT(as_, result_t)((bits & (int_t) 0x807FFFFF) | (scaledExp));
}

// Rough cube root via integer arithmetic directly on the raw float bits.
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t approx_cbrt(arg_t f)
{
	uint_t bits = CAT(as_, uint_t)(f);
	bits = bits / 3 + 709921077;
	return CAT(as_, result_t)(bits);
}
||||
|
||||
// One Halley-iteration step refining estimate x towards cbrt(base).
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_halley_step(arg_t x, arg_t base)
{
	result_t cube = x * x * x;
	return x * (cube + base + base) / (cube + cube + base);
}

// Cube root: exponent-based first guess plus 4 Halley iterations, with the
// sign re-applied at the end.
result_t cbrt_halley(arg_t val)
{
	arg_t magnitude = fabs(val);
	arg_t estimate = approx_rootn(magnitude, 3);

	result_t refined = estimate;
#pragma loop unroll
	for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
	{
		refined = cbrt_halley_step(refined, val);
	}
	return copysign(refined, val);
}

__kernel void cbrt_halley_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = cbrt_halley(in[idx]);
}
||||
|
||||
// One Newton-iteration step refining estimate x towards cbrt(base).
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_newton_step(arg_t x, arg_t base)
{
	return x - (1.0f / 3.0f) * (x - base / (x * x));
}

// Cube root: integer-bit-trick first guess plus 4 Newton iterations, with the
// sign re-applied at the end.
result_t cbrt_newton(arg_t val)
{
	arg_t magnitude = fabs(val);
	arg_t estimate = approx_cbrt(magnitude);

	result_t refined = estimate;
#pragma loop unroll
	for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
	{
		refined = cbrt_newton_step(refined, val);
	}
	return copysign(refined, val);
}

__kernel void cbrt_newton_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = cbrt_newton(in[idx]);
}
||||
|
||||
// Reference implementation: the built-in cbrt.
__kernel void cbrt_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = cbrt(in[idx]);
}

// Reference implementation: cube root computed as pow(x, 1/3).
__kernel void cbrt_pow_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = pow(in[idx], 1.0f / 3.0f);
}
@ -0,0 +1,404 @@
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 120 |
||||
#define CL_HPP_CL_1_2_DEFAULT_BUILD 1 |
||||
#define CL_HPP_ENABLE_EXCEPTIONS 1 |
||||
#define CL_HPP_TARGET_OPENCL_VERSION 120 |
||||
#define CL_HPP_MINIMUM_OPENCL_VERSION 120 |
||||
#include <CL/cl.hpp> |
||||
|
||||
#include <algorithm> |
||||
#include <chrono> |
||||
#include <cmath> |
||||
#include <cstdlib> |
||||
#include <cstring> |
||||
#include <fstream> |
||||
#include <functional> |
||||
#include <iostream> |
||||
#include <limits> |
||||
#include <random> |
||||
#include <sstream> |
||||
#include <stdexcept> |
||||
#include <string> |
||||
#include <unistd.h> // geteuid() |
||||
#include <vector> |
||||
|
||||
static constexpr uint32_t DEFAULT_NUM_LINEAR = 12 * 16 * 8; |
||||
static constexpr uint32_t DEFAULT_NUM_RANDOM = 12 * 16 * 8; |
||||
|
||||
// VC4CL performance counters
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 10) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_IDLE_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 11) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL (CL_PROFILING_COMMAND_END + 12) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 13) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_L2_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 14) |
||||
|
||||
// Closed interval [min, max] from which test input values are drawn.
struct Range
{
    float min;
    float max;
};
||||
|
||||
// Type-erased wrapper around a host-side reference function taking 1 to 3
// float arguments. The arity is recorded at construction; calling with a
// mismatched arity throws std::runtime_error.
struct ReferenceFunction
{
    ReferenceFunction(float (*func)(float)) : numParameters(1), ptr(reinterpret_cast<void *>(func)) {}
    ReferenceFunction(float (*func)(float, float)) : numParameters(2), ptr(reinterpret_cast<void *>(func)) {}
    ReferenceFunction(float (*func)(float, float, float)) : numParameters(3), ptr(reinterpret_cast<void *>(func)) {}

    float operator()(float val) const
    {
        checkArity(1);
        return reinterpret_cast<float (*)(float)>(ptr)(val);
    }

    float operator()(float val0, float val1) const
    {
        checkArity(2);
        return reinterpret_cast<float (*)(float, float)>(ptr)(val0, val1);
    }

    float operator()(float val0, float val1, float val2) const
    {
        checkArity(3);
        return reinterpret_cast<float (*)(float, float, float)>(ptr)(val0, val1, val2);
    }

    // Applies the wrapped function element-wise; inputs holds one vector per
    // function parameter, all of equal length.
    std::vector<float> operator()(const std::vector<std::vector<float>> &inputs) const
    {
        std::vector<float> out(inputs.front().size());
        for(std::size_t i = 0; i < out.size(); ++i)
        {
            switch(numParameters)
            {
            case 1:
                out[i] = (*this)(inputs[0][i]);
                break;
            case 2:
                out[i] = (*this)(inputs[0][i], inputs[1][i]);
                break;
            case 3:
                out[i] = (*this)(inputs[0][i], inputs[1][i], inputs[2][i]);
                break;
            }
        }
        return out;
    }

    uint8_t numParameters;
    void *ptr;

private:
    // Rejects calls whose argument count does not match the stored arity.
    void checkArity(uint8_t expected) const
    {
        if(numParameters != expected)
            throw std::runtime_error{"Reference function called with the wrong number of arguments"};
    }
};
||||
|
||||
// Description of a single precision test case.
struct Test
{
    std::string name;            // identifier used to select the test on the command line
    ReferenceFunction reference; // host-side reference implementation
    uint32_t allowedErrorInUlp;  // maximum tolerated deviation from the reference
    std::string sourceFile;      // OpenCL C source file containing the kernels under test
    std::vector<Range> ranges;   // input value ranges to check
};
||||
|
||||
// Pass-through reference function; exercises pure data transfer.
static float identity(float val) { return val; }
||||
|
||||
// XXX OpenCL-CTS calculates reference in double, thus is more accurate. So tests being accurate here might not be in
|
||||
// the CTS!
|
||||
// Registry of all available tests: name, host reference, allowed ULP error,
// kernel source file and the input ranges to check.
static const std::vector<Test> floatTests = {
    Test{"log", logf, 4, "log.cl",
        {
            {0.5, 1.0}, // reduced range some implementations use
            {std::numeric_limits<float>::min(), std::numeric_limits<float>::max()} // full range
        }},
    Test{"exp", expf, 4, "exp.cl",
        {
            {0.0, 0.5f * logf(2.0f)}, // reduced range some implementations use
            {-87.0f /* everything below e^-87 is subnormal */, 89.0f /* everything above e^89 is Inf */} // full range
        }},
    Test{"identity", identity, 0, "identity.cl",
        {
            {-10.0f, 10.0f}, {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
        }},
    Test{"cbrt", cbrtf, 4, "cbrt.cl",
        {
            {-1.0, 1.0}, // limited range for precision testing
            {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
        }},
    Test{"fma", fmaf, 0, "fma.cl",
        {
            {-100.0f, 100.0f}, // reduced range to not run into NaN/Inf
            {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
        }}};
||||
|
||||
static std::vector<float> generateInputData(const Range &range, uint32_t numLinear, uint32_t numRandom) |
||||
{ |
||||
std::vector<float> data{}; |
||||
data.reserve(numLinear + numRandom); |
||||
auto step = (range.max - range.min) / static_cast<float>(numLinear); // TODO overflows on full ranges
|
||||
for(float val = range.min; val < range.max; val += step) |
||||
data.emplace_back(val); |
||||
|
||||
std::random_device rd{}; |
||||
std::default_random_engine gen(rd()); |
||||
std::uniform_real_distribution<> dist{range.min, range.max}; |
||||
|
||||
while(data.size() < (numLinear + numRandom)) |
||||
data.emplace_back(static_cast<float>(dist(gen))); |
||||
|
||||
return data; |
||||
} |
||||
|
||||
static std::vector<std::vector<float>> generateInputData( |
||||
const Range &range, uint32_t numLinear, uint32_t numRandom, uint8_t numInputs) |
||||
{ |
||||
std::vector<std::vector<float>> data{}; |
||||
for(uint8_t i = 0; i < numInputs; ++i) |
||||
data.emplace_back(generateInputData(range, numLinear, numRandom)); |
||||
return data; |
||||
} |
||||
|
||||
// Reads the test's OpenCL C source file, builds it for the given context and
// returns every kernel the program defines.
static std::vector<cl::Kernel> createKernels(const cl::Context &context, const Test &test)
{
    std::stringstream sourceCode;
    {
        // Scoped so the file handle is released right after reading
        std::ifstream sourceFile{test.sourceFile};
        sourceCode << sourceFile.rdbuf();
    }
    cl::Program program(context, sourceCode.str(), true);

    std::vector<cl::Kernel> kernels;
    program.createKernels(&kernels);
    return kernels;
}
||||
|
||||
// A single mismatch between reference and device result, sortable so the
// worst deviations come first.
struct ErrorResult
{
    std::vector<float> inputValues;
    float expected;
    float actual;
    uint32_t errorInUlp;

    // ordered by "most wrong" first
    bool operator<(const ErrorResult &other) const noexcept
    {
        if(errorInUlp != other.errorInUlp)
            return errorInUlp > other.errorInUlp;
        return inputValues < other.inputValues;
    }

    friend std::ostream &operator<<(std::ostream &os, const ErrorResult &error)
    {
        os << "Error of " << error.errorInUlp << " ULP for " << std::scientific;
        switch(error.inputValues.size())
        {
        case 1:
            os << error.inputValues.front();
            break;
        case 2:
            os << '{' << error.inputValues.front() << ", " << error.inputValues.back() << '}';
            break;
        case 3:
            os << '{' << error.inputValues[0] << ", " << error.inputValues[1] << ", " << error.inputValues[2] << '}';
            break;
        default:
            os << '{';
            for(auto input : error.inputValues)
                os << input << ", ";
            os << '}';
            break;
        }
        os << ", expected " << error.expected << ", got " << error.actual << std::defaultfloat;
        return os;
    }
};
||||
|
||||
// Reinterprets the bit pattern of In as Out.
// Replaces the previous union-based type punning: reading a union member
// other than the last one written is undefined behavior in C++ (unlike C).
// std::memcpy is the portable, well-defined idiom and compiles to the same
// code; <cstring> is already included by this file.
template <typename Out, typename In>
static Out bit_cast(In val)
{
    static_assert(sizeof(Out) == sizeof(In), "bit_cast requires types of equal size");
    Out out;
    std::memcpy(&out, &val, sizeof(Out));
    return out;
}
||||
|
||||
static uint32_t calculateError(float reference, float result, uint32_t allowedErrorInUlp) |
||||
{ |
||||
if(std::isinf(reference) && std::isinf(result) && std::signbit(reference) == std::signbit(result)) |
||||
return 0; |
||||
if(std::isnan(reference) && std::isnan(result)) |
||||
return 0; |
||||
// auto ulp = std::abs(reference * std::numeric_limits<float>::epsilon());
|
||||
// float difference = std::abs(result - reference);
|
||||
// if(difference > static_cast<float>(allowedErrorInUlp))
|
||||
// return static_cast<uint32_t>(std::ceil(difference / ulp));
|
||||
// return 0;
|
||||
return static_cast<uint32_t>(std::abs(bit_cast<int32_t>(reference) - bit_cast<int32_t>(result))); |
||||
} |
||||
|
||||
static std::pair<std::vector<ErrorResult>, uint32_t> checkResults(const std::vector<std::vector<float>> &inputs, |
||||
const std::vector<float> &reference, const std::vector<float> &result, uint32_t allowedErrorInUlp) |
||||
{ |
||||
std::vector<ErrorResult> errors; |
||||
uint32_t maxError = 0; |
||||
|
||||
for(std::size_t i = 0; i < std::min(reference.size(), result.size()); ++i) |
||||
{ |
||||
auto error = calculateError(reference.at(i), result.at(i), allowedErrorInUlp); |
||||
maxError = std::max(maxError, error); |
||||
if(error > allowedErrorInUlp) |
||||
{ |
||||
std::vector<float> errorInputs; |
||||
for(const auto &input : inputs) |
||||
errorInputs.push_back(input.at(i)); |
||||
errors.emplace_back(ErrorResult{std::move(errorInputs), reference.at(i), result.at(i), error}); |
||||
} |
||||
} |
||||
|
||||
std::sort(errors.begin(), errors.end()); |
||||
return std::make_pair(std::move(errors), maxError); |
||||
} |
||||
|
||||
// Runs one precision test: for every configured range, generates input data,
// executes each kernel from the test's source file on the device, measures
// host- and device-side timings and reports all results exceeding the
// allowed ULP error.
static void runTest(
    const cl::Context &context, const cl::CommandQueue &queue, const Test &test, uint32_t numLinear, uint32_t numRandom)
{
    std::cout << "Running test " << test.sourceFile << " ..." << std::endl;
    std::cout << "\tRunning " << test.ranges.size() << " ranges with " << (numLinear + numRandom) << " values"
              << std::endl;
    auto kernels = createKernels(context, test);
    std::cout << "\tTesting " << kernels.size() << " implementations " << std::endl;

    for(const auto &range : test.ranges)
    {
        auto inputs = generateInputData(range, numLinear, numRandom, test.reference.numParameters);
        auto inputSize = inputs.front().size();
        // One work-item per 16 values (the kernels operate on float16 vectors)
        cl::NDRange globalSize(inputSize / 16);
        std::vector<float> reference = test.reference(inputs);

        std::vector<cl::Buffer> inputBuffers;
        for(auto &input : inputs)
            inputBuffers.emplace_back(queue, input.begin(), input.end(), true);
        cl::Buffer outputBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, inputSize * sizeof(float));

        for(auto &kernel : kernels)
        {
            // Argument 0 is the output buffer, arguments 1..N the input buffers
            kernel.setArg(0, outputBuffer);
            for(std::size_t i = 0; i < inputBuffers.size(); ++i)
                kernel.setArg(1 + i, inputBuffers[i]);

            std::cout << "\tRunning kernel '" << kernel.getInfo<CL_KERNEL_FUNCTION_NAME>() << "' with "
                      << (inputSize / 16) << " work-items ... " << std::endl;
            auto start = std::chrono::steady_clock::now();
            cl::Event kernelEvent{};
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, cl::NullRange, nullptr, &kernelEvent);
            kernelEvent.wait();
            auto end = std::chrono::steady_clock::now();
            // Host wall-clock time, including enqueue/synchronization overhead
            std::cout << "\t- Finished in "
                      << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us"
                      << std::endl;
            // Pure device execution time from the OpenCL profiling timestamps
            std::chrono::nanoseconds deviceDuration{kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
                kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()};
            std::cout << "\t- Executed for "
                      << std::chrono::duration_cast<std::chrono::microseconds>(deviceDuration).count() << " us"
                      << std::endl;
            if(geteuid() == 0) // TODO only on hardware
            {
                // VC4CL-specific hardware performance counters (root only)
                cl_ulong numInstructions = 0;
                kernelEvent.getProfilingInfo(
                    CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL, &numInstructions);
                cl_ulong numCycles = 0;
                kernelEvent.getProfilingInfo(CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL, &numCycles);
                std::cout << "\t- Executed " << numInstructions << " instructions in " << numCycles << " cycles"
                          << std::endl;
            }

            std::vector<float> result(inputSize);
            queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, inputSize * sizeof(float), result.data());
            auto errors = checkResults(inputs, reference, result, test.allowedErrorInUlp);
            std::cout << "\t- Has " << errors.first.size() << " wrong results and a maximum error of " << errors.second
                      << " ULP (of allowed " << test.allowedErrorInUlp << " ULP)" << std::endl;
            // Print at most the 8 worst offenders
            for(std::size_t i = 0; i < std::min(errors.first.size(), std::size_t{8}); ++i)
                std::cout << "\t\t" << errors.first[i] << std::endl;
            if(errors.first.size() > 8)
                std::cout << "\t\t[...]" << std::endl;
        }
    }
}
||||
|
||||
// Prints usage information, the supported options with their defaults and the
// names of all registered tests.
static void printHelp()
{
    std::cout << "Usage: <program> [<options>] <test> [<test>...]" << std::endl;
    std::cout << "Options: " << std::endl;
    std::cout << "\t--help Shows this help message" << std::endl;
    std::cout << "\t--linear=<num> Specifies the number of linear test values, defaults to " << DEFAULT_NUM_LINEAR
              << std::endl;
    std::cout << "\t--random=<num> Specifies the number of random test values, defaults to " << DEFAULT_NUM_RANDOM
              << std::endl;
    std::cout << "Available tests: ";
    for(const auto &test : floatTests)
        std::cout << test.name << ", ";
    std::cout << std::endl;
}
||||
|
||||
int main(int argc, char **argv) |
||||
{ |
||||
uint32_t numLinear = DEFAULT_NUM_LINEAR; |
||||
uint32_t numRandom = DEFAULT_NUM_RANDOM; |
||||
|
||||
if(argc < 2) |
||||
{ |
||||
printHelp(); |
||||
return EXIT_SUCCESS; |
||||
} |
||||
|
||||
auto platform = cl::Platform::get(); |
||||
cl::Device device{}; |
||||
{ |
||||
std::vector<cl::Device> devices; |
||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); |
||||
if(devices.empty()) |
||||
{ |
||||
std::cout << "No device found!" << std::endl; |
||||
return EXIT_FAILURE; |
||||
} |
||||
device = devices.front(); |
||||
} |
||||
cl::Context context(device); |
||||
cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE); |
||||
|
||||
std::vector<std::reference_wrapper<const Test>> selectedTests; |
||||
for(int i = 1; i < argc; ++i) |
||||
{ |
||||
if(argv[i][0] == '-') |
||||
{ |
||||
if(std::string{"--help"} == argv[i]) |
||||
{ |
||||
printHelp(); |
||||
return EXIT_SUCCESS; |
||||
} |
||||
else if(strstr(argv[i], "--linear=") == argv[i]) |
||||
numLinear = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--linear="))); |
||||
else if(strstr(argv[i], "--random=") == argv[i]) |
||||
numRandom = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--random="))); |
||||
else |
||||
{ |
||||
std::cout << "Unknown option: " << argv[i] << std::endl; |
||||
printHelp(); |
||||
return EXIT_FAILURE; |
||||
} |
||||
} |
||||
auto testIt = |
||||
std::find_if(floatTests.begin(), floatTests.end(), [&](const Test &test) { return test.name == argv[i]; }); |
||||
if(testIt != floatTests.end()) |
||||
selectedTests.emplace_back(std::cref(*testIt)); |
||||
else |
||||
{ |
||||
std::cout << "No such test '" << argv[i] << "', available tests: "; |
||||
for(const auto &test : floatTests) |
||||
std::cout << test.name << ", "; |
||||
std::cout << std::endl; |
||||
return EXIT_FAILURE; |
||||
} |
||||
} |
||||
|
||||
for(const auto &test : selectedTests) |
||||
runTest(context, queue, test.get(), numLinear, numRandom); |
||||
|
||||
return EXIT_SUCCESS; |
||||
} |
@ -0,0 +1,364 @@
|
||||
|
||||
#define arg_t float16
#define result_t float16
#define int_t int16

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// vc4cl_split(double) of M_LN2
#define M_LN2_FF 0xB102E3083F317218

float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));

// Computes 2^val by assembling the IEEE 754 bit pattern directly:
// implied 1.0 mantissa with biased exponent val + 127.
result_t pow2(int_t val)
{
	int_t biasedExp = val << 23;
	// alternative: biasedExp = (val + 127) << 23;
	biasedExp += (int_t) 0x3F800000;
	return CAT(as_, result_t)(biasedExp & (int_t) 0x7F800000);
}
||||
|
||||
// Returns k for the range reduction e^val = 2^k * e^r (r = val - k * ln(2)).
int_t powerOfTwo(arg_t val)
{
	// Original code, produces Inf for e^(~10^38)
	// return CAT(convert_, int_t)(ceil((val / M_LN2_F) - 0.5f));
	// Using floor() instead of ceil(),
	// - fixes Inf for large exponents
	// - slightly reduces accuracy of Chebyshev implementations (by ~4 ULP),
	// - greatly reduces accuracy of Taylor (<10 ULP -> >1200 ULP) -> requires more iterations
	return CAT(convert_, int_t)(floor((val / M_LN2_F) - 0.5f));
}
||||
|
||||
/*
 * Taylor series with Horner's method and range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-6
 */
result_t exp_taylor(arg_t val)
{
	arg_t magnitude = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(magnitude);
	arg_t r = magnitude - CAT(convert_, arg_t)(k) * M_LN2_F;

	// term holds r^i / i!, sum accumulates the partial series
	arg_t term = 1.0f;
	arg_t sum = 1.0f;
#pragma loop unroll
	for(int i = 1; i < 10; i++) // TODO can adjust number of iterations
	{
		term *= r / i;
		sum += term;
	}

	sum = sum * pow2(k);
	// e^-x = 1 / e^x for negative inputs
	return val < 0 ? 1 / sum : sum;
}

__kernel void exp_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = exp_taylor(in[idx]);
}
||||
|
||||
// Taylor series evaluated with VC4CL's extended-precision helpers
// (vc4cl_add/sub/mul on ulong16 operands; exact semantics are defined by the
// VC4CL runtime -- presumably split high/low float pairs, cf. the M_LN2_FF
// comment above -- TODO confirm).
result_t exp_taylor_extended_precision_exact(arg_t val)
{
	arg_t positive = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(positive);
	ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

	ulong16 tk = 0x000000003F800000; // 1.0
	ulong16 tn = 0x000000003F800000; // 1.0

	// Manually unrolled series: tk *= r * (1/i), tn += tk, with the
	// reciprocals pre-encoded as extended-precision constants.
	tk = vc4cl_mul(tk, r);
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003F000000)); // 1 / 2
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB22AAAAB3EAAAAAB)); // 1 / 3
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E800000)); // 1 / 4
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB14CCCCD3E4CCCCD)); // 1 / 5
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1AAAAAB3E2AAAAB)); // 1 / 6
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1DB6DB73E124925)); // 1 / 7
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E000000)); // 1 / 8
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB0638E393DE38E39)); // 1 / 9
	tn = vc4cl_add(tn, tk);
	// removing any iteration makes the result inaccurate (removing last iteration gives 19 ULP)

	result_t result = vc4cl_lossy(tn) * pow2(k);
	return val < 0 ? 1.0f / result : result;
}

// __kernel void exp_taylor_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
//     uint gid = get_global_id(0);
//     out[gid] = exp_taylor_extended_precision_exact(in[gid]);
// }
||||
|
||||
// TODO Lagrange and Barycentric interpolations from https://www.pseudorandom.com/implementing-exp |
||||
|
||||
/*
 * Chebyshev interpolation with range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-18
 */
result_t exp_chebyshev(arg_t val)
{
	// XXX could remove unneeded coefficients once we fix precision
	const float coefficients[] = {
		1.266065877752008335598244625214717537923,
		1.130318207984970054415392055219726613610,
		0.2714953395340765623657051399899818507081,
		0.04433684984866380495257149525979922986386,
		0.00547424044209373265027616843118645948703,
		0.000542926311913943750362147810307554678760,
		0.00004497732295429514665469032811091269841937,
		3.198436462401990505863872976602295688795e-6,
		1.992124806672795725961064384805589035648e-7,
		1.103677172551734432616996091335324170860e-8,
		5.50589607967374725047142040200552692791e-10,
		2.497956616984982522712010934218766985311e-11,
		1.039152230678570050499634672423840849837e-12,
		3.991263356414401512887720401532162026594e-14,
	};
	arg_t magnitude = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(magnitude);
	arg_t r = magnitude - CAT(convert_, arg_t)(k) * M_LN2_F;

	// Chebyshev recurrence T(n+1) = 2*r*T(n) - T(n-1), summed with the
	// coefficients above
	arg_t tPrev = 1.0f;
	arg_t tCurr = r;
	arg_t p = coefficients[0] + (coefficients[1] * r);
#pragma loop unroll
	for(int i = 2; i < 8; i++) // TODO can adjust number of iterations
	{
		arg_t tNext = (2 * r * tCurr) - tPrev;
		p += coefficients[i] * tNext;
		tPrev = tCurr;
		tCurr = tNext;
	}

	p = p * pow2(k);
	return val < 0 ? 1 / p : p;
}

__kernel void exp_chebyshev_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = exp_chebyshev(in[idx]);
}
||||
|
||||
// Chebyshev interpolation evaluated with VC4CL's extended-precision helpers
// (vc4cl_add/sub/mul on ulong16 operands; exact semantics are defined by the
// VC4CL runtime -- presumably split high/low float pairs, cf. the M_LN2_FF
// comment above -- TODO confirm).
result_t exp_chebyshev_extended_precision_exact(arg_t val)
{
	arg_t positive = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(positive);
	ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

	ulong16 ti = 0x000000003F800000; // 1.0
	ulong16 tj = r;
	// 1.266065877752008335598244625214717537923 and 1.130318207984970054415392055219726613610
	ulong16 p = vc4cl_add(0x333386C33FA20E72, vc4cl_mul(0x33395E683F90AE44, r));
	r = vc4cl_mul(r, 0x0000000040000000); // 2.0

	// Manually unrolled Chebyshev recurrence: tk = 2*r*tj - ti, p += c * tk,
	// with each coefficient c pre-encoded as an extended-precision constant.
	ulong16 tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0xB13AF4A23E8B0170, tk)); // 0.2714953395340765623657051399899818507081
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0xB0FC8DF03D359A8F, tk)); // 0.04433684984866380495257149525979922986386
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0xAEA95A453BB36142, tk)); // 0.00547424044209373265027616843118645948703
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x2B7994663A0E532B, tk)); // 0.000542926311913943750362147810307554678760
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x2BC988B0383CA608, tk)); // 0.00004497732295429514665469032811091269841937
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x29A61EF43656A4B8, tk)); // 3.198436462401990505863872976602295688795e-6
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x26B66C3C3455E71C, tk)); // 1.992124806672795725961064384805589035648e-7
	ti = tj;
	tj = tk;
	// removing any iteration makes the result inaccurate (removing last iteration gives 5 ULP)

	result_t result = vc4cl_lossy(p) * pow2(k);
	return val < 0 ? 1.0f / result : result;
}

// __kernel void exp_chebyshev_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
//     uint gid = get_global_id(0);
//     out[gid] = exp_chebyshev_extended_precision_exact(in[gid]);
// }
||||
|
||||
/* |
||||
* Chebyshev interpolation with monomial basis and range reduction, |
||||
* |
||||
* https://www.pseudorandom.com/implementing-exp#section-18 |
||||
*/ |
||||
result_t exp_chebyshev_monomial(arg_t val)
{
    // XXX could remove unneeded coefficients once we fix precision
    // Coefficients ordered from highest to lowest degree, so the Horner loop
    // below can walk the array front to back (degree-14 seed is separate).
    const float horner_coefficients[] = {
        1.632461784798319e-10,
        2.088459690899721e-9,
        2.504861486483735e-8,
        2.755715675968011e-7,
        2.755734045527853e-6,
        2.480158866546844e-5,
        1.984126978734782e-4,
        0.001388888888388,
        0.008333333333342,
        0.041666666666727,
        0.166666666666680,
        0.500000000000002,
        1.000000000000000,
        1.000000000000000,
    };
    arg_t magnitude = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(magnitude);
    arg_t r = magnitude - CAT(convert_, arg_t)(k) * M_LN2_F;

    // Horner evaluation, seeded with the degree-14 coefficient
    arg_t poly = 1.143364767943110e-11;
#pragma loop unroll
    for(int i = 0; i < 14; i++)
    {
        poly = poly * r + horner_coefficients[i];
    }

    // undo the range reduction
    poly = poly * pow2(k);
    // negative arguments via e^-x = 1 / e^x
    return val < 0 ? 1 / poly : poly;
}
||||
|
||||
// Test kernel: one work-item evaluates exp_chebyshev_monomial() for one element.
__kernel void exp_chebyshev_monomial_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = exp_chebyshev_monomial(in[index]);
}
||||
|
||||
// Same Chebyshev/monomial approximation as exp_chebyshev_monomial, but the
// range-reduction remainder r is computed with extended precision before the
// (single-precision) Horner evaluation.
result_t exp_chebyshev_monomial_exact(arg_t val)
{
    // Horner coefficients, highest degree first (degree-14 seed is separate)
    const float horner_coefficients[] = {
        1.632461784798319e-10f,
        2.088459690899721e-9f,
        2.504861486483735e-8f,
        2.755715675968011e-7f,
        2.755734045527853e-6f,
        2.480158866546844e-5f,
        1.984126978734782e-4f,
        0.001388888888388f,
        0.008333333333342f,
        0.041666666666727f,
        0.166666666666680f,
        0.500000000000002f,
        1.000000000000000f,
        1.000000000000000f,
    };
    arg_t magnitude = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(magnitude);
    arg_t kFloat = CAT(convert_, arg_t)(k);
    // extended-precision subtraction keeps the reduction remainder exact
    arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(magnitude), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));

    arg_t poly = 1.143364767943110e-11;
#pragma loop unroll
    for(int i = 0; i < 14; i++)
    {
        poly = poly * r + horner_coefficients[i];
    }

    // undo the range reduction; negative arguments via e^-x = 1 / e^x
    poly = poly * pow2(k);
    return val < 0 ? 1 / poly : poly;
}
||||
|
||||
// Test kernel: one work-item evaluates exp_chebyshev_monomial_exact() for one element.
__kernel void exp_chebyshev_monomial_exact_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = exp_chebyshev_monomial_exact(in[index]);
}
||||
|
||||
// Extended-precision variant of exp_chebyshev_monomial: the whole Horner
// evaluation runs on the extended-float representation (packed into ulong16)
// via the vc4cl_* helpers and is rounded back to single precision only once,
// at the very end. Each hex literal is the extended-precision encoding of the
// decimal coefficient given in its trailing comment.
result_t exp_chebyshev_monomial_extended_precision_exact(arg_t val)
{
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(positive);
    // r is kept in extended precision for the entire polynomial evaluation
    ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

    // Horner scheme, seeded with the degree-14 coefficient, applying the
    // remaining coefficients from highest to lowest degree
    ulong16 pn = 0x209249252D492492; // 1.143364767943110e-11
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA21249252F337DB7); // 1.632461784798319e-10
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x24924925310F8492); // 2.088459690899721e-9
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA65B6DB732D72A7D); // 2.504861486483735e-8
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA85B6DB73493F245); // 2.755715675968011e-7
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA9FDB6DB3638EF27); // 2.755734045527853e-6
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAB60000037D00D02); // 2.480158866546844e-5
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAC65BDB739500D01); // 1.984126978734782e-4
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAE161D323AB60B61); // 0.001388888888388
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAFEEEDB73C088889); // 0.008333333333342
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xB0AAA88B3D2AAAAB); // 0.041666666666727
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xB1AAAA8D3E2AAAAB); // 0.166666666666680
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x271000003F000000); // 0.500000000000002
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000

    // round back to single precision and undo the range reduction;
    // negative arguments via e^-x = 1 / e^x
    result_t result = vc4cl_lossy(pn) * pow2(k);
    return val < 0 ? 1.0f / result : result;
}
||||
|
||||
// __kernel void exp_chebyshev_monomial_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in) |
||||
// { |
||||
// uint gid = get_global_id(0); |
||||
// out[gid] = exp_chebyshev_monomial_extended_precision_exact(in[gid]); |
||||
// } |
||||
|
||||
// TODO Remes from www.netlib.org/fdlibm/e_exp.c |
||||
|
||||
// TODO Matters computational (sections 32.2.2.2 and 32.2.3) |
||||
// Pade Approximation (16 steps): (1680 + 840x + 180 x^2 + 20 x^3 + x^4) / (1680 - 840 x + 180 x^2 - 20 x^3 + x^4) |
||||
|
||||
// TODO https://math.stackexchange.com/questions/1988901/approximating-the-exponential-function-with-taylor-series?rq=1 |
||||
// TODO http://www.netlib.org/fdlibm/ |
||||
|
||||
// Test kernel: reference results via the built-in exp(), for comparison with
// the hand-written approximations above.
__kernel void exp_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = exp(in[index]);
}
@ -0,0 +1,46 @@
|
||||
#define arg_t float16 |
||||
#define result_t float16 |
||||
#define int_t int16 |
||||
#define uint_t uint16 |
||||
|
||||
#define CONCAT(a, b) a##b |
||||
#define CAT(a, b) CONCAT(a, b) |
||||
|
||||
float16 vc4cl_lossy(ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable)); |
||||
|
||||
// Reference implementation of fma: in0 * in1 + in2 in plain single-precision
// arithmetic (i.e. with an intermediate rounding of the product, unlike a
// true fused multiply-add).
result_t fma_simple(arg_t in0, arg_t in1, arg_t in2)
{
    // Fix: fma is a multiply-ADD; the previous code computed in0 * in1 * in2,
    // which is not an fma at all (compare fma_extended_precision below and
    // the built-in fma() used in fma_builtin_kernel).
    return in0 * in1 + in2;
}
||||
|
||||
// Test kernel: one work-item evaluates fma_simple() for one element triple.
__kernel void fma_simple_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    uint index = get_global_id(0);
    out[index] = fma_simple(in0[index], in1[index], in2[index]);
}
||||
|
||||
// fma via the extended-precision helpers: multiply without intermediate
// rounding, add the extended addend, then round back to float exactly once.
result_t fma_extended_precision(arg_t in0, arg_t in1, arg_t in2)
{
    return vc4cl_lossy(vc4cl_add(vc4cl_mul(in0, in1), vc4cl_extend(in2)));
}
||||
|
||||
// Test kernel: one work-item evaluates fma_extended_precision() for one element triple.
__kernel void fma_extended_precision_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    uint index = get_global_id(0);
    out[index] = fma_extended_precision(in0[index], in1[index], in2[index]);
}
||||
|
||||
// Test kernel: reference results via the built-in fma().
__kernel void fma_builtin_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    uint index = get_global_id(0);
    out[index] = fma(in0[index], in1[index], in2[index]);
}
@ -0,0 +1,9 @@
|
||||
#define arg_t float16 |
||||
#define result_t float16 |
||||
#define int_t int16 |
||||
|
||||
// Copies every input element to the output unchanged (one element per work-item).
__kernel void identity_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = in[index];
}
@ -0,0 +1,256 @@
|
||||
#define arg_t float16 |
||||
#define result_t float16 |
||||
#define int_t int16 |
||||
|
||||
/* |
||||
* Helper, arithmetic-geometric-mean, |
||||
* |
||||
* https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean |
||||
*/ |
||||
/*
 * Helper: arithmetic-geometric mean of x and y, fixed 6 iterations.
 *
 * https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean
 */
result_t agm(arg_t x, arg_t y)
{
    arg_t a = x; // running arithmetic mean
    arg_t g = y; // running geometric mean
    for(unsigned i = 0; i < 6; ++i) // TODO can adjust number of iterations
    {
        // both updates must read the previous (a, g) pair
        arg_t a_next = (a + g) / (arg_t) 2.0;
        g = sqrt(a * g);
        a = a_next;
    }
    return a;
}
||||
|
||||
#define CONCAT(a, b) a##b |
||||
#define CAT(a, b) CONCAT(a, b) |
||||
|
||||
/*
 * Splits `val` = S * M * 2^E into a mantissa M in [0.5, 1) (sign preserved)
 * and the exponent contribution E * ln(2), by bit-manipulating the IEEE 754
 * single-precision representation.
 * Declares `mantissa` (arg_t) and `reduced` (result_t) in the expanding
 * scope; callers compute log(mantissa) and add `reduced`.
 */
#define REDUCE_ARGUMENT_TO_0_1 \
    /* log(S * M * 2^E) = log(S * M) + E log(2) */ \
    int_t bitcast = CAT(as_, int_t)(val); \
    /* deduct exponent offset, we use -126, to go into the range [0.5, 1) */ \
    int_t exponent = ((bitcast >> 23) & 0xFF) - 126; \
    /* mask off exponent and replace with exponent for range [0.5, 1) */ \
    int_t signedMantissaBits = (bitcast & (int_t) 0x807FFFFF) | (int_t) 0x3F000000; \
    arg_t mantissa = CAT(as_, result_t)(signedMantissaBits); \
    result_t reduced = CAT(convert_, result_t)(exponent) * M_LN2_F;
||||
|
||||
/* |
||||
* Taylor-series, |
||||
* |
||||
* https://en.wikipedia.org/wiki/Mercator_series |
||||
*/ |
||||
result_t log1p_taylor(arg_t val)
{
    // ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
    // converges for -1 < x <= 1 (requires argument reduction)

    // declares `mantissa` in [0.5, 1) and `reduced` = exponent * ln(2)
    REDUCE_ARGUMENT_TO_0_1

    // NOTE(review): the series below is evaluated at `mantissa` itself, i.e.
    // it sums to ln(1 + mantissa), whereas the reduction yields val =
    // mantissa * 2^exponent and would need ln(mantissa) (series in
    // mantissa - 1). Confirm against the calling kernel and expected results.

    // iteration 1
    result_t result = mantissa;
    arg_t power = mantissa; // mantissa^iteration
#pragma loop unroll
    for(unsigned iteration = 2; iteration <= 26; ++iteration) // TODO can adjust number of iterations
    {
        power *= mantissa;
        // alternating sign: + for odd, - for even iterations
        arg_t sign = iteration & 1 ? (arg_t) 1.0 : (arg_t) -1.0;
        result = result + sign * (arg_t) (1.0 / iteration) * power;
    }
    // add back the exponent contribution from the range reduction
    return result + reduced;
}
||||
|
||||
// Test kernel: since log1p(x) = log(1 + x), feed in[gid] - 1 to test log(in[gid]).
__kernel void log1p_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log1p_taylor(in[index] - (arg_t) 1.0f);
}
||||
|
||||
result_t log1p_taylor_unrolled(arg_t val)
{
    // ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
    // converges for -1 < x <= 1 (requires argument reduction)

    REDUCE_ARGUMENT_TO_0_1

    // Manually unrolled Mercator series: `term` tracks mantissa^k, `acc` the
    // alternating partial sum. Term 1 is mantissa itself; terms 2..26 follow
    // with signs -, +, -, ...
    result_t acc = mantissa;
    arg_t term = mantissa;

    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 2.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 3.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 4.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 5.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 6.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 7.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 8.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 9.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 10.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 11.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 12.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 13.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 14.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 15.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 16.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 17.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 18.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 19.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 20.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 21.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 22.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 23.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 24.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 25.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 26.0) * term;

    // TODO can adjust number of iterations

    // add back the exponent contribution from the range reduction
    return acc + reduced;
}
||||
|
||||
// Test kernel: since log1p(x) = log(1 + x), feed in[gid] - 1 to test log(in[gid]).
__kernel void log1p_taylor_unrolled_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log1p_taylor_unrolled(in[index] - (arg_t) 1.0f);
}
||||
|
||||
/* |
||||
* Taylor series with optimization, requires argument reduction, |
||||
* |
||||
* https://math.stackexchange.com/a/3383716 |
||||
*/ |
||||
result_t log_taylor(arg_t val)
{
    // declares `mantissa` in [0.5, 1) and `reduced` = exponent * ln(2)
    REDUCE_ARGUMENT_TO_0_1

    // artanh-based series: ln(m) = 2 * (t + t^3/3 + t^5/5 + ...) with
    // t = (m - 1) / (m + 1).
    // Fix: the factor 2 must scale the whole sum. The previous version folded
    // it into t before computing factor = tmp * tmp, so every term was
    // (2t)^k / k — too large by a power of 4 for all terms after the first.
    result_t result = 0;
    // iteration 1
    arg_t tmp = (mantissa - (arg_t) 1.0) / (mantissa + (arg_t) 1.0);
    arg_t factor = tmp * tmp; // advances tmp by two odd powers per round
#pragma loop unroll
    for(unsigned iteration = 1; iteration <= 26; iteration += 2) // TODO can adjust number of iterations
    {
        result += tmp / (arg_t) iteration;
        tmp *= factor;
    }
    // double the sum and add back the exponent contribution
    return (arg_t) 2.0 * result + reduced;
}
||||
|
||||
// Test kernel: one work-item evaluates log_taylor() for one element.
__kernel void log_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log_taylor(in[index]);
}
||||
|
||||
/* |
||||
* Using the arithmetic-geometric-mean, |
||||
* |
||||
* https://en.wikipedia.org/wiki/Natural_logarithm#High_precision |
||||
*/ |
||||
result_t log_agm(arg_t val)
{
    // ln(x) ~ pi / (2 * AGM(1, 4 / s)) - m * ln(2), with s = x * 2^m
    const unsigned m = 8; // TODO can adjust for precision
    arg_t s = val * (arg_t) (1 << m);
    // explicit cast of the scalar 1.0, consistent with the other literals here
    arg_t mean = agm((arg_t) 1.0, (arg_t) 4.0 / s);
    // Fix: the AGM formula's numerator is pi alone; the previous version
    // multiplied it by `val`, which skews the result for any argument != 1
    // (e.g. val = 2 previously gave ~6.93 instead of ln(2) ~ 0.693).
    return ((arg_t) M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2);
}
||||
|
||||
// Test kernel: one work-item evaluates log_agm() for one element.
__kernel void log_agm_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log_agm(in[index]);
}
||||
|
||||
result_t log_agm_reduced(arg_t val)
{
    // declares `mantissa` in [0.5, 1) and `reduced` = exponent * ln(2)
    REDUCE_ARGUMENT_TO_0_1

    // ln(mantissa) ~ pi / (2 * AGM(1, 4 / s)) - m * ln(2), with s = mantissa * 2^m
    const unsigned m = 8; // TODO can adjust for precision
    arg_t s = mantissa * (arg_t) (1 << m);
    // explicit cast of the scalar 1.0, consistent with the other literals here
    arg_t mean = agm((arg_t) 1.0, (arg_t) 4.0 / s);
    // Fix: the AGM formula's numerator is pi alone; the previous version
    // multiplied it by `mantissa`, which skews the result (same defect as in
    // log_agm above).
    return ((arg_t) M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2) + reduced;
}
||||
|
||||
// Test kernel: one work-item evaluates log_agm_reduced() for one element.
__kernel void log_agm_reduced_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log_agm_reduced(in[index]);
}
||||
|
||||
// Test kernel: reference results via the built-in log().
__kernel void log_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log(in[index]);
}
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* General header for the VC4CLStdlib implementation, contains all required headers |
||||
* |
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CLSTDLIB_H |
||||
#define VC4CLSTDLIB_H |
||||
|
||||
#ifdef __cplusplus |
||||
extern "C" |
||||
{ |
||||
#endif |
||||
|
||||
#include "_config.h" |
||||
#include "_extensions.h" |
||||
#include "_conversions.h" |
||||
#include "_common.h" |
||||
#include "_math.h" |
||||
#include "_integer.h" |
||||
#include "_geometric.h" |
||||
#include "_relational.h" |
||||
#include "_work_items.h" |
||||
#include "_vector.h" |
||||
#include "_synchronization.h" |
||||
#include "_async.h" |
||||
#include "_atomics.h" |
||||
#include "_images.h" |
||||
#include "_printf.h" |
||||
#include "_spir_mangling.h" |
||||
#include "_clcxx_mangling.h" |
||||
|
||||
#undef ALL_BITS_SET |
||||
#undef OVERLOADABLE |
||||
#undef CONST |
||||
#undef PURE |
||||
#undef INLINE |
||||
#undef FUNC_1 |
||||
#undef OVERLOAD_1 |
||||
#undef OVERLOAD_1_RETURN_SCALAR |
||||
#undef FUNC_2 |
||||
#undef OVERLOAD_2 |
||||
#undef OVERLOAD_2_SCALAR |
||||
#undef OVERLOAD_2_RETURN_SCALAR |
||||
#undef OVERLOAD_2_SCALAR_RETURN_SCALAR |
||||
#undef FUNC_3 |
||||
#undef OVERLOAD_3 |
||||
#undef OVERLOAD_3_SCALAR |
||||
#undef FUNC_4 |
||||
#undef FUNC_5 |
||||
#undef SIMPLE_1 |
||||
#undef SIMPLE_1_RETURN_SCALAR |
||||
#undef SIMPLE_2 |
||||
#undef SIMPLE_2_RETURN_SCALAR |
||||
#undef SIMPLE_2_SCALAR |
||||
#undef SIMPLE_3 |
||||
#undef SIMPLE_3_SCALAR |
||||
#undef SIMPLE_3_TWO_SCALAR |
||||
#undef COMPLEX_1 |
||||
#undef COMPLEX_1_RETURN_SCALAR |
||||
#undef COMPLEX_2 |
||||
#undef COMPLEX_3 |
||||
#undef COMPLEX_3_SCALAR |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_1 |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_2 |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_3 |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_4 |
||||
|
||||
#ifdef __cplusplus |
||||
} |
||||
#endif |
||||
|
||||
#endif /* VC4CLSTDLIB_H */ |
||||
|
@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_ASYNC_H |
||||
#define VC4CL_ASYNC_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
|
||||
|
||||
/*
|
||||
* This is a synchronous/blocking implementation. |
||||
* The copy is "performed by all work-items in a work-group", so any work-item only has to copy a part of the area. |
||||
 * Or, since memory copies on different QPUs block each other, we can simply execute the copying only on the first work-item
||||
* (index 0, 0, 0). Idea taken from PoCL |
||||
*/ |
||||
|
||||
/*
 * Shared body of the async_work_group_copy() overloads: only the first
 * work-item (local id 0) performs the DMA copy, serialized by the hardware
 * mutex (see the file comment above). Expects `dst`, `src` and `num_elements`
 * to be visible in the expanding scope.
 */
#define ASYNC_COPY_INTERNAL \
    if(vc4cl_local_id(0) == 0) \
    { \
        vc4cl_mutex_lock(); \
        vc4cl_dma_copy(dst, src, num_elements); \
        vc4cl_mutex_unlock(); \
    }
||||
|
||||
/*
 * Instantiates all async_work_group_copy() overloads for the given element
 * type — scalar plus vector widths 2/3/4/8/16, in both directions
 * (__global -> __local and __local -> __global).
 * Every overload runs the blocking copy via ASYNC_COPY_INTERNAL and returns
 * the event produced by vc4cl_set_event().
 */
#define ASYNC_COPY(type) \
    INLINE event_t async_work_group_copy(__local type * dst, const __global type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type * dst, const __local type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
|
||||
/*
 * Gathers num_elements elements from `src` with stride `src_stride` into the
 * contiguous buffer `dst`. Note: unlike ASYNC_COPY_INTERNAL there is no
 * local-id-0 guard here, so every work-item executes the full loop.
 */
#define ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
    for (size_t i = 0; i < num_elements; ++i) \
        dst[i] = src[i * src_stride];
//TODO better way, e.g. via vc4cl_dma_copy and stride-parameter?

/*
 * Scatters num_elements contiguous elements from `src` into `dst` with
 * stride `dst_stride` (same caveat as above).
 */
#define ASYNC_STRIDED_DEST_COPY_INTERNAL \
    for (size_t i = 0; i < num_elements; ++i) \
        dst[i * dst_stride] = src[i];
|
||||
/*
 * Instantiates all async_work_group_strided_copy() overloads for the given
 * element type — scalar plus vector widths 2/3/4/8/16. Gather variants
 * (__global -> __local) take `src_stride`, scatter variants
 * (__local -> __global) take `dst_stride`. Each returns the event produced
 * by vc4cl_set_event().
 */
#define ASYNC_STRIDED_COPY(type) \
    INLINE event_t async_work_group_strided_copy(__local type * dst, const __global type * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type * dst, const __local type * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
||||
|
||||
/*
 * Instantiates all prefetch() overloads for the given element type (scalar
 * plus vector widths 2/3/4/8/16); each simply forwards to vc4cl_prefetch().
 */
#define PREFETCH(type) \
    INLINE void prefetch(const __global type * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##2 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##3 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##4 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##8 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##16 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    }
||||
|
||||
/*
|
||||
* OpenCL 1.2, page 278: |
||||
* "Perform an async copy of num_gentypes gentype elements from src to dst. |
||||
* The async copy is performed by all work-items in a work-group and this built-in |
||||
* function must therefore be encountered by all work-items in a work-group executing the kernel with the same argument values." |
||||
*/ |
||||
ASYNC_COPY(uchar) |
||||
ASYNC_COPY(char) |
||||
ASYNC_COPY(ushort) |
||||
ASYNC_COPY(short) |
||||
ASYNC_COPY(uint) |
||||
ASYNC_COPY(int) |
||||
ASYNC_COPY(float) |
||||
|
||||
ASYNC_STRIDED_COPY(uchar) |
||||
ASYNC_STRIDED_COPY(char) |
||||
ASYNC_STRIDED_COPY(ushort) |
||||
ASYNC_STRIDED_COPY(short) |
||||
ASYNC_STRIDED_COPY(uint) |
||||
ASYNC_STRIDED_COPY(int) |
||||
ASYNC_STRIDED_COPY(float) |
||||
|
||||
/*
|
||||
* OpenCL 1.2, page 279: |
||||
* "Wait for events that identify the async_work_group_copy operations to complete. |
||||
* The event objects specified in event_list will be released after the wait is performed." |
||||
*/ |
||||
// Both parameters are intentionally unused: the copies are synchronous, so
// there are no per-event completion states to wait on or release.
INLINE void wait_group_events(int num_events, event_t* event_list) OVERLOADABLE
{
    // async_work_group_copy is blocking, so we don't need to wait for any asynchronous operation to finish
    // But: Since the copy is only performed on the first work-item, we need to wait for it to finish
    // NOTE(review): copies targeting __local memory might additionally need
    // CLK_LOCAL_MEM_FENCE — confirm whether the global fence suffices on this
    // hardware.
    barrier(CLK_GLOBAL_MEM_FENCE);
}
||||
|
||||
/*
|
||||
* OpenCL 1.2, page 280: |
||||
* "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache. |
||||
* The prefetch instruction is applied to a work-item in a work-group and does not affect the functional behavior of the kernel." |
||||
* |
||||
* -> Since it doesn't affect the functional behavior, the implementation is a no-op |
||||
*/ |
||||
PREFETCH(uchar) |
||||
PREFETCH(char) |
||||
PREFETCH(ushort) |
||||
PREFETCH(short) |
||||
PREFETCH(uint) |
||||
PREFETCH(int) |
||||
PREFETCH(float) |
||||
|
||||
#undef ASYNC_COPY_INTERNAL |
||||
#undef ASYNC_COPY |
||||
#undef ASYNC_STRIDED_SOURCE_COPY_INTERNAL |
||||
#undef ASYNC_STRIDED_DEST_COPY_INTERNAL |
||||
#undef ASYNC_STRIDED_COPY |
||||
#undef PREFETCH |
||||
|
||||
#endif /* VC4CL_ASYNC_H */ |
||||
|
@ -0,0 +1,659 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_ATOMICS_H |
||||
#define VC4CL_ATOMICS_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
// atomic_add: atomically stores *ptr + val and returns the previous value.
// All atomics in this file follow the same scheme: serialize via the single
// hardware mutex, then do a DMA read-modify-write.
INLINE int atomic_add(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __global memory
INLINE unsigned int atomic_add(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}

// signed variant for __local memory
INLINE int atomic_add(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __local memory
INLINE unsigned int atomic_add(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}
||||
|
||||
// atom_add: aliases for atomic_add — presumably provided for the legacy
// cl_khr_*_int32_base_atomics naming (TODO confirm which extensions are
// advertised); all four simply forward to the corresponding overload above.
INLINE int atom_add(volatile __global int *ptr, int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE unsigned int atom_add(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE int atom_add(volatile __local int *ptr, int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE unsigned int atom_add(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}
||||
|
||||
// atomic_sub: atomically stores *ptr - val and returns the previous value
// (mutex-serialized DMA read-modify-write, as for atomic_add).
INLINE int atomic_sub(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __global memory
INLINE unsigned int atomic_sub(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}

// signed variant for __local memory
INLINE int atomic_sub(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __local memory
INLINE unsigned int atomic_sub(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}
||||
|
||||
// atom_sub: legacy-named aliases forwarding to the atomic_sub overloads above.
INLINE int atom_sub(volatile __global int *ptr, int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE unsigned int atom_sub(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE int atom_sub(volatile __local int *ptr, int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE unsigned int atom_sub(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}
||||
|
||||
// atomic_xchg: atomically stores val and returns the previous value
// (mutex-serialized DMA read-then-write). Unlike add/sub, a float overload
// exists per the OpenCL atomic_xchg specification.
INLINE int atomic_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __global memory
INLINE unsigned int atomic_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// float variant for __global memory
INLINE float atomic_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    float old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// signed variant for __local memory
INLINE int atomic_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __local memory
INLINE unsigned int atomic_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// float variant for __local memory
INLINE float atomic_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    float old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}
||||
|
||||
// Legacy OpenCL 1.0 "atom_xchg" spellings; each forwards to the matching
// atomic_xchg overload (including the float variants).
INLINE int atom_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE unsigned int atom_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE float atom_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE int atom_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE unsigned int atom_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE float atom_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}
||||
|
||||
INLINE int atomic_inc(volatile __global int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_inc(volatile __global unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_inc(volatile __local int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_inc(volatile __local unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_inc" spellings; each forwards to the matching
// atomic_inc overload.
INLINE int atom_inc(volatile __global int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE unsigned int atom_inc(volatile __global unsigned int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE int atom_inc(volatile __local int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE unsigned int atom_inc(volatile __local unsigned int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}
||||
|
||||
INLINE int atomic_dec(volatile __global int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_dec(volatile __global unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_dec(volatile __local int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_dec(volatile __local unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_dec" spellings; each forwards to the matching
// atomic_dec overload.
INLINE int atom_dec(volatile __global int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE unsigned int atom_dec(volatile __global unsigned int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE int atom_dec(volatile __local int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE unsigned int atom_dec(volatile __local unsigned int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}
||||
|
||||
INLINE int atomic_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_cmpxchg" spellings; each forwards to the matching
// atomic_cmpxchg overload.
INLINE int atom_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE unsigned int atom_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE int atom_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE unsigned int atom_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}
||||
|
||||
INLINE int atomic_min(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_min(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_min" spellings; each forwards to the matching
// atomic_min overload.
INLINE int atom_min(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE unsigned int atom_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE int atom_min(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE unsigned int atom_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}
||||
|
||||
INLINE int atomic_max(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_max(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_max" spellings; each forwards to the matching
// atomic_max overload.
INLINE int atom_max(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE unsigned int atom_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE int atom_max(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE unsigned int atom_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}
||||
|
||||
INLINE int atomic_and(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_and(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_and" spellings; each forwards to the matching
// atomic_and overload.
INLINE int atom_and(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE unsigned int atom_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE int atom_and(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE unsigned int atom_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}
||||
|
||||
INLINE int atomic_or(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_or(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_or" spellings; each forwards to the matching
// atomic_or overload.
INLINE int atom_or(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE unsigned int atom_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE int atom_or(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE unsigned int atom_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}
||||
|
||||
INLINE int atomic_xor(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_xor(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_xor" spellings; each forwards to the matching
// atomic_xor overload.
INLINE int atom_xor(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE unsigned int atom_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE int atom_xor(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE unsigned int atom_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}
||||
|
||||
#endif /* VC4CL_ATOMICS_H */ |
||||
|
@ -0,0 +1,411 @@
|
||||
/*
|
||||
* OpenCL 2.0 introduces the __generic address space, which is also used by C++ for OpenCL C. |
||||
* |
||||
 * Since we do not actually care about address spaces (so far), we can just map those functions to one of the existing address spaces.
||||
* |
||||
* Base list of affected functions generated with: |
||||
* llvm-dis -o /dev/stdout ../VC4CLStdLib/include/VC4CLStdLib.bc | grep -oE 'spir_func .?* \S*AS1.*?\)' | sort |
||||
* |
||||
* This header contains wrapper for the SPIR-mangled functions to the real implementations |
||||
*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
#ifndef VC4CL_GENERIC_MANGLING |
||||
#define VC4CL_GENERIC_MANGLING |
||||
|
||||
#include "_config.h" |
||||
|
||||
/*
 * __generic (address space 4) entry points for the math built-ins that take an
 * output pointer: modf, fract, frexp, remquo, sincos, lgamma_r, for scalar
 * float and the float2/3/4/8/16 vector widths.
 *
 * Each SPIR-mangled AS4 symbol is declared as a weak alias of the already
 * implemented __global (AS1) variant, so the generic-address-space call simply
 * reuses the AS1 code. NOTE(review): this assumes pointer representation is
 * identical across address spaces on this target — confirm against the
 * backend.
 */
float _Z4modffPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z4modffPU3AS1f")));
float _Z5fractfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z5fractfPU3AS1f")));
float _Z5frexpfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z5frexpfPU3AS1i")));
float _Z6remquoffPU3AS4i(float, float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6remquoffPU3AS1i")));
float _Z6sincosfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6sincosfPU3AS1f")));
float _Z8lgamma_rfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8lgamma_rfPU3AS1i")));
float2 _Z4modfDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z4modfDv2_fPU3AS1S_")));
float2 _Z5fractDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z5fractDv2_fPU3AS1S_")));
float2 _Z5frexpDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z5frexpDv2_fPU3AS1Dv2_i")));
float2 _Z6remquoDv2_fS_PU3AS4Dv2_i(float2, float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z6remquoDv2_fS_PU3AS1Dv2_i")));
float2 _Z6sincosDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z6sincosDv2_fPU3AS1S_")));
float2 _Z8lgamma_rDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z8lgamma_rDv2_fPU3AS1Dv2_i")));
float3 _Z4modfDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z4modfDv3_fPU3AS1S_")));
float3 _Z5fractDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z5fractDv3_fPU3AS1S_")));
float3 _Z5frexpDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z5frexpDv3_fPU3AS1Dv3_i")));
float3 _Z6remquoDv3_fS_PU3AS4Dv3_i(float3, float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z6remquoDv3_fS_PU3AS1Dv3_i")));
float3 _Z6sincosDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z6sincosDv3_fPU3AS1S_")));
float3 _Z8lgamma_rDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z8lgamma_rDv3_fPU3AS1Dv3_i")));
float4 _Z4modfDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z4modfDv4_fPU3AS1S_")));
float4 _Z5fractDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z5fractDv4_fPU3AS1S_")));
float4 _Z5frexpDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z5frexpDv4_fPU3AS1Dv4_i")));
float4 _Z6remquoDv4_fS_PU3AS4Dv4_i(float4, float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z6remquoDv4_fS_PU3AS1Dv4_i")));
float4 _Z6sincosDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z6sincosDv4_fPU3AS1S_")));
float4 _Z8lgamma_rDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z8lgamma_rDv4_fPU3AS1Dv4_i")));
float8 _Z4modfDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z4modfDv8_fPU3AS1S_")));
float8 _Z5fractDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z5fractDv8_fPU3AS1S_")));
float8 _Z5frexpDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z5frexpDv8_fPU3AS1Dv8_i")));
float8 _Z6remquoDv8_fS_PU3AS4Dv8_i(float8, float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z6remquoDv8_fS_PU3AS1Dv8_i")));
float8 _Z6sincosDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z6sincosDv8_fPU3AS1S_")));
float8 _Z8lgamma_rDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z8lgamma_rDv8_fPU3AS1Dv8_i")));
float16 _Z4modfDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z4modfDv16_fPU3AS1S_")));
float16 _Z5fractDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z5fractDv16_fPU3AS1S_")));
float16 _Z5frexpDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z5frexpDv16_fPU3AS1Dv16_i")));
float16 _Z6remquoDv16_fS_PU3AS4Dv16_i(float16, float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z6remquoDv16_fS_PU3AS1Dv16_i")));
float16 _Z6sincosDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z6sincosDv16_fPU3AS1S_")));
float16 _Z8lgamma_rDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z8lgamma_rDv16_fPU3AS1Dv16_i")));
||||
|
||||
/*
 * __generic (address space 4) entry points for vload2/3/4/8/16 over all nine
 * element types (char..float). Each SPIR-mangled AS4 symbol is a weak alias
 * of the existing __global (AS1) implementation.
 */
char2 _Z6vload2jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kc")));
uchar2 _Z6vload2jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kh")));
short2 _Z6vload2jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ks")));
ushort2 _Z6vload2jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kt")));
int2 _Z6vload2jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ki")));
uint2 _Z6vload2jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kj")));
long2 _Z6vload2jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kl")));
ulong2 _Z6vload2jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload2jPU3AS1Km")));
float2 _Z6vload2jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kf")));
char3 _Z6vload3jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kc")));
uchar3 _Z6vload3jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kh")));
short3 _Z6vload3jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ks")));
ushort3 _Z6vload3jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kt")));
int3 _Z6vload3jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ki")));
uint3 _Z6vload3jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kj")));
long3 _Z6vload3jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kl")));
ulong3 _Z6vload3jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload3jPU3AS1Km")));
float3 _Z6vload3jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kf")));
char4 _Z6vload4jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kc")));
uchar4 _Z6vload4jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kh")));
short4 _Z6vload4jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ks")));
ushort4 _Z6vload4jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kt")));
int4 _Z6vload4jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ki")));
uint4 _Z6vload4jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kj")));
long4 _Z6vload4jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kl")));
ulong4 _Z6vload4jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload4jPU3AS1Km")));
float4 _Z6vload4jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kf")));
char8 _Z6vload8jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kc")));
uchar8 _Z6vload8jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kh")));
short8 _Z6vload8jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ks")));
ushort8 _Z6vload8jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kt")));
int8 _Z6vload8jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ki")));
uint8 _Z6vload8jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kj")));
long8 _Z6vload8jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kl")));
ulong8 _Z6vload8jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload8jPU3AS1Km")));
float8 _Z6vload8jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kf")));
char16 _Z7vload16jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kc")));
uchar16 _Z7vload16jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kh")));
short16 _Z7vload16jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ks")));
ushort16 _Z7vload16jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kt")));
int16 _Z7vload16jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ki")));
uint16 _Z7vload16jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kj")));
long16 _Z7vload16jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kl")));
ulong16 _Z7vload16jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vload16jPU3AS1Km")));
float16 _Z7vload16jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kf")));
||||
|
||||
/*
 * __generic (address space 4) entry points for vstore2/3/4/8 over all nine
 * element types, plus the beginning of the vstore16 set (the remaining
 * vstore16 declarations continue below this chunk). Each SPIR-mangled AS4
 * symbol is a weak alias of the existing __global (AS1) implementation.
 */
void _Z7vstore2Dv2_cjPU3AS4c(char2, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore2Dv2_cjPU3AS1c")));
void _Z7vstore2Dv2_hjPU3AS4h(uchar2, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore2Dv2_hjPU3AS1h")));
void _Z7vstore2Dv2_sjPU3AS4s(short2, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore2Dv2_sjPU3AS1s")));
void _Z7vstore2Dv2_tjPU3AS4t(ushort2, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore2Dv2_tjPU3AS1t")));
void _Z7vstore2Dv2_ijPU3AS4i(int2, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore2Dv2_ijPU3AS1i")));
void _Z7vstore2Dv2_jjPU3AS4j(uint2, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore2Dv2_jjPU3AS1j")));
void _Z7vstore2Dv2_ljPU3AS4l(long2, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore2Dv2_ljPU3AS1l")));
void _Z7vstore2Dv2_mjPU3AS4m(ulong2, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore2Dv2_mjPU3AS1m")));
void _Z7vstore2Dv2_fjPU3AS4f(float2, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore2Dv2_fjPU3AS1f")));
void _Z7vstore3Dv3_cjPU3AS4c(char3, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore3Dv3_cjPU3AS1c")));
void _Z7vstore3Dv3_hjPU3AS4h(uchar3, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore3Dv3_hjPU3AS1h")));
void _Z7vstore3Dv3_sjPU3AS4s(short3, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore3Dv3_sjPU3AS1s")));
void _Z7vstore3Dv3_tjPU3AS4t(ushort3, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore3Dv3_tjPU3AS1t")));
void _Z7vstore3Dv3_ijPU3AS4i(int3, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore3Dv3_ijPU3AS1i")));
void _Z7vstore3Dv3_jjPU3AS4j(uint3, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore3Dv3_jjPU3AS1j")));
void _Z7vstore3Dv3_ljPU3AS4l(long3, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore3Dv3_ljPU3AS1l")));
void _Z7vstore3Dv3_mjPU3AS4m(ulong3, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore3Dv3_mjPU3AS1m")));
void _Z7vstore3Dv3_fjPU3AS4f(float3, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore3Dv3_fjPU3AS1f")));
void _Z7vstore4Dv4_cjPU3AS4c(char4, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore4Dv4_cjPU3AS1c")));
void _Z7vstore4Dv4_hjPU3AS4h(uchar4, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore4Dv4_hjPU3AS1h")));
void _Z7vstore4Dv4_sjPU3AS4s(short4, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore4Dv4_sjPU3AS1s")));
void _Z7vstore4Dv4_tjPU3AS4t(ushort4, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore4Dv4_tjPU3AS1t")));
void _Z7vstore4Dv4_ijPU3AS4i(int4, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore4Dv4_ijPU3AS1i")));
void _Z7vstore4Dv4_jjPU3AS4j(uint4, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore4Dv4_jjPU3AS1j")));
void _Z7vstore4Dv4_ljPU3AS4l(long4, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore4Dv4_ljPU3AS1l")));
void _Z7vstore4Dv4_mjPU3AS4m(ulong4, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore4Dv4_mjPU3AS1m")));
void _Z7vstore4Dv4_fjPU3AS4f(float4, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore4Dv4_fjPU3AS1f")));
void _Z7vstore8Dv8_cjPU3AS4c(char8, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore8Dv8_cjPU3AS1c")));
void _Z7vstore8Dv8_hjPU3AS4h(uchar8, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore8Dv8_hjPU3AS1h")));
void _Z7vstore8Dv8_sjPU3AS4s(short8, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore8Dv8_sjPU3AS1s")));
void _Z7vstore8Dv8_tjPU3AS4t(ushort8, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore8Dv8_tjPU3AS1t")));
void _Z7vstore8Dv8_ijPU3AS4i(int8, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore8Dv8_ijPU3AS1i")));
void _Z7vstore8Dv8_jjPU3AS4j(uint8, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore8Dv8_jjPU3AS1j")));
void _Z7vstore8Dv8_ljPU3AS4l(long8, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore8Dv8_ljPU3AS1l")));
void _Z7vstore8Dv8_mjPU3AS4m(ulong8, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore8Dv8_mjPU3AS1m")));
void _Z7vstore8Dv8_fjPU3AS4f(float8, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore8Dv8_fjPU3AS1f")));
void _Z8vstore16Dv16_cjPU3AS4c(char16, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z8vstore16Dv16_cjPU3AS1c")));
||||
void _Z8vstore16Dv16_hjPU3AS4h(uchar16, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z8vstore16Dv16_hjPU3AS1h"))); |
||||
void _Z8vstore16Dv16_sjPU3AS4s(short16, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z8vstore16Dv16_sjPU3AS1s"))); |
||||
void _Z8vstore16Dv16_tjPU3AS4t(ushort16, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z8vstore16Dv16_tjPU3AS1t"))); |
||||
void _Z8vstore16Dv16_ijPU3AS4i(int16, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8vstore16Dv16_ijPU3AS1i"))); |
||||
void _Z8vstore16Dv16_jjPU3AS4j(uint16, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8vstore16Dv16_jjPU3AS1j"))); |
||||
void _Z8vstore16Dv16_ljPU3AS4l(long16, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z8vstore16Dv16_ljPU3AS1l"))); |
||||
void _Z8vstore16Dv16_mjPU3AS4m(ulong16, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z8vstore16Dv16_mjPU3AS1m"))); |
||||
void _Z8vstore16Dv16_fjPU3AS4f(float16, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z8vstore16Dv16_fjPU3AS1f"))); |
||||
|
||||
int _Z10atomic_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vii"))); |
||||
uint _Z10atomic_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vjj"))); |
||||
int _Z8atom_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_andPU3AS1Vii"))); |
||||
uint _Z8atom_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_andPU3AS1Vjj"))); |
||||
int _Z9atomic_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vii"))); |
||||
uint _Z9atomic_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vjj"))); |
||||
int _Z7atom_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z7atom_orPU3AS1Vii"))); |
||||
uint _Z7atom_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z7atom_orPU3AS1Vjj"))); |
||||
int _Z10atomic_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vii"))); |
||||
uint _Z10atomic_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vjj"))); |
||||
int _Z8atom_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vii"))); |
||||
uint _Z8atom_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vjj"))); |
||||
int _Z10atomic_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vi"))); |
||||
uint _Z10atomic_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vj"))); |
||||
int _Z8atom_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vi"))); |
||||
uint _Z8atom_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vj"))); |
||||
int _Z10atomic_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vi"))); |
||||
uint _Z10atomic_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vj"))); |
||||
int _Z8atom_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vi"))); |
||||
uint _Z8atom_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vj"))); |
||||
int _Z10atomic_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vii"))); |
||||
uint _Z10atomic_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vjj"))); |
||||
int _Z8atom_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vii"))); |
||||
uint _Z8atom_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vjj"))); |
||||
int _Z10atomic_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vii"))); |
||||
uint _Z10atomic_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vjj"))); |
||||
int _Z8atom_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_minPU3AS1Vii"))); |
||||
uint _Z8atom_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_minPU3AS1Vjj"))); |
||||
int _Z10atomic_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vii"))); |
||||
/*
 * Weak alias stubs for the address-space-4 overloads of the OpenCL 1.x atomic
 * built-ins (atomic_add/sub/xchg/cmpxchg and their legacy atom_* spellings).
 *
 * Each declaration uses the Itanium-mangled name of the AS4 overload
 * (…PU3AS4V…: pointer to address_space(4), volatile-qualified) and aliases it
 * to the already-present AS1 implementation (…PU3AS1V…), so both symbols
 * resolve to the same code. The "weak" attribute lets a real AS4 definition
 * elsewhere override these stubs at link time.
 *
 * NOTE(review): this assumes address-space-4 pointers can be serviced by the
 * address-space-1 (__global) implementations unchanged — confirm against
 * VC4CL's address-space mapping before relying on it.
 */
/* atomic_add / atom_add: int and uint variants */
uint _Z10atomic_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vjj"))); |
||||
int _Z8atom_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_addPU3AS1Vii"))); |
||||
uint _Z8atom_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_addPU3AS1Vjj"))); |
||||
/* atomic_sub / atom_sub: int and uint variants */
int _Z10atomic_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vii"))); |
||||
uint _Z10atomic_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vjj"))); |
||||
int _Z8atom_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_subPU3AS1Vii"))); |
||||
uint _Z8atom_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_subPU3AS1Vjj"))); |
||||
/* atomic_xchg / atom_xchg: int, uint and (per OpenCL spec) float variants */
int _Z11atomic_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vii"))); |
||||
uint _Z11atomic_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vjj"))); |
||||
float _Z11atomic_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vff"))); |
||||
int _Z9atom_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vii"))); |
||||
uint _Z9atom_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vjj"))); |
||||
float _Z9atom_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vff"))); |
||||
/* atom_cmpxchg / atomic_cmpxchg: three-argument compare-and-swap (ptr, expected, desired) */
int _Z12atom_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Viii"))); |
||||
uint _Z12atom_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Vjjj"))); |
||||
int _Z14atomic_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Viii"))); |
||||
uint _Z14atomic_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Vjjj"))); |
||||
|
||||
/*
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event"))); |
||||
|
||||
TODO missing wait_group_events function(s) |
||||
|
||||
void _Z8prefetchPU3AS1Kcj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kcj"))); |
||||
void _Z8prefetchPU3AS1KDv16_cj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_cj"))); |
||||
void _Z8prefetchPU3AS1KDv16_fj(__attribute__((address_space(4))) float16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_fj"))); |
||||
void _Z8prefetchPU3AS1KDv16_hj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_hj"))); |
||||
void _Z8prefetchPU3AS1KDv16_ij(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_ij"))); |
||||
void _Z8prefetchPU3AS1KDv16_jj(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_jj"))); |
||||
void _Z8prefetchPU3AS1KDv16_sj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_sj"))); |
||||
void _Z8prefetchPU3AS1KDv16_tj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_tj"))); |
||||
void _Z8prefetchPU3AS1KDv2_cj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_cj"))); |
||||
void _Z8prefetchPU3AS1KDv2_fj(__attribute__((address_space(4))) float2*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_fj"))); |
||||
void _Z8prefetchPU3AS1KDv2_hj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_hj"))); |
||||
void _Z8prefetchPU3AS1KDv2_ij(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_ij"))); |
||||
void _Z8prefetchPU3AS1KDv2_jj(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_jj"))); |
||||
void _Z8prefetchPU3AS1KDv2_sj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_sj"))); |
||||
void _Z8prefetchPU3AS1KDv2_tj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_tj"))); |
||||
void _Z8prefetchPU3AS1KDv3_cj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_cj"))); |
||||
void _Z8prefetchPU3AS1KDv3_fj(__attribute__((address_space(4))) float3*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_fj"))); |
||||
void _Z8prefetchPU3AS1KDv3_hj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_hj"))); |
||||
void _Z8prefetchPU3AS1KDv3_ij(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_ij"))); |
||||
void _Z8prefetchPU3AS1KDv3_jj(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_jj"))); |
||||
void _Z8prefetchPU3AS1KDv3_sj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_sj"))); |
||||
void _Z8prefetchPU3AS1KDv3_tj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_tj"))); |
||||
void _Z8prefetchPU3AS1KDv4_cj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_cj"))); |
||||
void _Z8prefetchPU3AS1KDv4_fj(__attribute__((address_space(4))) float4*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_fj"))); |
||||
void _Z8prefetchPU3AS1KDv4_hj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_hj"))); |
||||
void _Z8prefetchPU3AS1KDv4_ij(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_ij"))); |
||||
void _Z8prefetchPU3AS1KDv4_jj(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_jj"))); |
||||
void _Z8prefetchPU3AS1KDv4_sj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_sj"))); |
||||
void _Z8prefetchPU3AS1KDv4_tj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_tj"))); |
||||
void _Z8prefetchPU3AS1KDv8_cj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_cj"))); |
||||
void _Z8prefetchPU3AS1KDv8_fj(__attribute__((address_space(4))) float8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_fj"))); |
||||
void _Z8prefetchPU3AS1KDv8_hj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_hj"))); |
||||
void _Z8prefetchPU3AS1KDv8_ij(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_ij"))); |
||||
void _Z8prefetchPU3AS1KDv8_jj(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_jj"))); |
||||
void _Z8prefetchPU3AS1KDv8_sj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_sj"))); |
||||
void _Z8prefetchPU3AS1KDv8_tj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_tj"))); |
||||
void _Z8prefetchPU3AS1Kfj(__attribute__((address_space(4))) float*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kfj"))); |
||||
void _Z8prefetchPU3AS1Khj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Khj"))); |
||||
void _Z8prefetchPU3AS1Kij(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kij"))); |
||||
void _Z8prefetchPU3AS1Kjj(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kjj"))); |
||||
void _Z8prefetchPU3AS1Ksj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ksj"))); |
||||
void _Z8prefetchPU3AS1Ktj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ktj"))); |
||||
*/ |
||||
#endif /* VC4CL_GENERIC_MANGLING */ |
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_COMMON_H |
||||
#define VC4CL_COMMON_H |
||||
|
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
|
||||
/*
|
||||
* Common functions |
||||
* |
||||
* Some functions have no maximum error in the OpenCL specification, see here: https://github.com/KhronosGroup/OpenCL-Docs/issues/33
|
||||
* |
||||
* degrees -> 2 ULP |
||||
* radians -> 2 ULP |
||||
* mix -> "implementation defined" |
||||
* smoothstep -> "implementation defined" |
||||
* clamp, min, max, step, sign -> 0 ULP |
||||
*/ |
||||
|
||||
// clamp: limit x to the range [minval, maxval]; 0 ULP error required by the OpenCL spec (see list above)
SIMPLE_3(float, clamp, float, x, float, minval, float, maxval, fmin(fmax(x, minval), maxval)) |
||||
//TODO version with limits as scalar
|
||||
|
||||
// NOTE: using 0x1.ca5dc2p+5 (= 180/M_PI_F + 1 ULP) is slightly more accurate than using 0x1.ca5dcp+5 (180 / M_PI_F),
|
||||
// but both are accurate enough for 2 ULP maximum error
|
||||
// degrees(r) = (180 / pi) * r, using the precomputed hex-float constant discussed above (2 ULP allowed)
SIMPLE_1(float, degrees, float, radians, 0x1.ca5dc2p+5 * radians) |
||||
|
||||
// Results are undefined for one of the inputs NaN or Inf,
|
||||
// so we can directly call the intrinsic and don't need to handle these inputs explicitly
|
||||
// max: component-wise maximum via the intrinsic; results undefined for NaN/Inf inputs (see note above)
SIMPLE_2(float, max, float, x, float, y, vc4cl_fmax(x, y)) |
||||
// max with a scalar y broadcast across all vector components
SIMPLE_2_SCALAR(float, max, float, x, float, y, vc4cl_fmax(x, y)) |
||||
|
||||
// min: component-wise minimum via the intrinsic; results undefined for NaN/Inf inputs (see note above)
SIMPLE_2(float, min, float, x, float, y, vc4cl_fmin(x, y)) |
||||
// min with a scalar y broadcast across all vector components
SIMPLE_2_SCALAR(float, min, float, x, float, y, vc4cl_fmin(x, y)) |
||||
|
||||
//" Returns the linear blend of x and y implemented as:
|
||||
// x + (y - x) * a
|
||||
// a must be a value in the range 0.0 ... 1.0. If a is not in the range 0.0 ... 1.0, the return values are undefined. "
|
||||
|
||||
// mix: linear blend x + (y - x) * a; undefined if a lies outside [0.0, 1.0] (spec quote above)
SIMPLE_3(float, mix, float, x, float, y, float, a, x + (y - x) * a) |
||||
// mix with a scalar blend factor a broadcast across all vector components
SIMPLE_3_SCALAR(float, mix, float, x, float, y, float, a, x + (y - x) * a) |
||||
|
||||
// radians(d) = (pi / 180) * d; 2 ULP maximum error allowed (see list above)
SIMPLE_1(float, radians, float, degrees, (M_PI_F / 180) * degrees) |
||||
|
||||
// step: 0.0f where val < edge, 1.0f otherwise (component-wise)
SIMPLE_2(float, step, float, edge, float, val, val < edge ? 0.0f : 1.0f) |
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float2 step(float edge, float2 val) OVERLOADABLE
{
	const float2 edges = (float2) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float3 step(float edge, float3 val) OVERLOADABLE
{
	const float3 edges = (float3) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float4 step(float edge, float4 val) OVERLOADABLE
{
	const float4 edges = (float4) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float8 step(float edge, float8 val) OVERLOADABLE
{
	const float8 edges = (float8) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float16 step(float edge, float16 val) OVERLOADABLE
{
	const float16 edges = (float16) edge;
	return step(edges, val);
}
||||
|
||||
COMPLEX_3(float, smoothstep, float, edge0, float, edge1, float, val, |
||||
{ |
||||
result_t tmp = clamp((result_t) (val - edge0) / (edge1 - edge0), (result_t)0.0f, (result_t)1.0f); |
||||
return tmp * tmp * (3 - 2 * tmp); |
||||
}) |
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float2 smoothstep(float edge0, float edge1, float2 val) OVERLOADABLE
{
	const float2 lower = (float2) edge0;
	const float2 upper = (float2) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float3 smoothstep(float edge0, float edge1, float3 val) OVERLOADABLE
{
	const float3 lower = (float3) edge0;
	const float3 upper = (float3) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float4 smoothstep(float edge0, float edge1, float4 val) OVERLOADABLE
{
	const float4 lower = (float4) edge0;
	const float4 upper = (float4) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float8 smoothstep(float edge0, float edge1, float8 val) OVERLOADABLE
{
	const float8 lower = (float8) edge0;
	const float8 upper = (float8) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float16 smoothstep(float edge0, float edge1, float16 val) OVERLOADABLE
{
	const float16 lower = (float16) edge0;
	const float16 upper = (float16) edge1;
	return smoothstep(lower, upper, val);
}
||||
|
||||
// sign: -1/0/+1 by comparison; NaN fails both comparisons and thus yields 0.0f, matching the spec.
// NOTE(review): sign(-0.0f) also falls through to +0.0f here, but the OpenCL spec requires -0.0f for
// a -0.0f input - confirm whether this deviation is intentional.
SIMPLE_1(float, sign, float, val, val > 0.0f ? 1.0f : val < 0.0f ? -1.0f : 0.0f) |
||||
|
||||
#endif /* VC4CL_COMMON_H */ |
||||
|
@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_CONFIG_H |
||||
#define VC4CL_CONFIG_H |
||||
|
||||
#include "defines.h" |
||||
|
||||
#include "opencl-c.h" |
||||
|
||||
#ifndef NULL |
||||
#define NULL ((void *)0) |
||||
#endif |
||||
|
||||
/*
|
||||
* Math constants |
||||
*/ |
||||
/* NOTE(review): 3.01029995663981195214 equals 10*log10(2), not log_2(10) (~3.3219) - confirm the intended value and name */
#define M_LOG210 3.01029995663981195214f /* log_2(10) */ |
||||
#undef NAN |
||||
#define NAN 0x7fffffffU /* same as defined in OpenCL C, but as integer */ |
||||
#undef INF |
||||
#define INF 0x7f800000U |
||||
|
||||
#define ALL_BITS_SET 0xFFFFFFFFU |
||||
|
||||
#endif /* VC4CL_CONFIG_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,173 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_EXTENSIONS_H |
||||
#define VC4CL_EXTENSIONS_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
|
||||
/*
|
||||
* Loop unroll pragma extension |
||||
* |
||||
* Defines "#pragma unroll <factor>" |
||||
* |
||||
* Clang supports this natively, so we do not need to do anything |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/nv/cl_nv_pragma_unroll.txt
|
||||
* See https://clang.llvm.org/docs/AttributeReference.html#pragma-unroll-pragma-nounroll
|
||||
*/ |
||||
#ifndef cl_nv_pragma_unroll |
||||
#define cl_nv_pragma_unroll 1 |
||||
#endif |
||||
|
||||
/*
|
||||
* ARM core-ID extension |
||||
* |
||||
* Adds function |
||||
* uint arm_get_core_id( void ) |
||||
* which returns the ID of the OpenCL Computation Unit, which is always zero |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_get_core_id.txt
|
||||
*/ |
||||
#ifndef cl_arm_core_id |
||||
#define cl_arm_core_id 1 |
||||
#endif |
||||
uint arm_get_core_id(void); //prototype, prevents warning
|
||||
// cl_arm_core_id: this implementation exposes a single OpenCL Computation Unit,
// so the core ID is constantly zero (see the extension comment above).
uint arm_get_core_id(void)
{
	return 0;
}
||||
|
||||
/*
|
||||
* 32-bit atomic counters |
||||
* |
||||
* Adds type |
||||
* counter32_t |
||||
* which is a 32-bit type for atomic counters. counter32_t can only be passed as kernel parameter and cannot be read/assigned. |
||||
* |
||||
* Adds functions |
||||
* uint atomic_inc(counter32_t counter) |
||||
* uint atomic_dec(counter32_t counter) |
||||
* increments/decrements the given counter32_t value atomically. |
||||
* |
||||
* NOTE: Since the syntax/semantics is exactly the same as for the uint version of the standard atomic_inc/atomic_dec functions, counter32_t is used as typedef to an uint pointer. |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/ext/cl_ext_atomic_counters_32.txt
|
||||
*/ |
||||
#ifndef cl_ext_atomic_counters_32
#define cl_ext_atomic_counters_32 1
#endif
// Since the syntax/semantics match the uint overloads of the standard
// atomic_inc/atomic_dec functions, counter32_t is a plain pointer typedef
// to a volatile __global uint (see extension comment above).
typedef volatile __global uint* counter32_t;
// Just the prototypes - the implementations reside in _atomics.h.
uint atomic_inc(counter32_t counter) OVERLOADABLE;
uint atomic_dec(counter32_t counter) OVERLOADABLE;
||||
|
||||
/*
|
||||
* Integer dot products |
||||
* |
||||
* Adds functions |
||||
* int arm_dot(char4 a, char4 b) |
||||
* uint arm_dot(uchar4 a, uchar4 b) |
||||
* int arm_dot_acc(char4 a, char4 b, int acc) |
||||
* uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) |
||||
* int arm_dot_acc(short2 a, short2 b, int acc) |
||||
* uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) |
||||
* int arm_dot_acc_sat(char4 a, char4 b, int acc) |
||||
* uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) |
||||
* calculate integer dot product (and additionally adds the scalar value). |
||||
* For the functions xxx_sat, the final addition is saturating. |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_integer_dot_product.txt
|
||||
*/ |
||||
#ifndef cl_arm_integer_dot_product_int8 |
||||
#define cl_arm_integer_dot_product_int8 1 |
||||
#endif |
||||
#ifndef cl_arm_integer_dot_product_accumulate_int8 |
||||
#define cl_arm_integer_dot_product_accumulate_int8 1 |
||||
#endif |
||||
#ifndef cl_arm_integer_dot_product_accumulate_int16 |
||||
#define cl_arm_integer_dot_product_accumulate_int16 1 |
||||
#endif |
||||
#ifndef cl_arm_integer_dot_product_accumulate_saturate_int8 |
||||
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1 |
||||
#endif |
||||
|
||||
// prototypes to prevent warnings
|
||||
int arm_dot(char4 a, char4 b) OVERLOADABLE; |
||||
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE; |
||||
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE; |
||||
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE; |
||||
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE; |
||||
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE; |
||||
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE; |
||||
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE; |
||||
|
||||
/**
|
||||
* (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) |
||||
*/ |
||||
int arm_dot(char4 a, char4 b) OVERLOADABLE CONST |
||||
{ |
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3; |
||||
} |
||||
/**
 * Unsigned 8-bit integer dot product:
 * (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w)
 */
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return products.x + products.y + products.z + products.w;
}
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ] |
||||
*/ |
||||
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE CONST |
||||
{ |
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return acc + tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3; |
||||
} |
||||
|
||||
/**
 * Unsigned 8-bit dot product with accumulator:
 * acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
 */
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return acc + products.x + products.y + products.z + products.w;
}
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) ] |
||||
*/ |
||||
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE CONST |
||||
{ |
||||
int2 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return acc + tmp.s0 + tmp.s1; |
||||
} |
||||
|
||||
/**
 * Unsigned 16-bit dot product with accumulator:
 * acc + [ (a.x * b.x) + (a.y * b.y) ]
 */
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE CONST
{
    const uint2 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return acc + products.x + products.y;
}
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ] |
||||
* |
||||
* The final accumulation is saturating. |
||||
*/ |
||||
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE CONST |
||||
{ |
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return add_sat(acc, tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3); |
||||
} |
||||
|
||||
/**
 * Unsigned 8-bit dot product with saturating accumulation:
 * acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
 *
 * Only the final addition of the accumulator saturates.
 */
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    const uint sum = products.x + products.y + products.z + products.w;
    return add_sat(acc, sum);
}
||||
|
||||
#endif /* VC4CL_EXTENSIONS_H */ |
||||
|
@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
/*
|
||||
* Implements a float-float floating point type providing improved accuracy over float32. |
||||
* |
||||
* Algorithms and ideas taken from: |
||||
* - Guillaume da Gracca, David Defour. Implementation of float-float operators on graphics hardware. Real Numbers and |
||||
* Computers 7, Jul 2006, Nancy, France. pp.23-32. hal-00021443 |
||||
* https://hal.archives-ouvertes.fr/hal-00021443 (https://hal.archives-ouvertes.fr/hal-00021443/document)
|
||||
* - https://andrewthall.org/papers/df64_qf128.pdf
|
||||
*/ |
||||
#ifndef VC4CL_FLOAT_FLOAT_H |
||||
#define VC4CL_FLOAT_FLOAT_H |
||||
|
||||
#include "_intrinsics.h" |
||||
|
||||
/**
 * Type for extended precision floating point values.
 *
 * By combining two 32-bit floats, greatly increases accuracy. Value range is not increased!
 *
 * The "real" value calculates as UPPER + LOWER part.
 *
 * Packing: the UPPER float's bits occupy the low 32 bits of the ulong, the
 * LOWER float's bits the high 32 bits (see vc4cl_combine below).
 *
 * Using a native 64-bit type implicitly provides vector versions (and proper handling by compiler)
 */
typedef ulong FloatFloat;
typedef ulong2 FloatFloat2;
typedef ulong3 FloatFloat3;
typedef ulong4 FloatFloat4;
typedef ulong8 FloatFloat8;
typedef ulong16 FloatFloat16;

// Extracts the high-magnitude component (stored in the low 32 bits).
SIMPLE_1(float, vc4cl_upper, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val)))
// Extracts the low-magnitude correction component (stored in the high 32 bits).
SIMPLE_1(float, vc4cl_lower, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val >> 32)))
// Collapses the pair back to a single float, losing the extra precision.
SIMPLE_1(float, vc4cl_lossy, FloatFloat, val, vc4cl_upper(val) + vc4cl_lower(val))
||||
|
||||
/**
 * Packs two floats into one FloatFloat: upper into the low 32 bits, lower
 * into the high 32 bits of the underlying ulong.
 */
COMPLEX_2(FloatFloat, vc4cl_combine, float, upper, float, lower, {
    result_t upper_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(upper));
    result_t lower_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(lower));
    return upper_extended | (lower_extended << 32);
})

// faster version of vc4cl_combine(val, 0): the lower component's bits are
// simply left zero (bit pattern of +0.0f)
SIMPLE_1(FloatFloat, vc4cl_extend, float, val, vc4cl_int_to_ulong(vc4cl_bitcast_uint(val)))

// Splits a float into high/low parts so high * high products are exact.
// TODO avoid using this, since it runs against Inf, due to calculating val * 2^15
COMPLEX_1(FloatFloat, vc4cl_split, float, val, {
    // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
    const float split = (float) (1u << 15); // TODO can be modified for precision
    arg_t c = (split + 1) * val;
    // high gets the upper mantissa bits, low the (exact) remainder
    arg_t high = c - (c - val);
    arg_t low = val - high;
    return vc4cl_combine(high, low);
})
||||
|
||||
// COMPLEX_1(FloatFloat, vc4cl_split, double, val, {
|
||||
// // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
|
||||
// const double split = (double) (1u << 29); // TODO can be modified for precision
|
||||
// arg_t c = (split + 1) * val;
|
||||
// arg_t high = c - (c - val);
|
||||
// arg_t low = val - high;
|
||||
// return vc4cl_combine(high, low);
|
||||
// })
|
||||
|
||||
/**
 * Exact sum of two floats: s is the rounded sum, e the rounding error,
 * so that s + e == a + b exactly (two-sum; statement order is significant,
 * the compiler must not re-associate these operations).
 */
COMPLEX_2(FloatFloat, vc4cl_add, float, a, float, b, {
    float_t s = a + b;
    float_t v = s - a;
    float_t e = (a - (s - v)) + (b - v);
    return vc4cl_combine(s, e);
})

/**
 * Float-float addition (add22 from the da Graca/Defour paper referenced in
 * the file header). Both correction orderings are computed and the one
 * matching the larger-magnitude upper component is selected branch-free.
 */
COMPLEX_2(FloatFloat, vc4cl_add, FloatFloat, a, FloatFloat, b, {
    float_t r = vc4cl_upper(a) + vc4cl_upper(b);
    float_t s0 = (((vc4cl_upper(a) - r) + vc4cl_upper(b)) + vc4cl_lower(b)) + vc4cl_lower(a);
    float_t s1 = (((vc4cl_upper(b) - r) + vc4cl_upper(a)) + vc4cl_lower(a)) + vc4cl_lower(b);
    float_t s = fabs(vc4cl_upper(a)) >= fabs(vc4cl_upper(b)) ? s0 : s1;
    return vc4cl_add(r, s);
})

// Subtraction as addition of the component-wise negated value.
SIMPLE_2(FloatFloat, vc4cl_sub, FloatFloat, a, FloatFloat, b, vc4cl_add(a, vc4cl_combine(-vc4cl_upper(b), -vc4cl_lower(b))))
||||
|
||||
/**
 * Exact product of two floats: x is the rounded product, y the rounding
 * error recovered via Dekker-style splitting (the subtraction order of the
 * partial products is significant and must be preserved).
 */
COMPLEX_2(FloatFloat, vc4cl_mul, float, a, float, b, {
    float_t x = a * b;
    result_t a_split = vc4cl_split(a);
    result_t b_split = vc4cl_split(b);
    float_t error1 = x - (vc4cl_upper(a_split) * vc4cl_upper(b_split));
    float_t error2 = error1 - (vc4cl_lower(a_split) * vc4cl_upper(b_split));
    float_t error3 = error2 - (vc4cl_upper(a_split) * vc4cl_lower(b_split));
    float_t y = vc4cl_lower(a_split) * vc4cl_lower(b_split) - error3;
    return vc4cl_combine(x, y);
})

/**
 * Float-float multiplication: exact product of the upper parts plus the
 * cross terms folded into the correction component.
 */
COMPLEX_2(FloatFloat, vc4cl_mul, FloatFloat, a, FloatFloat, b, {
    result_t t = vc4cl_mul(vc4cl_upper(a), vc4cl_upper(b));
    float_t t1 = vc4cl_upper(a) * vc4cl_lower(b) + vc4cl_lower(a) * vc4cl_upper(b) + vc4cl_lower(t);
    return vc4cl_add(vc4cl_upper(t), t1);
})

/**
 * Float-float division: initial single-precision quotient estimate yn,
 * refined by one Newton-style correction step using the residual a - b*yn.
 */
COMPLEX_2(FloatFloat, vc4cl_div, FloatFloat, a, FloatFloat, b, {
    float_t xn = 1.0f / vc4cl_upper(b);
    float_t yn = vc4cl_upper(a) * xn;
    result_t y = vc4cl_extend(yn);
    float_t diff = vc4cl_upper(vc4cl_sub(a, vc4cl_mul(b, y)));
    result_t prod = vc4cl_mul(xn, diff);
    return vc4cl_add(y, prod);
})
||||
|
||||
/**
 * Float-float square root: initial estimate yn = a * rsqrt(a), refined by
 * one Newton-style correction step, y + (a - y^2) * xn / 2.
 */
COMPLEX_1(FloatFloat, vc4cl_sqrt, FloatFloat, a, {
    float_t xn = rsqrt(vc4cl_upper(a));
    float_t yn = vc4cl_upper(a) * xn;
    result_t y = vc4cl_extend(yn);
    result_t ynsqr = vc4cl_mul(y, y); // yn^2
    float_t diff = vc4cl_upper(vc4cl_sub(a, ynsqr));
    // FIX: halve the float residual BEFORE the exact multiplication. The
    // previous code divided the result of vc4cl_mul(xn, diff) - a packed
    // FloatFloat (ulong) - by 2, which integer-shifts the packed bit
    // pattern and corrupts both float components. Dividing the float
    // 'diff' by 2 is an exact exponent decrement (except for denormals)
    // and yields the intended halved correction term.
    result_t prod = vc4cl_mul(xn, diff / 2);
    return vc4cl_add(y, prod);
})
||||
|
||||
#endif /* VC4CL_FLOAT_FLOAT_H */ |
@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_GEOMETRY_H |
||||
#define VC4CL_GEOMETRY_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
|
||||
/* a0 b0 a2 * b3 - a3 * b2
|
||||
* a x b = a1 x b1 = a3 * b1 - a1 * b3 |
||||
* a2 b2 a1 * b2 - a2 * b1 |
||||
*/ |
||||
INLINE float3 cross(float3 p0, float3 p1) OVERLOADABLE CONST |
||||
{ |
||||
return (float3) (p0.y * p1.z - p0.z * p1.y, p0.z * p1.x - p0.x * p1.z, p0.x * p1.y - p0.y * p1.x); |
||||
} |
||||
|
||||
/* 4-component variant: cross product of the xyz parts, w is set to 0. */
INLINE float4 cross(float4 p0, float4 p1) OVERLOADABLE CONST
{
    const float x = p0.y * p1.z - p0.z * p1.y;
    const float y = p0.z * p1.x - p0.x * p1.z;
    const float z = p0.x * p1.y - p0.y * p1.x;
    return (float4) (x, y, z, 0.0f);
}
||||
|
||||
/* a0 b0
|
||||
* a * b = a1 * b1 = a1 * b1 + a2 * b2 + a3 * b3 |
||||
* a2 b2 |
||||
*/ |
||||
INLINE float dot(float p0, float p1) OVERLOADABLE CONST |
||||
{ |
||||
return p0 * p1; |
||||
} |
||||
|
||||
/* 2-component dot product: p0.x * p1.x + p0.y * p1.y */
INLINE float dot(float2 p0, float2 p1) OVERLOADABLE CONST
{
    const float2 products = p0 * p1;
    return products.x + products.y;
}
||||
|
||||
/* 3-component dot product: p0.x * p1.x + p0.y * p1.y + p0.z * p1.z */
INLINE float dot(float3 p0, float3 p1) OVERLOADABLE CONST
{
    const float3 products = p0 * p1;
    return products.x + products.y + products.z;
}
||||
|
||||
/* 4-component dot product: sum over all four element-wise products */
INLINE float dot(float4 p0, float4 p1) OVERLOADABLE CONST
{
    const float4 products = p0 * p1;
    return products.x + products.y + products.z + products.w;
}
||||
|
||||
// Prototypes for the wide-vector overloads (implemented elsewhere).
float dot(float8 p0, float8 p1) OVERLOADABLE CONST;
float dot(float16 p0, float16 p1) OVERLOADABLE CONST;

/**
 * length(p) = sqrt(dot(p, p)), with pre-/post-scaling to keep the squared
 * intermediate representable in float.
 */
COMPLEX_1_RETURN_SCALAR(float, length, float, p, {
    float tmp = dot(p, p);

    // To mitigate overflow errors for edge-cases, reduce large/increase small numbers, this is taken from LLVM libclc
    // E.g. since dot(x, x) calculates element-wise x^2, every exponent >= 64 goes to Infinity and every exponent <= -64 to zero!
    float inputFactor = 1.0f;
    float outputFactor = 1.0f;
    // select the scaling factors branch-free, based on the trial dot product
    outputFactor = tmp == INFINITY ? 0x1.0p+65f : outputFactor;
    inputFactor = tmp == INFINITY ? 0x1.0p-65f : inputFactor;
    outputFactor = vc4cl_is_zero(tmp) ? 0x1.0p-86f : outputFactor;
    inputFactor = vc4cl_is_zero(tmp) ? 0x1.0p+86f : inputFactor;

    return sqrt(dot(p * inputFactor, p * inputFactor)) * outputFactor;
})
||||
|
||||
//"Returns the distance between p0 and p1.
// This is calculated as length(p0 - p1).
SIMPLE_2_RETURN_SCALAR(float, distance, float, p0, float, p1, length(p0 - p1))

/**
 * normalize(p) = p / length(p)
 *
 * Expected behavior:
 *
 * normalize(v) = v for all elements in v = 0
 * normalize(v) = vector of NaNs for all elements in v = NaN
 * TODO special case for Inf elements
 */
SIMPLE_1(float, normalize, float, p, p / length(p))


// Faster, reduced-accuracy variants built on the half_* built-ins.
// Note: unlike length(), fast_length() performs no overflow mitigation.
SIMPLE_1_RETURN_SCALAR(float, fast_length, float, p, half_sqrt(dot(p, p)))

SIMPLE_2_RETURN_SCALAR(float, fast_distance, float, p0, float, p1, fast_length(p0 - p1))

SIMPLE_1(float, fast_normalize, float, p, p * half_rsqrt(dot(p, p)))
||||
|
||||
#endif /* VC4CL_GEOMETRY_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,233 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_INTEGER_H |
||||
#define VC4CL_INTEGER_H |
||||
|
||||
#include "_config.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
// Expands a 2-argument built-in over all 8/16/32-bit integer types
// (64-bit overloads are declared separately where supported).
// NOTE: no comments inside the macro bodies - line-continuation backslashes
// would splice a trailing comment into the next line.
#define SIMPLE_INTEGER_2(func, argName0, argName1, content) \
SIMPLE_2(uchar, func, uchar, argName0, uchar, argName1, content) \
SIMPLE_2(char, func, char, argName0, char, argName1, content) \
SIMPLE_2(ushort, func, ushort, argName0, ushort, argName1, content) \
SIMPLE_2(short, func, short, argName0, short, argName1, content) \
SIMPLE_2(uint, func, uint, argName0, uint, argName1, content) \
SIMPLE_2(int, func, int, argName0, int, argName1, content) \

// Same as SIMPLE_INTEGER_2, for 3-argument built-ins.
#define SIMPLE_INTEGER_3(func, argName0, argName1, argName2, content) \
SIMPLE_3(uchar, func, uchar, argName0, uchar, argName1, uchar, argName2, content) \
SIMPLE_3(char, func, char, argName0, char, argName1, char, argName2, content) \
SIMPLE_3(ushort, func, ushort, argName0, ushort, argName1, ushort, argName2, content) \
SIMPLE_3(short, func, short, argName0, short, argName1, short, argName2, content) \
SIMPLE_3(uint, func, uint, argName0, uint, argName1, uint, argName2, content) \
SIMPLE_3(int, func, int, argName0, int, argName1, int, argName2, content) \

||||
|
||||
// abs: |x| as the corresponding unsigned type. Signed variants compute
// max(x, -x) in a wider type; unsigned variants are the identity.
SIMPLE_1(uchar, abs, char, val, vc4cl_bitcast_uchar(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(uchar, abs, uchar, val, val)
SIMPLE_1(ushort, abs, short, val, vc4cl_bitcast_ushort(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(ushort, abs, ushort, val, val)
SIMPLE_1(uint, abs, int, val, vc4cl_bitcast_uint(max(val, -val)))
SIMPLE_1(uint, abs, uint, val, val)
SIMPLE_1(ulong, abs, long, val, vc4cl_bitcast_ulong(max(val, -val)))
SIMPLE_1(ulong, abs, ulong, val, val)
||||
|
||||
// abs_diff: |x - y| without modulo overflow in the intermediate difference.
// based on pocl (pocl/lib/kernel/abs_diff.cl)
SIMPLE_2(uchar, abs_diff, uchar, x, uchar, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(uchar, abs_diff, char, x, char, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = (result_t)abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ushort, abs_diff, ushort, x, ushort, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(ushort, abs_diff, short, x, short, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = (result_t)abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(uint, abs_diff, uint, x, uint, y, abs(x > y ? x - y : y - x))
COMPLEX_2(uint, abs_diff, int, x, int, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ulong, abs_diff, ulong, x, ulong, y, abs(x > y ? x - y : y - x))
COMPLEX_2(ulong, abs_diff, long, x, long, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
||||
|
||||
// add_sat: saturating addition. uchar uses the hardware v8adds instruction,
// 8/16-bit signed/unsigned go through a wider type and clamp.
SIMPLE_2(uchar, add_sat, uchar, x, uchar, y, vc4cl_v8adds(x, y))
SIMPLE_2(char, add_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) + vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, add_sat, ushort, x, ushort, y, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) + vc4cl_extend(y), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_2(short, add_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) + vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
// based on pocl (pocl/lib/kernel/add_sat.cl)
SIMPLE_2(uint, add_sat, uint, x, uint, y, x > ((result_t)UINT_MAX) - y ? UINT_MAX : x + y)
SIMPLE_2(int, add_sat, int, x, int, y, vc4cl_saturated_add(x, y))

//"Returns (x + y) >> 1. The intermediate sum does not modulo overflow."
SIMPLE_2(uchar, hadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(char, hadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
SIMPLE_2(ushort, hadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(short, hadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
// 32/64 bit: halve both operands first, then add back the carry bit that
// is set when both operands are odd. based on pocl (pocl/lib/kernel/hadd.cl)
SIMPLE_2(uint, hadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(int, hadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(ulong, hadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(long, hadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))

//"Returns (x + y + 1) >> 1. The intermediate sum does not modulo overflow."
SIMPLE_2(uchar, rhadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(char, rhadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
SIMPLE_2(ushort, rhadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(short, rhadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
// rounding-up variant: carry is set when at least one operand is odd.
// based on pocl (pocl/lib/kernel/rhadd.cl)
SIMPLE_2(uint, rhadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(int, rhadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(ulong, rhadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(long, rhadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
||||
|
||||
// clamp(val, lo, hi) = min(max(val, lo), hi) for all integer types;
// the *_TWO_SCALAR variants additionally accept scalar bounds with a
// vector value.
SIMPLE_INTEGER_3(clamp, val, minval, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uchar, clamp, uchar, val, uchar, minval, uchar, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(char, clamp, char, val, char, minval, char, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ushort, clamp, ushort, val, ushort, minval, ushort, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(short, clamp, short, val, short, minval, short, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uint, clamp, uint, val, uint, minval, uint, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(int, clamp, int, val, int, minval, int, maxval, min(max(val, minval), maxval))
SIMPLE_3(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
||||
|
||||
// clz: count leading zeroes. The hardware instruction always operates on
// 32 bits, so 8/16-bit values are shifted to the top and the vacated low
// bits are filled with ones, limiting the count to the type's width
// (e.g. clz((uchar)0) = 8).
SIMPLE_1(uchar, clz, uchar, x, vc4cl_bitcast_uchar(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(char, clz, char, x, vc4cl_bitcast_char(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(ushort, clz, ushort, x, vc4cl_bitcast_ushort(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(short, clz, short, x, vc4cl_bitcast_short(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(uint, clz, uint, x, vc4cl_bitcast_uint(vc4cl_clz(x)))
SIMPLE_1(int, clz, int, x, vc4cl_bitcast_int(vc4cl_clz(x)))
||||
|
||||
// mad_hi(x, y, z) = mul_hi(x, y) + z
SIMPLE_INTEGER_3(mad_hi, x, y, z, mul_hi(x, y) + z)

// mad_sat: x * y + z with the result saturated to the type's range.
// 8/16-bit versions compute in 32 bit and clamp; 32-bit versions compute
// the full 64-bit product before saturating back to 32 bit.
SIMPLE_3(uchar, mad_sat, uchar, x, uchar, y, uchar, z, vc4cl_bitcast_uchar(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) UCHAR_MAX)))
SIMPLE_3(char, mad_sat, char, x, char, y, char, z, vc4cl_bitcast_char(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) CHAR_MIN, (int) CHAR_MAX)))
SIMPLE_3(ushort, mad_sat, ushort, x, ushort, y, ushort, z, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_3(short, mad_sat, short, x, short, y, short, z, vc4cl_bitcast_short(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) SHRT_MIN, (int) SHRT_MAX)))
SIMPLE_3(uint, mad_sat, uint, x, uint, y, uint, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_UNSIGNED) + vc4cl_int_to_ulong(z), VC4CL_UNSIGNED))
SIMPLE_3(int, mad_sat, int, x, int, y, int, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_SIGNED) + vc4cl_int_to_long(z), VC4CL_SIGNED))
||||
|
||||
// max: element-wise maximum. uchar maps to the hardware v8max instruction,
// other narrow types are widened to 32 bit first; ulong compares the upper
// and lower halves separately since there is no native 64-bit comparison.
SIMPLE_2(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2_SCALAR(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2_SCALAR(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
COMPLEX_2(ulong, max, ulong, x, ulong, y,
{
    uint_t upX = vc4cl_long_to_int(x >> 32);
    uint_t upY = vc4cl_long_to_int(y >> 32);
    uint_t lowX = vc4cl_long_to_int(x);
    uint_t lowY = vc4cl_long_to_int(y);

    /* can't directly use this condition in return value, since for ?: operator, the condition and return value needs to have the same type */
    int_t selection = upX > upY ? 0 : (upX < upY ? 1 : (lowX > lowY ? 0 : 1));
    return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, max, ulong, x, ulong, y, max(x, (arg0_t) y))
SIMPLE_2(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
||||
|
||||
// min: element-wise minimum, mirroring the max implementations above.
SIMPLE_2(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2_SCALAR(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2_SCALAR(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
// 64-bit unsigned minimum via separate comparison of upper/lower halves.
COMPLEX_2(ulong, min, ulong, x, ulong, y,
{
    uint_t upX = vc4cl_long_to_int(x >> 32);
    uint_t upY = vc4cl_long_to_int(y >> 32);
    uint_t lowX = vc4cl_long_to_int(x);
    uint_t lowY = vc4cl_long_to_int(y);

    /* can't directly use this condition in return value, since for ?: operator, the condition and return value needs to have the same type */
    int_t selection = upX < upY ? 0 : (upX > upY ? 1 : (lowX < lowY ? 0 : 1));
    return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, min, ulong, x, ulong, y, min(x, (arg0_t) y))
SIMPLE_2(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
||||
|
||||
// mul_hi: high half of the full-width product. 8/16-bit versions multiply
// in 32 bit and shift the upper half back down.
SIMPLE_2(uchar, mul_hi, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 8))
SIMPLE_2(char, mul_hi, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_mul24(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED), 8)))
SIMPLE_2(ushort, mul_hi, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 16))
SIMPLE_2(short, mul_hi, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_sign_extend(x) * vc4cl_sign_extend(y), 16)))
SIMPLE_2(uint, mul_hi, uint, x, uint, y, vc4cl_mul_hi(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul_hi, int, x, int, y, vc4cl_mul_hi(x, y, VC4CL_SIGNED))

// Since the rotation is over all 32-bits, for smaller types we need to replicate the value, rotate it and truncate/sign extend the result afterwards.
// The rotation offset is negated because vc4cl_ror rotates right while rotate() rotates left.
SIMPLE_2(uchar, rotate, uchar, x, uchar, y, vc4cl_pack_lsb(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(char, rotate, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_extend(y)), 24)))
SIMPLE_2(ushort, rotate, ushort, x, ushort, y, vc4cl_pack_truncate(vc4cl_ror(vc4cl_zero_extend(x) | (vc4cl_zero_extend(x) << 16), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(short, rotate, short, x, short, y, vc4cl_bitcast_short(vc4cl_extend(vc4cl_bitcast_short(vc4cl_ror((vc4cl_sign_extend(x) & (int) 0xFFFF) | (vc4cl_sign_extend(x) << 16), -vc4cl_sign_extend(y))))))
SIMPLE_2(uint, rotate, uint, x, uint, y, vc4cl_bitcast_uint(vc4cl_ror(x, -vc4cl_bitcast_int(y))))
SIMPLE_2(int, rotate, int, x, int, y, vc4cl_bitcast_int(vc4cl_ror(x, -y)))

// sub_sat: saturating subtraction. uchar maps to the hardware v8subs
// instruction; unsigned variants clamp at zero.
SIMPLE_2(uchar, sub_sat, uchar, x, uchar, y, vc4cl_v8subs(x, y))
SIMPLE_2(char, sub_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) - vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, sub_sat, ushort, x, ushort, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(short, sub_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) - vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
// based on pocl (pocl/lib/kernel/sub_sat.cl)
SIMPLE_2(uint, sub_sat, uint, x, uint, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(int, sub_sat, int, x, int, y, vc4cl_saturated_sub(x, y))

// upsample: (hi << width(lo)) | lo, combined into the next wider type.
SIMPLE_2(short, upsample, char, hi, uchar, lo, vc4cl_bitcast_short((vc4cl_sign_extend(hi) << 8) | vc4cl_bitcast_int(vc4cl_zero_extend(lo))))
SIMPLE_2(ushort, upsample, uchar, hi, uchar, lo, vc4cl_bitcast_ushort((vc4cl_zero_extend(hi) << 8) | vc4cl_zero_extend(lo)))
SIMPLE_2(int, upsample, short, hi, ushort, lo, (vc4cl_sign_extend(hi) << 16) | vc4cl_bitcast_int(vc4cl_zero_extend(lo)))
SIMPLE_2(uint, upsample, ushort, hi, ushort, lo, (vc4cl_zero_extend(hi) << 16) | vc4cl_zero_extend(lo))
SIMPLE_2(long, upsample, int, hi, uint, lo, (vc4cl_int_to_long(hi) << 32) | vc4cl_bitcast_long(vc4cl_int_to_ulong(lo)))
SIMPLE_2(ulong, upsample, uint, hi, uint, lo, (vc4cl_int_to_ulong(hi) << 32) | vc4cl_int_to_ulong(lo))

//" Returns the number of non-zero bits in x. "
SIMPLE_1(uchar, popcount, uchar, val, vc4cl_popcount(val))
SIMPLE_1(char, popcount, char, val, vc4cl_popcount(val))
SIMPLE_1(ushort, popcount, ushort, val, vc4cl_popcount(val))
SIMPLE_1(short, popcount, short, val, vc4cl_popcount(val))
SIMPLE_1(uint, popcount, uint, val, vc4cl_popcount(val))
SIMPLE_1(int, popcount, int, val, vc4cl_popcount(val))
SIMPLE_1(ulong, popcount, ulong, val, vc4cl_popcount(val))
SIMPLE_1(long, popcount, long, val, vc4cl_popcount(val))

// mul24: multiplication that only considers the low 24 bits of the
// operands, mapping directly to the hardware mul24 instruction.
SIMPLE_2(uchar, mul24, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(char, mul24, char, x, char, y, vc4cl_bitcast_char(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(ushort, mul24, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(short, mul24, short, x, short, y, vc4cl_bitcast_short(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(uint, mul24, uint, x, uint, y, vc4cl_mul24(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul24, int, x, int, y, vc4cl_mul24(x, y, VC4CL_SIGNED))
// mad24(a, b, c) = mul24(a, b) + c
SIMPLE_INTEGER_3(mad24, a, b, c, mul24(a, b) + c)
||||
|
||||
#undef SIMPLE_INTEGER_2 |
||||
#undef SIMPLE_INTEGER_3 |
||||
|
||||
#endif /* VC4CL_INTEGER_H */ |
||||
|
@ -0,0 +1,436 @@
|
||||
/* Declares interfaces for all intrinsic functions
|
||||
* |
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
|
||||
#ifndef VC4CL_INTRINSICS_H |
||||
#define VC4CL_INTRINSICS_H |
||||
|
||||
#include "_overloads.h" |
||||
|
||||
#define VC4CL_SIGNED 0 |
||||
#define VC4CL_UNSIGNED 1 |
||||
|
||||
/*
|
||||
* ALU operations |
||||
* |
||||
* NOTE: These operations directly map to the machine instructions and do not |
||||
* heed other data-types (e.g. vc4cl_clz will always return the leading zeroes to |
||||
* full 32-bit width) |
||||
*/ |
||||
OVERLOAD_2(float, vc4cl_fmax, float, x, float, y) |
||||
OVERLOAD_2(float, vc4cl_fmin, float, x, float, y) |
||||
OVERLOAD_2(float, vc4cl_fmaxabs, float, x, float, y) |
||||
OVERLOAD_2(float, vc4cl_fminabs, float, x, float, y) |
||||
OVERLOAD_1(int, vc4cl_ftoi, float, val) |
||||
OVERLOAD_1(float, vc4cl_itof, int, val) |
||||
|
||||
OVERLOAD_2(int, vc4cl_asr, uint, val, int, offset) |
||||
OVERLOAD_2(int, vc4cl_asr, int, val, int, offset) |
||||
OVERLOAD_2(uint, vc4cl_ror, uint, val, int, offset) |
||||
OVERLOAD_2(int, vc4cl_ror, int, val, int, offset) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_min, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_max, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(long, vc4cl_min, long, x, long, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(long, vc4cl_max, long, x, long, y, uchar, sign) |
||||
OVERLOAD_2(uint, vc4cl_and, uchar, x, uchar, y) |
||||
OVERLOAD_2(int, vc4cl_and, char, x, char, y) |
||||
OVERLOAD_2(uint, vc4cl_and, ushort, x, ushort, y) |
||||
OVERLOAD_2(int, vc4cl_and, short, x, short, y) |
||||
SIMPLE_2(uint, vc4cl_and, uint, x, uint, y, x & y) |
||||
SIMPLE_2(int, vc4cl_and, int, x, int, y, x & y) |
||||
OVERLOAD_1(uint, vc4cl_clz, uint, val) |
||||
OVERLOAD_1(int, vc4cl_clz, int, val) |
||||
|
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uchar, x, uchar, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul24, char, x, char, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, ushort, x, ushort, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul24, short, x, short, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uint, x, uint, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul24, int, x, int, y, uchar, sign) |
||||
|
||||
OVERLOAD_2(uchar, vc4cl_v8adds, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8adds, uint, x, uint, y) |
||||
OVERLOAD_2(uchar, vc4cl_v8subs, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8subs, uint, x, uint, y) |
||||
OVERLOAD_2(uchar, vc4cl_v8min, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8min, uint, x, uint, y) |
||||
OVERLOAD_2(uchar, vc4cl_v8max, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8max, uint, x, uint, y) |
||||
|
||||
/*
|
||||
* Pack/unpack modes |
||||
*/ |
||||
//TODO ALU needs to consume float for this to work
|
||||
//unpacks half to float (UNPACK 1: 16a -> 32)
|
||||
//OVERLOAD_1(float, vc4cl_unpack_half, half, val)
|
||||
//sign-extends short to int (UNPACK 1: 16a -> 32)
|
||||
OVERLOAD_1(int, vc4cl_unpack_sext, short, val) |
||||
//unpacks first byte [0, 1] to float (UNPACK 4: 8a -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte0, uchar, val) |
||||
//unpacks second byte [0, 1] to float (UNPACK 5: 8b -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte1, uchar, val) |
||||
//unpacks third byte [0, 1] to float (UNPACK 6: 8c -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte2, uchar, val) |
||||
//unpacks fourth byte [0, 1] to float (UNPACK 7: 8d -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte3, uchar, val) |
||||
//zero-extend first byte to uint (UNPACK 4: 8a -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte0, uchar, val) |
||||
//zero-extend second byte to uint (UNPACK 5: 8b -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte1, uchar, val) |
||||
//zero-extend third byte to uint (UNPACK 6: 8c -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte2, uchar, val) |
||||
//zero-extend fourth byte to uint (UNPACK 7: 8d -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte3, uchar, val) |
||||
|
||||
//TODO ALU needs to consume float for this to work
|
||||
//packs float into half (PACK 1: 32 -> 16a)
|
||||
//OVERLOAD_1(half, vc4cl_pack_half, float, val)
|
||||
//converts to unsigned 16-bit integer, truncates the result (PACK 1: 32 -> 16a)
|
||||
OVERLOAD_1(ushort, vc4cl_pack_truncate, int, val) |
||||
OVERLOAD_1(ushort, vc4cl_pack_truncate, uint, val) |
||||
//replicates the LSB into all four bytes (PACK 3: 32 -> 8888)
|
||||
OVERLOAD_1(uint, vc4cl_replicate_lsb, char, val) |
||||
OVERLOAD_1(uint, vc4cl_replicate_lsb, uchar, val) |
||||
OVERLOAD_1(uint, vc4cl_replicate_lsb, uint, val) |
||||
//takes the LSB and writes it into LSB (PACK 4: 32 -> 8a)
|
||||
OVERLOAD_1(uchar, vc4cl_pack_lsb, char, val) |
||||
OVERLOAD_1(uchar, vc4cl_pack_lsb, uchar, val) |
||||
OVERLOAD_1(uchar, vc4cl_pack_lsb, uint, val) |
||||
//calculates addition, but saturates the result afterwards (depending on signed integer over-/underflow of addition) (uses PACK 8: 32 -> 32)
|
||||
OVERLOAD_2(int, vc4cl_saturated_add, int, x, int, y) |
||||
//NOTE: Since the 32 -> 32 saturation pack mode works differently for sub, the intrinsic is implemented differently than saturated_add
|
||||
OVERLOAD_2(int, vc4cl_saturated_sub, int, x, int, y) |
||||
//saturates to unsigned byte (PACK 12: 32 -> 8a)
|
||||
OVERLOAD_1(uchar, vc4cl_saturate_lsb, uint, val) |
||||
|
||||
|
||||
/*
|
||||
* SFU calls |
||||
*/ |
||||
OVERLOAD_1(float, vc4cl_sfu_recip, float, val) |
||||
OVERLOAD_1(float, vc4cl_sfu_rsqrt, float, val) |
||||
OVERLOAD_1(float, vc4cl_sfu_log2, float, val) |
||||
OVERLOAD_1(float, vc4cl_sfu_exp2, float, val) |
||||
|
||||
/*
|
||||
* Periphery access |
||||
*/ |
||||
void vc4cl_mutex_lock(void); |
||||
void vc4cl_mutex_unlock(void); |
||||
//read DMA without locking the mutex
|
||||
OVERLOAD_1(int, vc4cl_dma_read, volatile __global int, * ptr) |
||||
OVERLOAD_1(uint, vc4cl_dma_read, volatile __global uint, * ptr) |
||||
OVERLOAD_1(float, vc4cl_dma_read, volatile __global float, * ptr) |
||||
OVERLOAD_1(int, vc4cl_dma_read, volatile __local int, * ptr) |
||||
OVERLOAD_1(uint, vc4cl_dma_read, volatile __local uint, * ptr) |
||||
OVERLOAD_1(float, vc4cl_dma_read, volatile __local float, * ptr) |
||||
//write DMA without locking the mutex
|
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global int, * ptr, int, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global uint, * ptr, uint, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global float, * ptr, float, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local int, * ptr, int, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local uint, * ptr, uint, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local float, * ptr, float, val) |
||||
//copy DMA without locking the mutex
|
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uchar, *dest, const __local uchar, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global char, *dest, const __local char, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global ushort, *dest, const __local ushort, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global short, *dest, const __local short, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uint, *dest, const __local uint, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global int, *dest, const __local int, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global float, *dest, const __local float, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uchar, *dest, const __global uchar, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local char, *dest, const __global char, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local ushort, *dest, const __global ushort, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local short, *dest, const __global short, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uint, *dest, const __global uint, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local int, *dest, const __global int, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local float, *dest, const __global float, *src, size_t, num_elements) |
||||
//load into VPM without locking the mutex
|
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uchar, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global char, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global ushort, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global short, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uint, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global int, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global float, *ptr, size_t, num_elements) |
||||
// special handling of 3-element load/store, since LLVM (compliant with the OpenCL standard) by default generates 4-element load/store
|
||||
char3 vc4cl_vload3(const __global char* ptr) OVERLOADABLE; |
||||
char3 vc4cl_vload3(const __local char* ptr) OVERLOADABLE; |
||||
char3 vc4cl_vload3(const __private char* ptr) OVERLOADABLE; |
||||
char3 vc4cl_vload3(const __constant char* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __global uchar* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __local uchar* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __private uchar* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __constant uchar* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __global short* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __local short* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __private short* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __constant short* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __global ushort* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __local ushort* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __private ushort* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __constant ushort* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __global int* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __local int* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __private int* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __constant int* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __global uint* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __local uint* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __private uint* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __constant uint* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __global float* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __local float* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __private float* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __constant float* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __global long* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __local long* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __private long* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __constant long* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __global ulong* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __local ulong* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __private ulong* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __constant ulong* ptr) OVERLOADABLE; |
||||
|
||||
void vc4cl_vstore3(__global char* ptr, char3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local char* ptr, char3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private char* ptr, char3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global uchar* ptr, uchar3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local uchar* ptr, uchar3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private uchar* ptr, uchar3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global short* ptr, short3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local short* ptr, short3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private short* ptr, short3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global ushort* ptr, ushort3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local ushort* ptr, ushort3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private ushort* ptr, ushort3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global int* ptr, int3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local int* ptr, int3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private int* ptr, int3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global uint* ptr, uint3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local uint* ptr, uint3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private uint* ptr, uint3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global float* ptr, float3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local float* ptr, float3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private float* ptr, float3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global long* ptr, long3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local long* ptr, long3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private long* ptr, long3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global ulong* ptr, ulong3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local ulong* ptr, ulong3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private ulong* ptr, ulong3 val) OVERLOADABLE; |
||||
/*
|
||||
* Work-item functions |
||||
* Mapped to UNIFORM reads |
||||
* |
||||
* local values are stored in the a UNIFORM in this fashion: |
||||
* | 0 | dim2 | dim1 | dim0 | |
||||
* -> to read value of dimension x, calculate: (UNIFORM >> (dim * 8)) & 0xFF |
||||
* |
||||
* This can be compacted in such way, since for a maximum value of 12, the local ID and size fits into 1 Byte |
||||
*/ |
||||
PURE uchar vc4cl_work_dimensions(void); |
||||
PURE uchar vc4cl_local_size(uint dim); |
||||
PURE uchar vc4cl_local_id(uint dim); |
||||
PURE uint vc4cl_num_groups(uint dim); |
||||
PURE uint vc4cl_group_id(uint dim); |
||||
PURE uint vc4cl_global_offset(uint dim); |
||||
PURE uint vc4cl_global_size(uint dim); |
||||
PURE uint vc4cl_global_id(uint dim); |
||||
PURE uchar vc4cl_local_linear_id(void); |
||||
PURE uint vc4cl_global_linear_id(void); |
||||
|
||||
/*
|
||||
* Image functions |
||||
* In CLang, read_only and write_only image-types are separate types. |
||||
* Also in CLang, OpenCL image-types are built-in opaque types |
||||
*/ |
||||
#ifdef __IMAGE_SUPPORT__ |
||||
/*
|
||||
* Texture Config Parameter 0 |
||||
* Broadcom specification, table 15 |
||||
* |
||||
* 0 - 3 | 4 bits | Number of mipmap levels minus 1 |
||||
* 4 - 7 | 4 bits | texture data type (high bit is on config parameter 1) |
||||
* 8 | 1 bit | flip texture Y axis |
||||
* 9 | 1 bit | cube map mode |
||||
* 10 - 11 | 2 bits | cache swizzle |
||||
* 12 - 31 | 20 bits | texture base pointer (multiple of 4KB) |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_basic_setup) |
||||
/*
|
||||
* Texture Config Parameter 1 |
||||
* Broadcom specification, table 16 |
||||
* |
||||
* 0 - 1 | 2 bits | S (x-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border) |
||||
* 2 - 3 | 2 bits | T (y-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border) |
||||
* 4 - 6 | 3 bits | minification filter (interpolation) |
||||
* 7 | 1 bit | magnification filter |
||||
* 8 - 18 | 11 bits | image width (0 = 2048) |
||||
* 19 | 1 bit | flip ETC Y (per block) |
||||
* 20 - 30 | 11 bits | image height (0 = 248) |
||||
* 31 | 1 bit | high bit of texture type |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_access_setup) |
||||
/*
|
||||
* Texture Config Parameters 2 and 3 |
||||
* Broadcom specification, table 17 |
||||
* |
||||
* Cube map stride: |
||||
* 0 | 1 bit | disable automatic LOD, use bias only |
||||
* 12 - 29 | 18 bits | cube map stride (in multiples of 4KB) |
||||
* 30 - 31 | 2 bits | value 1 for cube map stride |
||||
* |
||||
* Child image dimensions: |
||||
* 0 - 10 | 11 bits | child image width (0 = 2048, does not work, see errata HW-2753) |
||||
* 12 - 22 | 11 bits | child image height (0 = 2048, does not work, see errata HW-2753) |
||||
* 30 - 31 | 2 bits | value 2 for child image dimensions |
||||
* |
||||
* Child image offsets: |
||||
* 0 - 10 | 11 bits | child image X offset |
||||
* 12 - 22 | 11 bits | child image Y offset |
||||
* 30 - 31 | 2 bits | value 3 for child image offsets |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_extended_setup) |
||||
/*
|
||||
* To apply a sampler to an image, we need to override the image-access setup UNIFORM before a read with the magnification/minification filters and wrap modes to use |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES_1(void, vc4cl_set_image_access_setup, uint, val) |
||||
CONST uint vc4cl_sampler_get_normalized_coords(sampler_t sampler); |
||||
CONST uint vc4cl_sampler_get_addressing_mode(sampler_t sampler); |
||||
CONST uint vc4cl_sampler_get_filter_mode(sampler_t sampler); |
||||
/*
|
||||
* Image read functions |
||||
* |
||||
* The coordinates need to be floating-values in the range [0, 1] and are scaled to the width/height of the image. |
||||
* The returned data is not necessarily <4 x int32>, but up to 4 components with up to 32 bits each, loaded according to the byte-sizes and number of components specified in the channel_type_size and channel_order_size. |
||||
* |
||||
* So, this functions return the data in the native format (as stored in the image-buffer), but correctly distributed across the 4 components. |
||||
*/ |
||||
int4 vc4cl_image_read(read_only image1d_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image1d_buffer_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image1d_array_t image, float coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image2d_t image, float2 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image2d_array_t image, float2 coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image3d_t image, float4 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
#endif |
||||
|
||||
/*
|
||||
* Type conversions |
||||
*/ |
||||
// TODO use __builtin_convertvector ?? https://clang.llvm.org/docs/LanguageExtensions.html#builtin-convertvector
|
||||
// check available on all compiler versions, generated LLVM IR code!
|
||||
//component-wise bitcasts
|
||||
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, uint, val) |
||||
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, int, val) |
||||
OVERLOAD_1(char, vc4cl_bitcast_char, uint, val) |
||||
OVERLOAD_1(char, vc4cl_bitcast_char, int, val) |
||||
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, uint, val) |
||||
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, int, val) |
||||
OVERLOAD_1(short, vc4cl_bitcast_short, uint, val) |
||||
OVERLOAD_1(short, vc4cl_bitcast_short, int, val) |
||||
SIMPLE_1(uint, vc4cl_bitcast_uint, uint, val, val) |
||||
OVERLOAD_1(uint, vc4cl_bitcast_uint, int, val) |
||||
OVERLOAD_1(int, vc4cl_bitcast_int, uint, val) |
||||
SIMPLE_1(int, vc4cl_bitcast_int, int, val, val) |
||||
|
||||
OVERLOAD_1(uint, vc4cl_bitcast_uint, float, val) |
||||
OVERLOAD_1(float, vc4cl_bitcast_float, uint, val) |
||||
OVERLOAD_1(int, vc4cl_bitcast_int, float, val) |
||||
OVERLOAD_1(float, vc4cl_bitcast_float, int, val) |
||||
|
||||
SIMPLE_1(int, vc4cl_sign_extend, char, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFF) << 24, 24)) |
||||
//SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFFFF) << 16, 16))
|
||||
SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_unpack_sext(val)) |
||||
|
||||
SIMPLE_1(uint, vc4cl_zero_extend, uchar, val, vc4cl_and(val, (arg_t) (0xFFU))) |
||||
SIMPLE_1(uint, vc4cl_zero_extend, ushort, val, vc4cl_and(val, (arg_t) (0xFFFFU))) |
||||
|
||||
SIMPLE_1(uint, vc4cl_extend, uchar, val, vc4cl_zero_extend(val)) |
||||
SIMPLE_1(int, vc4cl_extend, char, val, vc4cl_sign_extend(val)) |
||||
SIMPLE_1(uint, vc4cl_extend, ushort, val, vc4cl_zero_extend(val)) |
||||
SIMPLE_1(int, vc4cl_extend, short, val, vc4cl_sign_extend(val)) |
||||
SIMPLE_1(uint, vc4cl_extend, uint, val, val) |
||||
SIMPLE_1(int, vc4cl_extend, int, val, val) |
||||
SIMPLE_1(ulong, vc4cl_extend, ulong, val, val) |
||||
SIMPLE_1(long, vc4cl_extend, long, val, val) |
||||
|
||||
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, long, val) |
||||
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, ulong, val) |
||||
OVERLOAD_1(long, vc4cl_bitcast_long, ulong, val) |
||||
OVERLOAD_1(long, vc4cl_bitcast_long, long, val) |
||||
OVERLOAD_1(uint, vc4cl_long_to_int, ulong, val) |
||||
OVERLOAD_1(int, vc4cl_long_to_int, long, val) |
||||
OVERLOAD_1(ulong, vc4cl_int_to_ulong, uint, val) |
||||
OVERLOAD_1(long, vc4cl_int_to_long, int, val) |
||||
SIMPLE_1(ulong, vc4cl_extend_to_long, uint, val, vc4cl_int_to_ulong(val)) |
||||
SIMPLE_1(long, vc4cl_extend_to_long, int, val, vc4cl_int_to_long(val)) |
||||
OVERLOAD_2_SCALAR(int, vc4cl_long_to_int_sat, long, val, uchar, sign) |
||||
OVERLOAD_2_SCALAR(uint, vc4cl_long_to_int_sat, ulong, val, uchar, sign) |
||||
OVERLOAD_1(float, vc4cl_long_to_float, long, val) |
||||
OVERLOAD_1(float, vc4cl_ulong_to_float, ulong, val) |
||||
|
||||
/*
|
||||
* Other functions |
||||
*/ |
||||
SIMPLE_1(uchar, vc4cl_msb_set, uchar, val, vc4cl_bitcast_uchar(vc4cl_extend(val >> 7 == (arg_t)1))) |
||||
SIMPLE_1(char, vc4cl_msb_set, char, val, vc4cl_bitcast_char(vc4cl_and((arg_t)(val >> 7), (arg_t)1)) == (arg_t)1) |
||||
SIMPLE_1(ushort, vc4cl_msb_set, ushort, val, vc4cl_bitcast_ushort(vc4cl_extend(val >> 15 == (arg_t)1))) |
||||
SIMPLE_1(short, vc4cl_msb_set, short, val, vc4cl_bitcast_short(vc4cl_and((arg_t)(val >> 15), (arg_t)1)) == (arg_t)1) |
||||
SIMPLE_1(uint, vc4cl_msb_set, uint, val, vc4cl_bitcast_uint(val >> 31 == 1)) |
||||
SIMPLE_1(int, vc4cl_msb_set, int, val, (val < (arg_t)0)) |
||||
SIMPLE_1(long, vc4cl_msb_set, ulong, val, (val >> 63 == 1)) |
||||
SIMPLE_1(long, vc4cl_msb_set, long, val, (val < (arg_t)0)) |
||||
|
||||
OVERLOAD_1(int, vc4cl_is_nan, float, val) |
||||
OVERLOAD_1(int, vc4cl_is_inf_nan, float, val) |
||||
OVERLOAD_1(int, vc4cl_is_zero, float, val) |
||||
|
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul_hi, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul_hi, uint, x, uint, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(long, vc4cl_mul_full, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(ulong, vc4cl_mul_full, uint, x, uint, y, uchar, sign) |
||||
|
||||
OVERLOAD_1(uchar, vc4cl_popcount, uchar, val) |
||||
OVERLOAD_1(char, vc4cl_popcount, char, val) |
||||
OVERLOAD_1(ushort, vc4cl_popcount, ushort, val) |
||||
OVERLOAD_1(short, vc4cl_popcount, short, val) |
||||
OVERLOAD_1(uint, vc4cl_popcount, uint, val) |
||||
OVERLOAD_1(int, vc4cl_popcount, int, val) |
||||
OVERLOAD_1(ulong, vc4cl_popcount, ulong, val) |
||||
OVERLOAD_1(long, vc4cl_popcount, long, val) |
||||
|
||||
event_t vc4cl_set_event(event_t ev) CONST; |
||||
|
||||
void vc4cl_barrier(cl_mem_fence_flags); |
||||
|
||||
/*
|
||||
* Vector functions |
||||
*/ |
||||
//Rotates the vector-elements according to the offset (-15 .. +15)
|
||||
//an offset of 5 means rotate up 5 positions (e.g. x.s0 -> y.s5, x.s10 -> y.15, x.s12 -> y.s1
|
||||
//NOTE: the rotation is always all 16 elements!! So functions with vector-size of less than 16 MUST not use the positions shifted in from the remaining vector-elements
|
||||
OVERLOAD_2_SCALAR(uchar, vc4cl_vector_rotate, uchar, val, char, offset) |
||||
OVERLOAD_2_SCALAR(char, vc4cl_vector_rotate, char, val, char, offset) |
||||
OVERLOAD_2_SCALAR(ushort, vc4cl_vector_rotate, ushort, val, char, offset) |
||||
OVERLOAD_2_SCALAR(short, vc4cl_vector_rotate, short, val, char, offset) |
||||
OVERLOAD_2_SCALAR(uint, vc4cl_vector_rotate, uint, val, char, offset) |
||||
OVERLOAD_2_SCALAR(int, vc4cl_vector_rotate, int, val, char, offset) |
||||
OVERLOAD_2_SCALAR(ulong, vc4cl_vector_rotate, ulong, val, char, offset) |
||||
OVERLOAD_2_SCALAR(long, vc4cl_vector_rotate, long, val, char, offset) |
||||
OVERLOAD_2_SCALAR(float, vc4cl_vector_rotate, float, val, char, offset) |
||||
|
||||
/*
|
||||
* For debugging purposes |
||||
*/ |
||||
//The vector element number (0 .. 15)
|
||||
CONST uchar16 vc4cl_element_number(void); |
||||
//the ID of the QPU (the processor)
|
||||
CONST uchar vc4cl_qpu_number(void); |
||||
|
||||
#endif /* VC4CL_INTRINSICS_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,819 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_OVERLOADS_H |
||||
#define VC4CL_OVERLOADS_H |
||||
|
||||
#include "_config.h" |
||||
|
||||
#ifndef OVERLOADABLE |
||||
#define OVERLOADABLE __attribute__((overloadable)) |
||||
#endif |
||||
/*
|
||||
* "__attribute__((const)) function attribute |
||||
* Many functions examine only the arguments passed to them, and have no effects except for the return value. |
||||
* This is a much stricter class than __attribute__((pure)), because a function is not permitted to read global memory. |
||||
* If a function is known to operate only on its arguments then it can be subject to common sub-expression elimination and loop optimizations." |
||||
* |
||||
* http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/Cacgigch.html
|
||||
*/ |
||||
#ifndef CONST |
||||
#define CONST __attribute__((const)) //tells the compiler, that result won't change
|
||||
#endif |
||||
/*
|
||||
* "__attribute__((pure)) function attribute |
||||
* Many functions have no effects except to return a value, and their return value depends only on the parameters and global variables. |
||||
* Functions of this kind can be subject to data flow analysis and might be eliminated." |
||||
* |
||||
* http://infocenter.arm.com/help/topic/com.arm.doc.dui0491c/Cacigdac.html
|
||||
*/ |
||||
#define PURE __attribute__((pure)) |
||||
#define INLINE __attribute__((always_inline)) __attribute__((flatten)) inline //flatten inlines all call within this function
|
||||
#define FUNC_1(ret, func, argType, argName) ret func(argType argName) OVERLOADABLE |
||||
#ifndef OVERLOAD_1 |
||||
#define OVERLOAD_1(ret, func, argType, argName) \ |
||||
FUNC_1(ret##16, func, argType##16, argName); \
|
||||
FUNC_1(ret##8, func, argType##8, argName); \
|
||||
FUNC_1(ret##4, func, argType##4, argName); \
|
||||
FUNC_1(ret##3, func, argType##3, argName); \
|
||||
FUNC_1(ret##2, func, argType##2, argName); \
|
||||
FUNC_1(ret, func, argType, argName); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_1_RETURN_SCALAR |
||||
#define OVERLOAD_1_RETURN_SCALAR(ret, func, argType, argName) \ |
||||
FUNC_1(ret, func, argType##16, argName); \
|
||||
FUNC_1(ret, func, argType##8, argName); \
|
||||
FUNC_1(ret, func, argType##4, argName); \
|
||||
FUNC_1(ret, func, argType##3, argName); \
|
||||
FUNC_1(ret, func, argType##2, argName); \
|
||||
FUNC_1(ret, func, argType, argName); |
||||
#endif |
||||
|
||||
#define FUNC_2(ret, func, argType0, argName0, argType1, argName1) ret func(argType0 argName0, argType1 argName1) OVERLOADABLE |
||||
#ifndef OVERLOAD_2 |
||||
#define OVERLOAD_2(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1); \
|
||||
FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1); \
|
||||
FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1); \
|
||||
FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1); \
|
||||
FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_2_SCALAR |
||||
#define OVERLOAD_2_SCALAR(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_2_RETURN_SCALAR |
||||
#define OVERLOAD_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1); \
|
||||
FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1); \
|
||||
FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1); \
|
||||
FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1); \
|
||||
FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_2_SCALAR_RETURN_SCALAR |
||||
#define OVERLOAD_2_SCALAR_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret, func, argType0##16, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##8, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##4, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##3, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##2, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#define FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) ret func(argType0 argName0, argType1 argName1, argType2 argName2) OVERLOADABLE |
||||
#ifndef OVERLOAD_3 |
||||
#define OVERLOAD_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \ |
||||
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2); \
|
||||
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2); \
|
||||
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2); \
|
||||
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2); \
|
||||
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2); \
|
||||
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_3_SCALAR |
||||
#define OVERLOAD_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \ |
||||
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2); \
|
||||
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2); |
||||
#endif |
||||
|
||||
// Declares an overloadable 4-argument function prototype (see FUNC_1..FUNC_3).
#define FUNC_4(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3) OVERLOADABLE

// Declares an overloadable 5-argument function prototype.
#define FUNC_5(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3, arg4Type, arg4Name) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3, arg4Type arg4Name) OVERLOADABLE
||||
|
||||
#ifndef SIMPLE_1
/*
 * Defines a unary function 'func' for all vector widths (16, 8, 4, 3, 2) and
 * the scalar type, each body being "return content;". Inside 'content', the
 * typedefs arg_t / result_t name the per-width argument and result types so
 * the same expression works for every width.
 */
#define SIMPLE_1(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret##16, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##8, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##4, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##3, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##2, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_1_RETURN_SCALAR
/*
 * Like SIMPLE_1, but the return type stays scalar ('ret') for every vector
 * width of the argument (e.g. reductions of a vector to a single value).
 */
#define SIMPLE_1_RETURN_SCALAR(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_2
/*
 * Defines a binary function 'func' for all vector widths and the scalar type;
 * both arguments and the result get the same width. Inside 'content' the
 * typedefs arg0_t / arg1_t / result_t name the per-width types.
 */
#define SIMPLE_2(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_2_RETURN_SCALAR
/*
 * Like SIMPLE_2, but the return type stays scalar ('ret') for every vector
 * width of the two arguments.
 */
#define SIMPLE_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_2_SCALAR
/*
 * Like SIMPLE_2, but the SECOND argument keeps its scalar type for every
 * vector width of the first argument and the result. The all-scalar overload
 * is deliberately not generated (see trailing comment).
 */
#define SIMPLE_2_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    //scalar part is skipped, since it is too often already defined for e.g. a version taking two vectors
#endif
||||
|
||||
#ifndef SIMPLE_3
/*
 * Defines a ternary function 'func' for all vector widths and the scalar
 * type; all three arguments and the result share the width. Inside 'content'
 * the typedefs arg0_t / arg1_t / arg2_t / result_t name the per-width types.
 */
#define SIMPLE_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2##16 arg2_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2##8 arg2_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2##4 arg2_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2##3 arg2_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2##2 arg2_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_3_SCALAR
/*
 * Like SIMPLE_3, but the THIRD argument keeps its scalar type for every
 * vector width of the first two arguments and the result. The all-scalar
 * overload is deliberately not generated (see trailing comment).
 */
#define SIMPLE_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    //scalar version is skipped, since it is already defined by the vector-vector-vector version with "vector" of 1 element
#endif
||||
|
||||
#ifndef SIMPLE_3_TWO_SCALAR
/*
 * Like SIMPLE_3, but the SECOND and THIRD arguments keep their scalar types
 * for every vector width of the first argument and the result. The all-scalar
 * overload is deliberately not generated (see trailing comment).
 */
#define SIMPLE_3_TWO_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    //scalar version is skipped, since it is already defined by the vector-vector-vector version with "vector" of 1 element
#endif
||||
|
||||
#ifndef COMPLEX_1
/*
 * Like SIMPLE_1, but 'content' is a full statement block (no implicit
 * "return"), and additionally provides the width-matched helper typedefs
 * int_t and float_t for intermediate computations.
 */
#define COMPLEX_1(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret##16, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        typedef float##16 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##8, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        typedef float##8 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##4, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        typedef float##4 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##3, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        typedef float##3 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##2, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        typedef float##2 float_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef ret result_t;\
        typedef int int_t; \
        typedef float float_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_1_RETURN_SCALAR
/*
 * Like COMPLEX_1, but the return type stays scalar ('ret') for every vector
 * width of the argument. Only arg_t and int_t helper typedefs are provided
 * here (no result_t / float_t).
 */
#define COMPLEX_1_RETURN_SCALAR(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef int##16 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef int##8 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef int##4 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef int##3 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef int##2 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef int int_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_2
/*
 * Binary variant of COMPLEX_1: 'content' is a full statement block, with
 * width-matched helper typedefs arg0_t, arg1_t, result_t, int_t, uint_t and
 * float_t available inside it.
 */
#define COMPLEX_2(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        typedef uint##16 uint_t; \
        typedef float##16 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        typedef uint##8 uint_t; \
        typedef float##8 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        typedef uint##4 uint_t; \
        typedef float##4 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        typedef uint##3 uint_t; \
        typedef float##3 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        typedef uint##2 uint_t; \
        typedef float##2 float_t; \
        content \
    } \
    INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret result_t;\
        typedef int int_t; \
        typedef uint uint_t; \
        typedef float float_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_3
/*
 * Ternary variant of COMPLEX_1: 'content' is a full statement block, with
 * width-matched helper typedefs arg0_t, arg1_t, arg2_t, result_t and int_t
 * available inside it (no uint_t/float_t here, unlike COMPLEX_2).
 */
#define COMPLEX_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2##16 arg2_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2##8 arg2_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2##4 arg2_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2##3 arg2_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2##2 arg2_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        content \
    } \
    INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret result_t;\
        typedef int int_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_3_SCALAR
/*
 * Like COMPLEX_3, but the THIRD argument keeps its scalar type for every
 * vector width of the first two arguments and the result. Unlike the
 * SIMPLE_*_SCALAR macros, this one DOES generate the all-scalar overload.
 */
#define COMPLEX_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        content \
    } \
    INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret result_t;\
        typedef int int_t; \
        content \
    }
#endif
||||
|
||||
/*
 * Declares a CONST 1-argument prototype of 'func' for every supported image
 * type (read-only and write-only). The write_only image3d_t overload is
 * intentionally commented out — presumably because it requires the
 * cl_khr_3d_image_writes extension; confirm against the VC4CL runtime.
 */
#define OVERLOAD_ALL_IMAGE_TYPES(ret, func) \
    CONST FUNC_1(ret, func, read_only image1d_t, image); \
    CONST FUNC_1(ret, func, write_only image1d_t, image); \
    CONST FUNC_1(ret, func, read_only image2d_t, image); \
    CONST FUNC_1(ret, func, write_only image2d_t, image); \
    CONST FUNC_1(ret, func, read_only image3d_t, image); \
    /* XXX CONST FUNC_1(ret, func, write_only image3d_t, image); */ \
    CONST FUNC_1(ret, func, read_only image1d_buffer_t, image); \
    CONST FUNC_1(ret, func, write_only image1d_buffer_t, image); \
    CONST FUNC_1(ret, func, read_only image1d_array_t, image); \
    CONST FUNC_1(ret, func, write_only image1d_array_t, image); \
    CONST FUNC_1(ret, func, read_only image2d_array_t, image); \
    CONST FUNC_1(ret, func, write_only image2d_array_t, image);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, argType argName) for every
 * supported image type. write_only image3d_t is skipped, as in
 * OVERLOAD_ALL_IMAGE_TYPES.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_1(ret, func, argType, argName) \
    FUNC_2(ret, func, read_only image1d_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image1d_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image2d_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image2d_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image3d_t, image, argType, argName); \
    /* XXX FUNC_2(ret, func, write_only image3d_t, image, argType, argName); */ \
    FUNC_2(ret, func, read_only image1d_buffer_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image1d_buffer_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image1d_array_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image1d_array_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image2d_array_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image2d_array_t, image, argType, argName);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, arg0, arg1) for every
 * supported image type. write_only image3d_t is skipped, as above.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_2(ret, func, arg0Type, arg0Name, arg1Type, arg1Name) \
    FUNC_3(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    /* XXX FUNC_3(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); */ \
    FUNC_3(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, arg0, arg1, arg2) for every
 * supported image type. write_only image3d_t is skipped, as above.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_3(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name) \
    FUNC_4(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    /* XXX FUNC_4(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); */ \
    FUNC_4(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, arg0, arg1, arg2, arg3) for
 * every supported image type. write_only image3d_t is skipped, as above.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_4(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name) \
    FUNC_5(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    /* XXX FUNC_5(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); */ \
    FUNC_5(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name);
||||
|
||||
#endif /* VC4CL_OVERLOADS_H */ |
||||
|
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_PRINTF
#define VC4CL_PRINTF

#include "_config.h"

// NOTE(review): printf is NOT implemented for VC4CL; the sketch below is an
// unfinished draft kept commented out by the author (missing 'c' fall-through
// break, empty 's' case, and it advances formatPtr but tests *format in the
// loop condition). Consider deleting it or finishing it — confirm upstream.
//void* vc4cl_get_param(uint);
//void vc4cl_print_char(char);
//
//INLINE int printf(__constant const char * restrict format, ...)
//{
//    __constant const char* formatPtr = format;
//    uint paramIndex = 1;
//    while(*format != '\0')
//    {
//        if(*format == '%')
//        {
//            ++formatPtr;
//            switch(*formatPtr)
//            {
//            case '%':
//                vc4cl_print_char('%');
//                break;
//            case 'c':
//                vc4cl_print_char(*vc4cl_get_param(paramIndex));
//            case 's':
//
//            }
//        }
//        else
//            vc4cl_print_char(*formatPtr);
//        ++formatPtr;
//    }
//    //TODO
//    return -1;
//}

#endif /* VC4CL_PRINTF */
@ -0,0 +1,341 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_RELATIONAL_H |
||||
#define VC4CL_RELATIONAL_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
#ifndef COMPARISON_1
/*
 * Defines a unary float comparison function for all vector widths and the
 * scalar type. Vector overloads return -1 (all bits set) per true component,
 * the scalar overload returns 1 — matching OpenCL relational-function
 * semantics. 'content' is the boolean expression over 'val'.
 */
#define COMPARISON_1(func, content) \
    INLINE FUNC_1(int##16, func, float##16, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##8, func, float##8, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##4, func, float##4, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##3, func, float##3, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##2, func, float##2, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int, func, float, val) CONST \
    { /* 1 instead of -1 here on purpose! */ \
        return (content) ? 1 : 0; \
    }
#endif
||||
|
||||
#ifndef COMPARISON_2
/*
 * Defines a binary float comparison function over arguments 'x' and 'y' for
 * all vector widths and the scalar type. As with COMPARISON_1, vector
 * overloads return -1 per true component, the scalar overload returns 1.
 */
#define COMPARISON_2(func, content) \
    INLINE FUNC_2(int##16, func, float##16, x, float##16, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##8, func, float##8, x, float##8, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##4, func, float##4, x, float##4, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##3, func, float##3, x, float##3, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##2, func, float##2, x, float##2, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int, func, float, x, float, y) CONST \
    { /* 1 instead of -1 here on purpose! */ \
        return (content) ? 1 : 0; \
    }
#endif
||||
|
||||
#ifndef FOR_ALL_ELEMENTS
/*
 * Horizontally folds all elements of a vector with the binary operator 'op'
 * (after applying 'conv' to the input) and returns whether the folded value
 * is non-zero. Wide vectors are folded in log2(width) steps via
 * vc4cl_vector_rotate; narrow vectors (2, 3) fold element-wise directly.
 * Used to generate any()/all()-style reductions.
 */
#define FOR_ALL_ELEMENTS(func, type, op, conv) \
    INLINE FUNC_1(int, func, type##16, x) CONST \
    { \
        /* (s0, s1, s2, ..., sf) */ \
        type##16 val0 = conv(x); \
        /* element i becomes s_i op s_(i+1) (rotation wraps mod 16) */ \
        val0 = val0 op vc4cl_vector_rotate(val0, -1); \
        /* element i becomes the fold of 4 consecutive source elements */ \
        const type##16 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
        /* element i becomes the fold of 8 consecutive source elements */ \
        const type##16 val2 = val1 op vc4cl_vector_rotate(val1, -4); \
        /* fold of all 16 elements; NOTE(review): the extra "op val1" looks
           redundant (val2 already includes val1's fold) — harmless for the
           idempotent &/| this is used with, but confirm intent */ \
        return (val2 op val1 op vc4cl_vector_rotate(val2, -8)).x != 0; \
    } \
    INLINE FUNC_1(int, func, type##8, x) CONST \
    { \
        /* (s0, s1, ..., s7) */ \
        type##8 val0 = conv(x); \
        /* element i becomes s_i op s_(i+1) */ \
        val0 = val0 op vc4cl_vector_rotate(val0, -1); \
        /* element i becomes the fold of 4 consecutive source elements */ \
        const type##8 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
        /* fold of all 8 elements */ \
        return (val1 op vc4cl_vector_rotate(val1, -4)).x != 0; \
    } \
    INLINE FUNC_1(int, func, type##4, x) CONST \
    { \
        /* (x, y, z, w) */ \
        type##4 val0 = conv(x); \
        /* (x op y, y op z, z op w, w op ?) */ \
        val0 = val0 op vc4cl_vector_rotate(val0, -1); \
        /* (z op w, w op ?, ? op ?, ? op ?) */ \
        const type##4 val1 = vc4cl_vector_rotate(val0, -2); \
        /* (x op y op z op w, ...) */ \
        return (val0 op val1).x != 0; \
    } \
    INLINE FUNC_1(int, func, type##3, x) CONST \
    { \
        /* 3-element vector: fold directly, no rotation needed */ \
        type##3 val = conv(x); \
        return (val.x op val.y op val.z) != 0; \
    } \
    INLINE FUNC_1(int, func, type##2, x) CONST \
    { \
        type##2 val = conv(x); \
        return (val.x op val.y) != 0; \
    } \
    INLINE FUNC_1(int, func, type, x) CONST \
    { \
        /* scalar: just test the converted value */ \
        type val = conv(x); \
        return val != 0; \
    }
#endif
||||
|
||||
#ifndef SELECT_SCALAR
/*
 * Generates one scalar select(a, b, c) overload.
 * "content" is the selecting expression (evaluated with parameters a, b and c in scope).
 */
#define SELECT_SCALAR(type, maskType, content) \
	INLINE FUNC_3(type, select, type, a, type, b, maskType, c) CONST \
	{ \
		return content; \
	}
#endif
||||
|
||||
#ifndef SELECT_VECTOR
/*
 * Generates the vector select(a, b, c) overloads for all widths (2, 3, 4, 8, 16).
 * "content" must be a complete statement block (including the return); inside it,
 * int_t is typedef'ed to the int vector of the matching width, so the block can be
 * written once, width-agnostically.
 */
#define SELECT_VECTOR(type, maskType, content) \
	INLINE FUNC_3(type##2, select, type##2, a, type##2, b, maskType##2, c) CONST \
	{ \
		typedef int##2 int_t; \
		content \
	} \
	INLINE FUNC_3(type##3, select, type##3, a, type##3, b, maskType##3, c) CONST \
	{ \
		typedef int##3 int_t; \
		content \
	} \
	INLINE FUNC_3(type##4, select, type##4, a, type##4, b, maskType##4, c) CONST \
	{ \
		typedef int##4 int_t; \
		content \
	} \
	INLINE FUNC_3(type##8, select, type##8, a, type##8, b, maskType##8, c) CONST \
	{ \
		typedef int##8 int_t; \
		content \
	} \
	INLINE FUNC_3(type##16, select, type##16, a, type##16, b, maskType##16, c) CONST \
	{ \
		typedef int##16 int_t; \
		content \
	}
#endif
||||
|
||||
/*
 * The checks for NaNs as defined in the specification are done in the intrinsic of the comparison operators:
 *
 * "The relational functions isequal, isgreater, isgreaterequal, isless, islessequal, and islessgreater
 * always return 0 if either argument is not a number (NaN). isnotequal returns 1 if one or both
 * arguments are not a number (NaN) and the argument type is a scalar [...]"
 * - OpenCL 1.2, section 6.12.6 Relational Functions
 */
COMPARISON_2(isequal, x == y)
COMPARISON_2(isnotequal, x != y)
COMPARISON_2(isgreater, x > y)
COMPARISON_2(isgreaterequal, x >= y)
COMPARISON_2(isless, x < y)
COMPARISON_2(islessequal, x <= y)
/* expressed as two ordered comparisons (instead of !=), so NaN operands yield false in both */
COMPARISON_2(islessgreater, (x < y) || (x > y))
||||
|
||||
// From <cmath>: "A finite value is any floating-point value that is neither infinite nor NaN (Not-A-Number)."
COMPARISON_1(isfinite, !vc4cl_is_inf_nan(val))
/* NOTE(review): this uses NAN and INF as integer bit patterns on the raw float bits - presumably project-defined masks, verify against _config.h */
COMPARISON_1(isinf, (vc4cl_bitcast_uint(val) & NAN) == INF)
COMPARISON_1(isnan, vc4cl_is_nan(val))
// From <cmath>: "Returns whether x is a normal value: i.e., whether it is neither infinity, NaN, zero or subnormal."
COMPARISON_1(isnormal, !isinf(val) && !isnan(val) && ((vc4cl_bitcast_uint(val) & 0x7F800000) != 0) /* neither zero nor denormal */)
/* ordered: both operands compare equal to themselves, i.e. neither is NaN */
COMPARISON_2(isordered, isequal(x, x) && isequal(y, y))
COMPARISON_2(isunordered, isnan(x) || isnan(y))
||||
|
||||
// for vector, directly use asr, for scalar shr. This is way more efficient than everything else (1 instruction)
// (OpenCL 1.2 section 6.12.6: vector relational functions return -1 per true component, the scalar version returns 1)
INLINE FUNC_1(int16, signbit, float16, val) CONST
{
	/* arithmetic shift replicates the sign bit into all 32 bits -> 0 or -1 per element */
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int8, signbit, float8, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int4, signbit, float4, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int3, signbit, float3, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int2, signbit, float2, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int, signbit, float, val) CONST
{
	/* logical shift of the unsigned bits -> 0 or 1, as required for the scalar overload */
	return vc4cl_bitcast_uint(val) >> 31;
}
||||
|
||||
/* any(x): non-zero iff the most significant (sign) bit is set in any component (fold with |) */
FOR_ALL_ELEMENTS(any, char, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, short, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, int, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, long, |, vc4cl_msb_set)

/* all(x): non-zero iff the most significant (sign) bit is set in all components (fold with &) */
FOR_ALL_ELEMENTS(all, char, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, short, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, int, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, long, &, vc4cl_msb_set)
||||
|
||||
|
||||
//"Each bit of the result is the corresponding bit of a if the corresponding bit of c is 0.
// Otherwise it is the corresponding bit of b."
//based on pocl (pocl/lib/kernel/bitselect.cl)
SIMPLE_3(uchar, bitselect, uchar, a, uchar, b, uchar, c, (~c & a) | (c & b))
SIMPLE_3(char, bitselect, char, a, char, b, char, c, (~c & a) | (c & b))
SIMPLE_3(ushort, bitselect, ushort, a, ushort, b, ushort, c, (~c & a) | (c & b))
SIMPLE_3(short, bitselect, short, a, short, b, short, c, (~c & a) | (c & b))
SIMPLE_3(uint, bitselect, uint, a, uint, b, uint, c, (~c & a) | (c & b))
SIMPLE_3(int, bitselect, int, a, int, b, int, c, (~c & a) | (c & b))
SIMPLE_3(ulong, bitselect, ulong, a, ulong, b, ulong, c, (~c & a) | (c & b))
SIMPLE_3(long, bitselect, long, a, long, b, long, c, (~c & a) | (c & b))
/* floats have no bitwise operators, so the selection is done on the reinterpreted integer bits */
SIMPLE_3(float, bitselect, float, a, float, b, float, c, vc4cl_bitcast_float((~vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(a)) | (vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(b))))
||||
|
||||
//"For a scalar type, result = c ? b : a."
/* one instantiation per (value type, mask type) combination required by the OpenCL specification */
SELECT_SCALAR(uchar, uchar, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uchar, char, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(char, uchar, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(char, char, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ushort, ushort, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ushort, short, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(short, ushort, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(short, short, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uint, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uint, int, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(int, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(int, int, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ulong, ulong, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ulong, long, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(long, ulong, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(long, long, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(float, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(float, int, vc4cl_extend(c) ? b : a)
||||
|
||||
//"For each component of a vector type, result[i] = if MSB of c[i] is set ? b[i] : a[i]"
SELECT_VECTOR(uchar, uchar,
{
	/* move the char's sign bit (bit 7) into bit 31, then arithmetic-shift to replicate it -> per-element 0/-1 mask */
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
	/* blend on the extended 32-bit lanes: mask selects b, ~mask selects a */
	return vc4cl_bitcast_uchar(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(uchar, char,
{
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
	return vc4cl_bitcast_uchar(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(char, char,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(char, uchar,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ushort, ushort,
{
	/* sign bit of the short (bit 15) shifted into bit 31, then replicated via asr */
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
	return vc4cl_bitcast_ushort(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(ushort, short,
{
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
	return vc4cl_bitcast_ushort(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(short, short,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(short, ushort,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(uint, uint,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(uint, int,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(int, int,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(int, uint,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ulong, ulong,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ulong, long,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(long, long,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(long, ulong,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(float, uint,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(float, int,
{
	return vc4cl_msb_set(c) ? b : a;
})
||||
|
||||
#undef COMPARISON_1 |
||||
#undef COMPARISON_2 |
||||
#undef FOR_ALL_ELEMENTS |
||||
#undef SELECT_SCALAR |
||||
#undef SELECT_VECTOR |
||||
|
||||
#endif /* VC4CL_RELATIONAL_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_SYNCHRONIZATION_H |
||||
#define VC4CL_SYNCHRONIZATION_H |
||||
|
||||
#include "_config.h" |
||||
#include "_work_items.h" |
||||
|
||||
/*
 * OpenCL work-group barrier: delegates directly to the VC4CL intrinsic,
 * which receives the memory-fence flags (CLK_LOCAL_MEM_FENCE / CLK_GLOBAL_MEM_FENCE) unchanged.
 */
INLINE void barrier(cl_mem_fence_flags flags) OVERLOADABLE
{
	vc4cl_barrier(flags);
}
||||
|
||||
/*
|
||||
* We do not declare read_mem_fence() and write_mem_fence(), since: |
||||
* - The SPIRV-LLVM-Translator (in older versions, e.g. 7.0) can't handle them passing a non-const flags to the mem_fence() function |
||||
* - We anyway handle mem_fence(), read_mem_fence() and write_mem_fence() in both front-ends the exact same way |
||||
*/ |
||||
#endif /* VC4CL_SYNCHRONIZATION_H */ |
||||
|
@ -0,0 +1,265 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_VECTOR_H |
||||
#define VC4CL_VECTOR_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
|
||||
#ifndef VECTOR_LOAD
/*
 * Generates the vloadN overloads (N = 2, 3, 4, 8, 16) for one element type in all
 * address spaces (__global, __local, __constant, __private).
 * Data is read starting at element (ptr + offset * N), per the OpenCL vloadN contract.
 * NOTE(review): vload3 goes through the vc4cl_vload3 helper instead of a pointer cast,
 * presumably because a 3-element vector has the size/alignment of 4 elements and a
 * reinterpret-cast load would read one element too many - verify against _intrinsics.
 */
#define VECTOR_LOAD(type) \
	INLINE type##2 vload2(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##16 *)(ptr + offset * 16)); \
	} \
	/* __local overloads */ \
	INLINE type##2 vload2(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##16 *)(ptr + offset * 16)); \
	} \
	/* __constant overloads */ \
	INLINE type##2 vload2(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##16 *)(ptr + offset * 16)); \
	} \
	/* __private overloads */ \
	INLINE type##2 vload2(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##16 *)(ptr + offset * 16)); \
	}
#endif
||||
|
||||
#ifndef VECTOR_STORE
/*
 * Generates the vstoreN overloads (N = 2, 3, 4, 8, 16) for one element type in the
 * writable address spaces (__global, __local, __private).
 * Data is written starting at element (ptr + offset * N), per the OpenCL vstoreN contract.
 * NOTE(review): vstore3 goes through vc4cl_vstore3 - presumably to avoid writing the
 * padding element a plain type##3 pointer store would touch; verify against _intrinsics.
 */
#define VECTOR_STORE(type) \
	INLINE void vstore2(type##2 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##2 *)(ptr + offset * 2)) = data; \
	} \
	INLINE void vstore3(type##3 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		vc4cl_vstore3(ptr + offset * 3, data); \
	} \
	INLINE void vstore4(type##4 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##4 *)(ptr + offset * 4)) = data; \
	} \
	INLINE void vstore8(type##8 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##8 *)(ptr + offset * 8)) = data; \
	} \
	INLINE void vstore16(type##16 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##16 *)(ptr + offset * 16)) = data; \
	} \
	/* __local overloads */ \
	INLINE void vstore2(type##2 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##2 *)(ptr + offset * 2)) = data; \
	} \
	INLINE void vstore3(type##3 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		vc4cl_vstore3(ptr + offset * 3, data); \
	} \
	INLINE void vstore4(type##4 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##4 *)(ptr + offset * 4)) = data; \
	} \
	INLINE void vstore8(type##8 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##8 *)(ptr + offset * 8)) = data; \
	} \
	INLINE void vstore16(type##16 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##16 *)(ptr + offset * 16)) = data; \
	} \
	/* __private overloads */ \
	INLINE void vstore2(type##2 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##2 *)(ptr + offset * 2)) = data; \
	} \
	INLINE void vstore3(type##3 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		vc4cl_vstore3(ptr + offset * 3, data); \
	} \
	INLINE void vstore4(type##4 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##4 *)(ptr + offset * 4)) = data; \
	} \
	INLINE void vstore8(type##8 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##8 *)(ptr + offset * 8)) = data; \
	} \
	INLINE void vstore16(type##16 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##16 *)(ptr + offset * 16)) = data; \
	}
#endif
||||
|
||||
#ifndef VECTOR_SHUFFLE_2
/*
 * Generates shuffle2(x, y, mask) overloads for one input width "num" and all result
 * widths, via clang's __builtin_shufflevector.
 * NOTE(review): currently only referenced from the commented-out section below -
 * the builtin requires compile-time constant indices, which a mask parameter is not.
 */
#define VECTOR_SHUFFLE_2_INTERNAL(type, maskType, num) \
	INLINE type##2 shuffle2(type##num x, type##num y, maskType##2 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.x, mask.y); \
	} \
	INLINE type##4 shuffle2(type##num x, type##num y, maskType##4 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.x, mask.y, mask.z, mask.w); \
	} \
	INLINE type##8 shuffle2(type##num x, type##num y, maskType##8 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7); \
	} \
	INLINE type##16 shuffle2(type##num x, type##num y, maskType##16 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7, mask.s8, mask.s9, mask.sa, mask.sb, mask.sc, mask.sd, mask.se, mask.sf); \
	} \

/* instantiates the above for all input widths of one (type, maskType) pair */
#define VECTOR_SHUFFLE_2(type, maskType) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 2) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 4) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 8) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 16)
#endif
||||
|
||||
#ifndef VECTOR_SHUFFLE
/*
 * Generates single-input shuffle(val, mask) overloads by forwarding to shuffle2
 * with the same vector for both inputs (see VECTOR_SHUFFLE_2 above; likewise only
 * used from the commented-out section below).
 */
#define VECTOR_SHUFFLE_INTERNAL(type, maskType, num) \
	INLINE type##2 shuffle(type##num val, maskType##2 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \
	INLINE type##4 shuffle(type##num val, maskType##4 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \
	INLINE type##8 shuffle(type##num val, maskType##8 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \
	INLINE type##16 shuffle(type##num val, maskType##16 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \

/* instantiates the above for all input widths of one (type, maskType) pair */
#define VECTOR_SHUFFLE(type, maskType) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 2) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 4) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 8) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 16)
#endif
||||
|
||||
/* instantiate the vloadN overloads for every supported element type */
VECTOR_LOAD(uchar)
VECTOR_LOAD(char)
VECTOR_LOAD(ushort)
VECTOR_LOAD(short)
VECTOR_LOAD(uint)
VECTOR_LOAD(int)
VECTOR_LOAD(float)
VECTOR_LOAD(ulong)
VECTOR_LOAD(long)

/* instantiate the vstoreN overloads for every supported element type */
VECTOR_STORE(uchar)
VECTOR_STORE(char)
VECTOR_STORE(ushort)
VECTOR_STORE(short)
VECTOR_STORE(uint)
VECTOR_STORE(int)
VECTOR_STORE(float)
VECTOR_STORE(ulong)
VECTOR_STORE(long)

//TODO vload(a)_half, vload(a)_halfn (+rounding) (load half and return converted to float, possible with unpack-modes)
//TODO vstore(a)_half, vstore(a)_halfn (+rounding) (store float as half in memory, possible with pack modes)
||||
|
||||
/*
|
||||
* TODO shuffle2, but LLVM fails, since the indices for the __builtin intrinsic need to be constant integers! |
||||
VECTOR_SHUFFLE_2(uchar, uchar) |
||||
VECTOR_SHUFFLE_2(char, uchar) |
||||
VECTOR_SHUFFLE_2(ushort, ushort) |
||||
VECTOR_SHUFFLE_2(short, ushort) |
||||
VECTOR_SHUFFLE_2(uint, uint) |
||||
VECTOR_SHUFFLE_2(int, uint) |
||||
VECTOR_SHUFFLE_2(float, uint) |
||||
|
||||
VECTOR_SHUFFLE(uchar, uchar) |
||||
VECTOR_SHUFFLE(char, uchar) |
||||
VECTOR_SHUFFLE(ushort, ushort) |
||||
VECTOR_SHUFFLE(short, ushort) |
||||
VECTOR_SHUFFLE(uint, uint) |
||||
VECTOR_SHUFFLE(int, uint) |
||||
VECTOR_SHUFFLE(float, uint) |
||||
*/ |
||||
|
||||
//shuffle/shuffle2 are handled via intrinsifying the OpenCL function
|
||||
|
||||
#undef VECTOR_LOAD |
||||
#undef VECTOR_STORE |
||||
#undef VECTOR_SHUFFLE_2_INTERNAL |
||||
#undef VECTOR_SHUFFLE_2 |
||||
#undef VECTOR_SHUFFLE_INTERNAL |
||||
#undef VECTOR_SHUFFLE |
||||
|
||||
#endif /* VC4CL_VECTOR_H */ |
||||
|
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_WORK_ITEMS_H |
||||
#define VC4CL_WORK_ITEMS_H |
||||
|
||||
#include "_intrinsics.h" |
||||
#include "_overloads.h" |
||||
|
||||
/* number of dimensions the kernel was enqueued with */
INLINE uint get_work_dim(void) OVERLOADABLE CONST
{
	return vc4cl_work_dimensions();
}

/* total number of global work-items in the given dimension */
INLINE size_t get_global_size(uint dim) OVERLOADABLE CONST
{
	return vc4cl_global_size(dim);
}

/* unique global index of this work-item in the given dimension */
INLINE size_t get_global_id(uint dim) OVERLOADABLE CONST
{
	return vc4cl_global_id(dim);
}

/* number of work-items per work-group in the given dimension */
INLINE size_t get_local_size(uint dim) OVERLOADABLE CONST
{
	return vc4cl_local_size(dim);
}

INLINE size_t get_enqueued_local_size(uint dimindx) OVERLOADABLE CONST
{
	// "Returns the same value as that returned by get_local_size(dimindx) if the kernel is executed with a uniform
	// work-group size."
	return vc4cl_local_size(dimindx);
}

/* index of this work-item within its work-group in the given dimension */
INLINE size_t get_local_id(uint dim) OVERLOADABLE CONST
{
	return vc4cl_local_id(dim);
}

/* number of work-groups in the given dimension */
INLINE size_t get_num_groups(uint dim) OVERLOADABLE CONST
{
	return vc4cl_num_groups(dim);
}

/* index of this work-item's work-group in the given dimension */
INLINE size_t get_group_id(uint dim) OVERLOADABLE CONST
{
	return vc4cl_group_id(dim);
}

/* global offset the kernel was enqueued with in the given dimension */
INLINE size_t get_global_offset(uint dim) OVERLOADABLE CONST
{
	return vc4cl_global_offset(dim);
}

/* work-item's global index linearized over all dimensions */
INLINE size_t get_global_linear_id() OVERLOADABLE CONST
{
	return vc4cl_global_linear_id();
}

/* work-item's index within its work-group, linearized over all dimensions */
INLINE size_t get_local_linear_id() OVERLOADABLE CONST
{
	return vc4cl_local_linear_id();
}
||||
|
||||
#endif /* VC4CL_WORK_ITEMS_H */ |
@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_DEFINES_H
#define VC4CL_DEFINES_H

/* OpenCL version constants (match the values of the host-side CL_VERSION_x_y macros) */
#ifndef CL_VERSION_1_0
#define CL_VERSION_1_0 100
#endif
#ifndef CL_VERSION_1_1
#define CL_VERSION_1_1 110
#endif
#ifndef CL_VERSION_1_2
#define CL_VERSION_1_2 120
#endif
#ifndef CL_VERSION_2_0
#define CL_VERSION_2_0 200
#endif
#ifndef CL_VERSION_2_1
#define CL_VERSION_2_1 210
#endif
#ifndef CL_VERSION_2_2
#define CL_VERSION_2_2 220
#endif

/* this implementation provides an OpenCL 1.2 embedded-profile, little-endian device */
#undef __OPENCL_VERSION__
#define __OPENCL_VERSION__ CL_VERSION_1_2
#undef __OPENCL_C_VERSION__
#define __OPENCL_C_VERSION__ CL_VERSION_1_2
#ifndef __ENDIAN_LITTLE__
#define __ENDIAN_LITTLE__ 1
#endif
#ifndef __EMBEDDED_PROFILE__
#define __EMBEDDED_PROFILE__ 1
#endif
//#ifndef __IMAGE_SUPPORT__
//#define __IMAGE_SUPPORT__ 1
//#endif
#undef __IMAGE_SUPPORT__

/* supported KHR extensions */
#ifndef cl_khr_global_int32_base_atomics
#define cl_khr_global_int32_base_atomics
#endif
#ifndef cl_khr_local_int32_base_atomics
#define cl_khr_local_int32_base_atomics
#endif
#ifndef cl_khr_global_int32_extended_atomics
#define cl_khr_global_int32_extended_atomics
#endif
#ifndef cl_khr_local_int32_extended_atomics
#define cl_khr_local_int32_extended_atomics
#endif
#ifndef cl_khr_byte_addressable_store
#define cl_khr_byte_addressable_store
#endif
#ifndef cl_khr_initialize_memory
#define cl_khr_initialize_memory
#endif

/* image-related extensions only make sense when image support is enabled above */
#ifdef __IMAGE_SUPPORT__
#ifndef cl_khr_3d_image_writes
#define cl_khr_3d_image_writes
#endif
#ifndef cl_intel_packed_yuv
#define cl_intel_packed_yuv
#endif
#else
#undef cl_khr_3d_image_writes
#undef cl_intel_packed_yuv
#endif

// additional supported extensions (need to set flag here, since the module is loaded too late)
#define cl_nv_pragma_unroll 1
#define cl_arm_core_id 1
#define cl_ext_atomic_counters_32 1
#define cl_arm_integer_dot_product_int8 1
#define cl_arm_integer_dot_product_accumulate_int8 1
#define cl_arm_integer_dot_product_accumulate_int16 1
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1

// unsupported extensions or optional core features
#undef cl_khr_fp16
#undef cl_khr_fp64
#undef cl_khr_int64_base_atomics
#undef cl_khr_int64_extended_atomics
#undef cl_khr_depth_images
#undef cl_khr_gl_depth_images
#undef cl_khr_gl_msaa_sharing
#undef cl_amd_media_ops
#undef cl_amd_media_ops2
// unsupported host-only extensions (disable for safety)
#undef cl_khr_gl_sharing
#undef cl_khr_gl_event
#undef cl_khr_d3d10_sharing
#undef cl_khr_dx9_media_sharing
#undef cl_khr_d3d11_sharing
#undef cl_khr_image2d_from_buffer
#undef cl_khr_terminate_context
#undef cl_khr_egl_image
#undef cl_khr_egl_event

#endif /* VC4CL_DEFINES_H */
Loading…
Reference in new issue