mirror of https://github.com/Qortal/Brooklyn
Raziel K. Crowe
2 years ago
33 changed files with 28681 additions and 0 deletions
@ -0,0 +1,60 @@
|
||||
# Minimum required for the features used; the range form lets newer CMake
# apply its newer (non-deprecated) policy behavior.
cmake_minimum_required(VERSION 3.1...3.27)

####
# General configuration
####
# Option whether to create deb package
option(BUILD_DEB_PACKAGE "Enables creating .deb package" ON)
# Option whether to compile for raspberry-pi (default: ON, for the compatibility)
option(CROSS_COMPILE "Cross compile for Raspbian" ON)
option(BUILD_EXPERIMENTAL "Build experimental test program" OFF)

# BUILD_NUMBER is expected to be injected by CI; fall back to a sentinel.
if(NOT BUILD_NUMBER)
  set(BUILD_NUMBER 9999)
endif()

project(VC4CLStdLib VERSION 0.4.${BUILD_NUMBER})

# Include headers in the project structure.
# NOTE: file(GLOB) does not pick up newly added headers until the next manual
# re-configure; acceptable here since the list only feeds IDE grouping.
file(GLOB HDRS "${PROJECT_SOURCE_DIR}/include/*.h")
add_library(VC4CLStdLib STATIC ${HDRS})
set_target_properties(VC4CLStdLib PROPERTIES LINKER_LANGUAGE C)

##
# Installation targets
##
# Adds the public headers to the target, so they are exported
target_include_directories(VC4CLStdLib PUBLIC
  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:include/vc4cl-stdlib>)
# Creates the install target for the headers
install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/" DESTINATION include/vc4cl-stdlib FILES_MATCHING PATTERN "*.h")
# Adds custom uninstall command.
# NOTE(review): the script path is resolved relative to the build directory at
# run time -- confirm cmake_uninstall.cmake is generated/present there.
add_custom_target(uninstall "${CMAKE_COMMAND}" -P "cmake_uninstall.cmake")

if(BUILD_EXPERIMENTAL)
  add_subdirectory(experimental)
endif()

####
# Building package
####
if(BUILD_DEB_PACKAGE)
  message(STATUS "build deb package...")

  set(CPACK_GENERATOR "DEB")
  set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
  set(CPACK_PACKAGE_NAME "vc4cl-stdlib")
  string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%d")
  set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}-${BUILD_TIMESTAMP}")
  if(CROSS_COMPILE)
    set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "armhf")
  else()
    set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
  endif()
  set(CPACK_PACKAGE_VENDOR "doe300")
  set(CPACK_PACKAGE_CONTACT "[email protected]")
  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL C headers for the VC4CL implementation (raspberrypi only)")
  set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/doe300/VC4CLStdLib")
  # Derive the artifact name from the configured version instead of the
  # previously hard-coded "0.4", so version bumps propagate automatically.
  set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}-Linux")
  include(CPack)
endif()
@ -0,0 +1,21 @@
|
||||
MIT License |
||||
|
||||
Copyright (c) 2022 |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
@ -0,0 +1,4 @@
|
||||
# VC4CLStdLib |
||||
|
||||
Implementation of the OpenCL standard library, which is required to build the [VC4C] compiler.
||||
|
@ -0,0 +1,7 @@
|
||||
find_package(OpenCL REQUIRED)
find_package(Threads REQUIRED)

# Host-side benchmark comparing different kernel implementations for accuracy
# and speed.
add_executable(compare_implementations compare_implementations.cpp)
target_compile_features(compare_implementations PRIVATE cxx_std_14)
# Keep debug symbols with light optimization for profiling the harness itself.
target_compile_options(compare_implementations PRIVATE -g -Og)
# PRIVATE: the dependencies are only needed to build the executable itself
# (keyword added -- the keyword-less signature has legacy semantics).
target_link_libraries(compare_implementations PRIVATE OpenCL::OpenCL Threads::Threads)
@ -0,0 +1,91 @@
|
||||
#define arg_t float16
#define result_t float16
#define int_t int16
#define uint_t uint16

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// Rough n-th root: divides the (biased) IEEE 754 exponent by n and re-inserts
// it, keeping sign and mantissa bits.
// Adapted from: https://web.archive.org/web/20131227144655/http://metamerist.com/cbrt/cbrt.htm
result_t approx_rootn(arg_t x, int_t n)
{
	int_t bits = CAT(as_, int_t)(x);
	int_t scaledExp = (bits - (int_t) (127 << 23)) / n + (int_t) (127 << 23);
	return CAT(as_, result_t)((bits & (int_t) 0x807FFFFF) | (scaledExp));
}

// Rough cube root via integer arithmetic directly on the raw float bits.
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t approx_cbrt(arg_t f)
{
	uint_t bits = CAT(as_, uint_t)(f);
	bits = bits / 3 + 709921077;
	return CAT(as_, result_t)(bits);
}
||||
|
||||
// One Halley-iteration step refining estimate x towards cbrt(base).
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_halley_step(arg_t x, arg_t base)
{
	result_t cube = x * x * x;
	return x * (cube + base + base) / (cube + cube + base);
}

// Cube root: exponent-based first guess plus 4 Halley iterations, with the
// sign re-applied at the end.
result_t cbrt_halley(arg_t val)
{
	arg_t magnitude = fabs(val);
	arg_t estimate = approx_rootn(magnitude, 3);

	result_t refined = estimate;
#pragma loop unroll
	for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
	{
		refined = cbrt_halley_step(refined, val);
	}
	return copysign(refined, val);
}

__kernel void cbrt_halley_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = cbrt_halley(in[idx]);
}
||||
|
||||
// One Newton-iteration step refining estimate x towards cbrt(base).
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_newton_step(arg_t x, arg_t base)
{
	return x - (1.0f / 3.0f) * (x - base / (x * x));
}

// Cube root: integer-bit-trick first guess plus 4 Newton iterations, with the
// sign re-applied at the end.
result_t cbrt_newton(arg_t val)
{
	arg_t magnitude = fabs(val);
	arg_t estimate = approx_cbrt(magnitude);

	result_t refined = estimate;
#pragma loop unroll
	for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
	{
		refined = cbrt_newton_step(refined, val);
	}
	return copysign(refined, val);
}

__kernel void cbrt_newton_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = cbrt_newton(in[idx]);
}
||||
|
||||
// Reference implementation: the built-in cbrt.
__kernel void cbrt_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = cbrt(in[idx]);
}

// Reference implementation: cube root computed as pow(x, 1/3).
__kernel void cbrt_pow_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = pow(in[idx], 1.0f / 3.0f);
}
@ -0,0 +1,404 @@
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 120 |
||||
#define CL_HPP_CL_1_2_DEFAULT_BUILD 1 |
||||
#define CL_HPP_ENABLE_EXCEPTIONS 1 |
||||
#define CL_HPP_TARGET_OPENCL_VERSION 120 |
||||
#define CL_HPP_MINIMUM_OPENCL_VERSION 120 |
||||
#include <CL/cl.hpp> |
||||
|
||||
#include <algorithm> |
||||
#include <chrono> |
||||
#include <cmath> |
||||
#include <cstdlib> |
||||
#include <cstring> |
||||
#include <fstream> |
||||
#include <functional> |
||||
#include <iostream> |
||||
#include <limits> |
||||
#include <random> |
||||
#include <sstream> |
||||
#include <stdexcept> |
||||
#include <string> |
||||
#include <unistd.h> // geteuid() |
||||
#include <vector> |
||||
|
||||
static constexpr uint32_t DEFAULT_NUM_LINEAR = 12 * 16 * 8; |
||||
static constexpr uint32_t DEFAULT_NUM_RANDOM = 12 * 16 * 8; |
||||
|
||||
// VC4CL performance counters
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 10) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_IDLE_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 11) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL (CL_PROFILING_COMMAND_END + 12) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 13) |
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_L2_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 14) |
||||
|
||||
// Closed interval [min, max] from which test input values are drawn.
struct Range
{
    float min;
    float max;
};
||||
|
||||
// Type-erased wrapper around a host-side reference function taking 1 to 3
// float arguments. The arity is recorded at construction; calling with a
// mismatched arity throws std::runtime_error.
struct ReferenceFunction
{
    ReferenceFunction(float (*func)(float)) : numParameters(1), ptr(reinterpret_cast<void *>(func)) {}
    ReferenceFunction(float (*func)(float, float)) : numParameters(2), ptr(reinterpret_cast<void *>(func)) {}
    ReferenceFunction(float (*func)(float, float, float)) : numParameters(3), ptr(reinterpret_cast<void *>(func)) {}

    float operator()(float val) const
    {
        checkArity(1);
        return reinterpret_cast<float (*)(float)>(ptr)(val);
    }

    float operator()(float val0, float val1) const
    {
        checkArity(2);
        return reinterpret_cast<float (*)(float, float)>(ptr)(val0, val1);
    }

    float operator()(float val0, float val1, float val2) const
    {
        checkArity(3);
        return reinterpret_cast<float (*)(float, float, float)>(ptr)(val0, val1, val2);
    }

    // Applies the wrapped function element-wise; inputs holds one vector per
    // function parameter, all of equal length.
    std::vector<float> operator()(const std::vector<std::vector<float>> &inputs) const
    {
        std::vector<float> out(inputs.front().size());
        for(std::size_t i = 0; i < out.size(); ++i)
        {
            switch(numParameters)
            {
            case 1:
                out[i] = (*this)(inputs[0][i]);
                break;
            case 2:
                out[i] = (*this)(inputs[0][i], inputs[1][i]);
                break;
            case 3:
                out[i] = (*this)(inputs[0][i], inputs[1][i], inputs[2][i]);
                break;
            }
        }
        return out;
    }

    uint8_t numParameters;
    void *ptr;

private:
    // Rejects calls whose argument count does not match the stored arity.
    void checkArity(uint8_t expected) const
    {
        if(numParameters != expected)
            throw std::runtime_error{"Reference function called with the wrong number of arguments"};
    }
};
||||
|
||||
// Description of a single precision test case.
struct Test
{
    std::string name;            // identifier used to select the test on the command line
    ReferenceFunction reference; // host-side reference implementation
    uint32_t allowedErrorInUlp;  // maximum tolerated deviation from the reference
    std::string sourceFile;      // OpenCL C source file containing the kernels under test
    std::vector<Range> ranges;   // input value ranges to check
};
||||
|
||||
// Pass-through reference function; exercises pure data transfer.
static float identity(float val) { return val; }
||||
|
||||
// XXX OpenCL-CTS calculates reference in double, thus is more accurate. So tests being accurate here might not be in
|
||||
// the CTS!
|
||||
// Registry of all available tests: name, host reference, allowed ULP error,
// kernel source file and the input ranges to check.
static const std::vector<Test> floatTests = {
    Test{"log", logf, 4, "log.cl",
        {
            {0.5, 1.0}, // reduced range some implementations use
            {std::numeric_limits<float>::min(), std::numeric_limits<float>::max()} // full range
        }},
    Test{"exp", expf, 4, "exp.cl",
        {
            {0.0, 0.5f * logf(2.0f)}, // reduced range some implementations use
            {-87.0f /* everything below e^-87 is subnormal */, 89.0f /* everything above e^89 is Inf */} // full range
        }},
    Test{"identity", identity, 0, "identity.cl",
        {
            {-10.0f, 10.0f}, {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
        }},
    Test{"cbrt", cbrtf, 4, "cbrt.cl",
        {
            {-1.0, 1.0}, // limited range for precision testing
            {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
        }},
    Test{"fma", fmaf, 0, "fma.cl",
        {
            {-100.0f, 100.0f}, // reduced range to not run into NaN/Inf
            {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
        }}};
||||
|
||||
static std::vector<float> generateInputData(const Range &range, uint32_t numLinear, uint32_t numRandom) |
||||
{ |
||||
std::vector<float> data{}; |
||||
data.reserve(numLinear + numRandom); |
||||
auto step = (range.max - range.min) / static_cast<float>(numLinear); // TODO overflows on full ranges
|
||||
for(float val = range.min; val < range.max; val += step) |
||||
data.emplace_back(val); |
||||
|
||||
std::random_device rd{}; |
||||
std::default_random_engine gen(rd()); |
||||
std::uniform_real_distribution<> dist{range.min, range.max}; |
||||
|
||||
while(data.size() < (numLinear + numRandom)) |
||||
data.emplace_back(static_cast<float>(dist(gen))); |
||||
|
||||
return data; |
||||
} |
||||
|
||||
static std::vector<std::vector<float>> generateInputData( |
||||
const Range &range, uint32_t numLinear, uint32_t numRandom, uint8_t numInputs) |
||||
{ |
||||
std::vector<std::vector<float>> data{}; |
||||
for(uint8_t i = 0; i < numInputs; ++i) |
||||
data.emplace_back(generateInputData(range, numLinear, numRandom)); |
||||
return data; |
||||
} |
||||
|
||||
// Reads the test's OpenCL C source file, builds it for the given context and
// returns every kernel the program defines.
static std::vector<cl::Kernel> createKernels(const cl::Context &context, const Test &test)
{
    std::stringstream sourceCode;
    {
        // Scoped so the file handle is released right after reading
        std::ifstream sourceFile{test.sourceFile};
        sourceCode << sourceFile.rdbuf();
    }
    cl::Program program(context, sourceCode.str(), true);

    std::vector<cl::Kernel> kernels;
    program.createKernels(&kernels);
    return kernels;
}
||||
|
||||
// A single mismatch between reference and device result, sortable so the
// worst deviations come first.
struct ErrorResult
{
    std::vector<float> inputValues;
    float expected;
    float actual;
    uint32_t errorInUlp;

    // ordered by "most wrong" first
    bool operator<(const ErrorResult &other) const noexcept
    {
        if(errorInUlp != other.errorInUlp)
            return errorInUlp > other.errorInUlp;
        return inputValues < other.inputValues;
    }

    friend std::ostream &operator<<(std::ostream &os, const ErrorResult &error)
    {
        os << "Error of " << error.errorInUlp << " ULP for " << std::scientific;
        switch(error.inputValues.size())
        {
        case 1:
            os << error.inputValues.front();
            break;
        case 2:
            os << '{' << error.inputValues.front() << ", " << error.inputValues.back() << '}';
            break;
        case 3:
            os << '{' << error.inputValues[0] << ", " << error.inputValues[1] << ", " << error.inputValues[2] << '}';
            break;
        default:
            os << '{';
            for(auto input : error.inputValues)
                os << input << ", ";
            os << '}';
            break;
        }
        os << ", expected " << error.expected << ", got " << error.actual << std::defaultfloat;
        return os;
    }
};
||||
|
||||
// Reinterprets the bit pattern of In as Out.
// Replaces the previous union-based type punning: reading a union member
// other than the last one written is undefined behavior in C++ (unlike C).
// std::memcpy is the portable, well-defined idiom and compiles to the same
// code; <cstring> is already included by this file.
template <typename Out, typename In>
static Out bit_cast(In val)
{
    static_assert(sizeof(Out) == sizeof(In), "bit_cast requires types of equal size");
    Out out;
    std::memcpy(&out, &val, sizeof(Out));
    return out;
}
||||
|
||||
static uint32_t calculateError(float reference, float result, uint32_t allowedErrorInUlp) |
||||
{ |
||||
if(std::isinf(reference) && std::isinf(result) && std::signbit(reference) == std::signbit(result)) |
||||
return 0; |
||||
if(std::isnan(reference) && std::isnan(result)) |
||||
return 0; |
||||
// auto ulp = std::abs(reference * std::numeric_limits<float>::epsilon());
|
||||
// float difference = std::abs(result - reference);
|
||||
// if(difference > static_cast<float>(allowedErrorInUlp))
|
||||
// return static_cast<uint32_t>(std::ceil(difference / ulp));
|
||||
// return 0;
|
||||
return static_cast<uint32_t>(std::abs(bit_cast<int32_t>(reference) - bit_cast<int32_t>(result))); |
||||
} |
||||
|
||||
static std::pair<std::vector<ErrorResult>, uint32_t> checkResults(const std::vector<std::vector<float>> &inputs, |
||||
const std::vector<float> &reference, const std::vector<float> &result, uint32_t allowedErrorInUlp) |
||||
{ |
||||
std::vector<ErrorResult> errors; |
||||
uint32_t maxError = 0; |
||||
|
||||
for(std::size_t i = 0; i < std::min(reference.size(), result.size()); ++i) |
||||
{ |
||||
auto error = calculateError(reference.at(i), result.at(i), allowedErrorInUlp); |
||||
maxError = std::max(maxError, error); |
||||
if(error > allowedErrorInUlp) |
||||
{ |
||||
std::vector<float> errorInputs; |
||||
for(const auto &input : inputs) |
||||
errorInputs.push_back(input.at(i)); |
||||
errors.emplace_back(ErrorResult{std::move(errorInputs), reference.at(i), result.at(i), error}); |
||||
} |
||||
} |
||||
|
||||
std::sort(errors.begin(), errors.end()); |
||||
return std::make_pair(std::move(errors), maxError); |
||||
} |
||||
|
||||
// Runs one precision test: for every configured range, generates input data,
// executes each kernel from the test's source file on the device, measures
// host- and device-side timings and reports all results exceeding the
// allowed ULP error.
static void runTest(
    const cl::Context &context, const cl::CommandQueue &queue, const Test &test, uint32_t numLinear, uint32_t numRandom)
{
    std::cout << "Running test " << test.sourceFile << " ..." << std::endl;
    std::cout << "\tRunning " << test.ranges.size() << " ranges with " << (numLinear + numRandom) << " values"
              << std::endl;
    auto kernels = createKernels(context, test);
    std::cout << "\tTesting " << kernels.size() << " implementations " << std::endl;

    for(const auto &range : test.ranges)
    {
        auto inputs = generateInputData(range, numLinear, numRandom, test.reference.numParameters);
        auto inputSize = inputs.front().size();
        // One work-item per 16 values (the kernels operate on float16 vectors)
        cl::NDRange globalSize(inputSize / 16);
        std::vector<float> reference = test.reference(inputs);

        std::vector<cl::Buffer> inputBuffers;
        for(auto &input : inputs)
            inputBuffers.emplace_back(queue, input.begin(), input.end(), true);
        cl::Buffer outputBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, inputSize * sizeof(float));

        for(auto &kernel : kernels)
        {
            // Argument 0 is the output buffer, arguments 1..N the input buffers
            kernel.setArg(0, outputBuffer);
            for(std::size_t i = 0; i < inputBuffers.size(); ++i)
                kernel.setArg(1 + i, inputBuffers[i]);

            std::cout << "\tRunning kernel '" << kernel.getInfo<CL_KERNEL_FUNCTION_NAME>() << "' with "
                      << (inputSize / 16) << " work-items ... " << std::endl;
            auto start = std::chrono::steady_clock::now();
            cl::Event kernelEvent{};
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, cl::NullRange, nullptr, &kernelEvent);
            kernelEvent.wait();
            auto end = std::chrono::steady_clock::now();
            // Host wall-clock time, including enqueue/synchronization overhead
            std::cout << "\t- Finished in "
                      << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us"
                      << std::endl;
            // Pure device execution time from the OpenCL profiling timestamps
            std::chrono::nanoseconds deviceDuration{kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
                kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()};
            std::cout << "\t- Executed for "
                      << std::chrono::duration_cast<std::chrono::microseconds>(deviceDuration).count() << " us"
                      << std::endl;
            if(geteuid() == 0) // TODO only on hardware
            {
                // VC4CL-specific hardware performance counters (root only)
                cl_ulong numInstructions = 0;
                kernelEvent.getProfilingInfo(
                    CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL, &numInstructions);
                cl_ulong numCycles = 0;
                kernelEvent.getProfilingInfo(CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL, &numCycles);
                std::cout << "\t- Executed " << numInstructions << " instructions in " << numCycles << " cycles"
                          << std::endl;
            }

            std::vector<float> result(inputSize);
            queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, inputSize * sizeof(float), result.data());
            auto errors = checkResults(inputs, reference, result, test.allowedErrorInUlp);
            std::cout << "\t- Has " << errors.first.size() << " wrong results and a maximum error of " << errors.second
                      << " ULP (of allowed " << test.allowedErrorInUlp << " ULP)" << std::endl;
            // Print at most the 8 worst offenders
            for(std::size_t i = 0; i < std::min(errors.first.size(), std::size_t{8}); ++i)
                std::cout << "\t\t" << errors.first[i] << std::endl;
            if(errors.first.size() > 8)
                std::cout << "\t\t[...]" << std::endl;
        }
    }
}
||||
|
||||
// Prints usage information, the supported options with their defaults and the
// names of all registered tests.
static void printHelp()
{
    std::cout << "Usage: <program> [<options>] <test> [<test>...]" << std::endl;
    std::cout << "Options: " << std::endl;
    std::cout << "\t--help Shows this help message" << std::endl;
    std::cout << "\t--linear=<num> Specifies the number of linear test values, defaults to " << DEFAULT_NUM_LINEAR
              << std::endl;
    std::cout << "\t--random=<num> Specifies the number of random test values, defaults to " << DEFAULT_NUM_RANDOM
              << std::endl;
    std::cout << "Available tests: ";
    for(const auto &test : floatTests)
        std::cout << test.name << ", ";
    std::cout << std::endl;
}
||||
|
||||
int main(int argc, char **argv) |
||||
{ |
||||
uint32_t numLinear = DEFAULT_NUM_LINEAR; |
||||
uint32_t numRandom = DEFAULT_NUM_RANDOM; |
||||
|
||||
if(argc < 2) |
||||
{ |
||||
printHelp(); |
||||
return EXIT_SUCCESS; |
||||
} |
||||
|
||||
auto platform = cl::Platform::get(); |
||||
cl::Device device{}; |
||||
{ |
||||
std::vector<cl::Device> devices; |
||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); |
||||
if(devices.empty()) |
||||
{ |
||||
std::cout << "No device found!" << std::endl; |
||||
return EXIT_FAILURE; |
||||
} |
||||
device = devices.front(); |
||||
} |
||||
cl::Context context(device); |
||||
cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE); |
||||
|
||||
std::vector<std::reference_wrapper<const Test>> selectedTests; |
||||
for(int i = 1; i < argc; ++i) |
||||
{ |
||||
if(argv[i][0] == '-') |
||||
{ |
||||
if(std::string{"--help"} == argv[i]) |
||||
{ |
||||
printHelp(); |
||||
return EXIT_SUCCESS; |
||||
} |
||||
else if(strstr(argv[i], "--linear=") == argv[i]) |
||||
numLinear = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--linear="))); |
||||
else if(strstr(argv[i], "--random=") == argv[i]) |
||||
numRandom = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--random="))); |
||||
else |
||||
{ |
||||
std::cout << "Unknown option: " << argv[i] << std::endl; |
||||
printHelp(); |
||||
return EXIT_FAILURE; |
||||
} |
||||
} |
||||
auto testIt = |
||||
std::find_if(floatTests.begin(), floatTests.end(), [&](const Test &test) { return test.name == argv[i]; }); |
||||
if(testIt != floatTests.end()) |
||||
selectedTests.emplace_back(std::cref(*testIt)); |
||||
else |
||||
{ |
||||
std::cout << "No such test '" << argv[i] << "', available tests: "; |
||||
for(const auto &test : floatTests) |
||||
std::cout << test.name << ", "; |
||||
std::cout << std::endl; |
||||
return EXIT_FAILURE; |
||||
} |
||||
} |
||||
|
||||
for(const auto &test : selectedTests) |
||||
runTest(context, queue, test.get(), numLinear, numRandom); |
||||
|
||||
return EXIT_SUCCESS; |
||||
} |
@ -0,0 +1,364 @@
|
||||
|
||||
#define arg_t float16
#define result_t float16
#define int_t int16

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// vc4cl_split(double) of M_LN2
#define M_LN2_FF 0xB102E3083F317218

float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));

// Computes 2^val by assembling the IEEE 754 bit pattern directly:
// implied 1.0 mantissa with biased exponent val + 127.
result_t pow2(int_t val)
{
	int_t biasedExp = val << 23;
	// alternative: biasedExp = (val + 127) << 23;
	biasedExp += (int_t) 0x3F800000;
	return CAT(as_, result_t)(biasedExp & (int_t) 0x7F800000);
}
||||
|
||||
// Returns k for the range reduction e^val = 2^k * e^r (r = val - k * ln(2)).
int_t powerOfTwo(arg_t val)
{
	// Original code, produces Inf for e^(~10^38)
	// return CAT(convert_, int_t)(ceil((val / M_LN2_F) - 0.5f));
	// Using floor() instead of ceil(),
	// - fixes Inf for large exponents
	// - slightly reduces accuracy of Chebyshev implementations (by ~4 ULP),
	// - greatly reduces accuracy of Taylor (<10 ULP -> >1200 ULP) -> requires more iterations
	return CAT(convert_, int_t)(floor((val / M_LN2_F) - 0.5f));
}
||||
|
||||
/*
 * Taylor series with Horner's method and range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-6
 */
result_t exp_taylor(arg_t val)
{
	arg_t magnitude = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(magnitude);
	arg_t r = magnitude - CAT(convert_, arg_t)(k) * M_LN2_F;

	// term holds r^i / i!, sum accumulates the partial series
	arg_t term = 1.0f;
	arg_t sum = 1.0f;
#pragma loop unroll
	for(int i = 1; i < 10; i++) // TODO can adjust number of iterations
	{
		term *= r / i;
		sum += term;
	}

	sum = sum * pow2(k);
	// e^-x = 1 / e^x for negative inputs
	return val < 0 ? 1 / sum : sum;
}

__kernel void exp_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = exp_taylor(in[idx]);
}
||||
|
||||
// Taylor series evaluated with VC4CL's extended-precision helpers
// (vc4cl_add/sub/mul on ulong16 operands; exact semantics are defined by the
// VC4CL runtime -- presumably split high/low float pairs, cf. the M_LN2_FF
// comment above -- TODO confirm).
result_t exp_taylor_extended_precision_exact(arg_t val)
{
	arg_t positive = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(positive);
	ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

	ulong16 tk = 0x000000003F800000; // 1.0
	ulong16 tn = 0x000000003F800000; // 1.0

	// Manually unrolled series: tk *= r * (1/i), tn += tk, with the
	// reciprocals pre-encoded as extended-precision constants.
	tk = vc4cl_mul(tk, r);
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003F000000)); // 1 / 2
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB22AAAAB3EAAAAAB)); // 1 / 3
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E800000)); // 1 / 4
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB14CCCCD3E4CCCCD)); // 1 / 5
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1AAAAAB3E2AAAAB)); // 1 / 6
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB1DB6DB73E124925)); // 1 / 7
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0x000000003E000000)); // 1 / 8
	tn = vc4cl_add(tn, tk);

	tk = vc4cl_mul(tk, vc4cl_mul(r, 0xB0638E393DE38E39)); // 1 / 9
	tn = vc4cl_add(tn, tk);
	// removing any iteration makes the result inaccurate (removing last iteration gives 19 ULP)

	result_t result = vc4cl_lossy(tn) * pow2(k);
	return val < 0 ? 1.0f / result : result;
}

// __kernel void exp_taylor_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
//     uint gid = get_global_id(0);
//     out[gid] = exp_taylor_extended_precision_exact(in[gid]);
// }
||||
|
||||
// TODO Lagrange and Barycentric interpolations from https://www.pseudorandom.com/implementing-exp |
||||
|
||||
/*
 * Chebyshev interpolation with range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-18
 */
result_t exp_chebyshev(arg_t val)
{
	// XXX could remove unneeded coefficients once we fix precision
	const float coefficients[] = {
		1.266065877752008335598244625214717537923,
		1.130318207984970054415392055219726613610,
		0.2714953395340765623657051399899818507081,
		0.04433684984866380495257149525979922986386,
		0.00547424044209373265027616843118645948703,
		0.000542926311913943750362147810307554678760,
		0.00004497732295429514665469032811091269841937,
		3.198436462401990505863872976602295688795e-6,
		1.992124806672795725961064384805589035648e-7,
		1.103677172551734432616996091335324170860e-8,
		5.50589607967374725047142040200552692791e-10,
		2.497956616984982522712010934218766985311e-11,
		1.039152230678570050499634672423840849837e-12,
		3.991263356414401512887720401532162026594e-14,
	};
	arg_t magnitude = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(magnitude);
	arg_t r = magnitude - CAT(convert_, arg_t)(k) * M_LN2_F;

	// Chebyshev recurrence T(n+1) = 2*r*T(n) - T(n-1), summed with the
	// coefficients above
	arg_t tPrev = 1.0f;
	arg_t tCurr = r;
	arg_t p = coefficients[0] + (coefficients[1] * r);
#pragma loop unroll
	for(int i = 2; i < 8; i++) // TODO can adjust number of iterations
	{
		arg_t tNext = (2 * r * tCurr) - tPrev;
		p += coefficients[i] * tNext;
		tPrev = tCurr;
		tCurr = tNext;
	}

	p = p * pow2(k);
	return val < 0 ? 1 / p : p;
}

__kernel void exp_chebyshev_kernel(__global arg_t *out, const __global arg_t *in)
{
	uint idx = get_global_id(0);
	out[idx] = exp_chebyshev(in[idx]);
}
||||
|
||||
// Chebyshev interpolation evaluated with VC4CL's extended-precision helpers
// (vc4cl_add/sub/mul on ulong16 operands; exact semantics are defined by the
// VC4CL runtime -- presumably split high/low float pairs, cf. the M_LN2_FF
// comment above -- TODO confirm).
result_t exp_chebyshev_extended_precision_exact(arg_t val)
{
	arg_t positive = fabs(val);

	// range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
	int_t k = powerOfTwo(positive);
	ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

	ulong16 ti = 0x000000003F800000; // 1.0
	ulong16 tj = r;
	// 1.266065877752008335598244625214717537923 and 1.130318207984970054415392055219726613610
	ulong16 p = vc4cl_add(0x333386C33FA20E72, vc4cl_mul(0x33395E683F90AE44, r));
	r = vc4cl_mul(r, 0x0000000040000000); // 2.0

	// Manually unrolled Chebyshev recurrence: tk = 2*r*tj - ti, p += c * tk,
	// with each coefficient c pre-encoded as an extended-precision constant.
	ulong16 tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0xB13AF4A23E8B0170, tk)); // 0.2714953395340765623657051399899818507081
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0xB0FC8DF03D359A8F, tk)); // 0.04433684984866380495257149525979922986386
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0xAEA95A453BB36142, tk)); // 0.00547424044209373265027616843118645948703
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x2B7994663A0E532B, tk)); // 0.000542926311913943750362147810307554678760
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x2BC988B0383CA608, tk)); // 0.00004497732295429514665469032811091269841937
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x29A61EF43656A4B8, tk)); // 3.198436462401990505863872976602295688795e-6
	ti = tj;
	tj = tk;

	tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
	p = vc4cl_add(p, vc4cl_mul(0x26B66C3C3455E71C, tk)); // 1.992124806672795725961064384805589035648e-7
	ti = tj;
	tj = tk;
	// removing any iteration makes the result inaccurate (removing last iteration gives 5 ULP)

	result_t result = vc4cl_lossy(p) * pow2(k);
	return val < 0 ? 1.0f / result : result;
}

// __kernel void exp_chebyshev_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
//     uint gid = get_global_id(0);
//     out[gid] = exp_chebyshev_extended_precision_exact(in[gid]);
// }
||||
|
||||
/* |
||||
* Chebyshev interpolation with monomial basis and range reduction, |
||||
* |
||||
* https://www.pseudorandom.com/implementing-exp#section-18 |
||||
*/ |
||||
result_t exp_chebyshev_monomial(arg_t val)
{
    // XXX could remove unneeded coefficients once we fix precision
    // Coefficients ordered from highest to lowest degree, so the Horner loop
    // below can walk the array front to back (degree-14 seed is separate).
    const float horner_coefficients[] = {
        1.632461784798319e-10,
        2.088459690899721e-9,
        2.504861486483735e-8,
        2.755715675968011e-7,
        2.755734045527853e-6,
        2.480158866546844e-5,
        1.984126978734782e-4,
        0.001388888888388,
        0.008333333333342,
        0.041666666666727,
        0.166666666666680,
        0.500000000000002,
        1.000000000000000,
        1.000000000000000,
    };
    arg_t magnitude = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(magnitude);
    arg_t r = magnitude - CAT(convert_, arg_t)(k) * M_LN2_F;

    // Horner evaluation, seeded with the degree-14 coefficient
    arg_t poly = 1.143364767943110e-11;
#pragma loop unroll
    for(int i = 0; i < 14; i++)
    {
        poly = poly * r + horner_coefficients[i];
    }

    // undo the range reduction
    poly = poly * pow2(k);
    // negative arguments via e^-x = 1 / e^x
    return val < 0 ? 1 / poly : poly;
}
||||
|
||||
// Test kernel: one work-item evaluates exp_chebyshev_monomial() for one element.
__kernel void exp_chebyshev_monomial_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = exp_chebyshev_monomial(in[index]);
}
||||
|
||||
// Same Chebyshev/monomial approximation as exp_chebyshev_monomial, but the
// range-reduction remainder r is computed with extended precision before the
// (single-precision) Horner evaluation.
result_t exp_chebyshev_monomial_exact(arg_t val)
{
    // Horner coefficients, highest degree first (degree-14 seed is separate)
    const float horner_coefficients[] = {
        1.632461784798319e-10f,
        2.088459690899721e-9f,
        2.504861486483735e-8f,
        2.755715675968011e-7f,
        2.755734045527853e-6f,
        2.480158866546844e-5f,
        1.984126978734782e-4f,
        0.001388888888388f,
        0.008333333333342f,
        0.041666666666727f,
        0.166666666666680f,
        0.500000000000002f,
        1.000000000000000f,
        1.000000000000000f,
    };
    arg_t magnitude = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(magnitude);
    arg_t kFloat = CAT(convert_, arg_t)(k);
    // extended-precision subtraction keeps the reduction remainder exact
    arg_t r = vc4cl_lossy(vc4cl_sub(vc4cl_extend(magnitude), vc4cl_mul(vc4cl_extend(kFloat), M_LN2_FF)));

    arg_t poly = 1.143364767943110e-11;
#pragma loop unroll
    for(int i = 0; i < 14; i++)
    {
        poly = poly * r + horner_coefficients[i];
    }

    // undo the range reduction; negative arguments via e^-x = 1 / e^x
    poly = poly * pow2(k);
    return val < 0 ? 1 / poly : poly;
}
||||
|
||||
// Test kernel: one work-item evaluates exp_chebyshev_monomial_exact() for one element.
__kernel void exp_chebyshev_monomial_exact_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = exp_chebyshev_monomial_exact(in[index]);
}
||||
|
||||
// Extended-precision variant of exp_chebyshev_monomial: the whole Horner
// evaluation runs on the extended-float representation (packed into ulong16)
// via the vc4cl_* helpers and is rounded back to single precision only once,
// at the very end. Each hex literal is the extended-precision encoding of the
// decimal coefficient given in its trailing comment.
result_t exp_chebyshev_monomial_extended_precision_exact(arg_t val)
{
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(positive);
    // r is kept in extended precision for the entire polynomial evaluation
    ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

    // Horner scheme, seeded with the degree-14 coefficient, applying the
    // remaining coefficients from highest to lowest degree
    ulong16 pn = 0x209249252D492492; // 1.143364767943110e-11
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA21249252F337DB7); // 1.632461784798319e-10
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x24924925310F8492); // 2.088459690899721e-9
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA65B6DB732D72A7D); // 2.504861486483735e-8
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA85B6DB73493F245); // 2.755715675968011e-7
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA9FDB6DB3638EF27); // 2.755734045527853e-6
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAB60000037D00D02); // 2.480158866546844e-5
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAC65BDB739500D01); // 1.984126978734782e-4
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAE161D323AB60B61); // 0.001388888888388
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAFEEEDB73C088889); // 0.008333333333342
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xB0AAA88B3D2AAAAB); // 0.041666666666727
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xB1AAAA8D3E2AAAAB); // 0.166666666666680
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x271000003F000000); // 0.500000000000002
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000

    // round back to single precision and undo the range reduction;
    // negative arguments via e^-x = 1 / e^x
    result_t result = vc4cl_lossy(pn) * pow2(k);
    return val < 0 ? 1.0f / result : result;
}
||||
|
||||
// __kernel void exp_chebyshev_monomial_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in) |
||||
// { |
||||
// uint gid = get_global_id(0); |
||||
// out[gid] = exp_chebyshev_monomial_extended_precision_exact(in[gid]); |
||||
// } |
||||
|
||||
// TODO Remes from www.netlib.org/fdlibm/e_exp.c |
||||
|
||||
// TODO Matters computational (sections 32.2.2.2 and 32.2.3) |
||||
// Pade Approximation (16 steps): (1680 + 840x + 180 x^2 + 20 x^3 + x^4) / (1680 - 840 x + 180 x^2 - 20 x^3 + x^4) |
||||
|
||||
// TODO https://math.stackexchange.com/questions/1988901/approximating-the-exponential-function-with-taylor-series?rq=1 |
||||
// TODO http://www.netlib.org/fdlibm/ |
||||
|
||||
// Test kernel: reference results via the built-in exp(), for comparison with
// the hand-written approximations above.
__kernel void exp_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = exp(in[index]);
}
@ -0,0 +1,46 @@
|
||||
#define arg_t float16 |
||||
#define result_t float16 |
||||
#define int_t int16 |
||||
#define uint_t uint16 |
||||
|
||||
#define CONCAT(a, b) a##b |
||||
#define CAT(a, b) CONCAT(a, b) |
||||
|
||||
float16 vc4cl_lossy(ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable)); |
||||
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable)); |
||||
|
||||
// Reference implementation of fma: in0 * in1 + in2 in plain single-precision
// arithmetic (i.e. with an intermediate rounding of the product, unlike a
// true fused multiply-add).
result_t fma_simple(arg_t in0, arg_t in1, arg_t in2)
{
    // Fix: fma is a multiply-ADD; the previous code computed in0 * in1 * in2,
    // which is not an fma at all (compare fma_extended_precision below and
    // the built-in fma() used in fma_builtin_kernel).
    return in0 * in1 + in2;
}
||||
|
||||
// Test kernel: one work-item evaluates fma_simple() for one element triple.
__kernel void fma_simple_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    uint index = get_global_id(0);
    out[index] = fma_simple(in0[index], in1[index], in2[index]);
}
||||
|
||||
// fma via the extended-precision helpers: multiply without intermediate
// rounding, add the extended addend, then round back to float exactly once.
result_t fma_extended_precision(arg_t in0, arg_t in1, arg_t in2)
{
    return vc4cl_lossy(vc4cl_add(vc4cl_mul(in0, in1), vc4cl_extend(in2)));
}
||||
|
||||
// Test kernel: one work-item evaluates fma_extended_precision() for one element triple.
__kernel void fma_extended_precision_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    uint index = get_global_id(0);
    out[index] = fma_extended_precision(in0[index], in1[index], in2[index]);
}
||||
|
||||
// Test kernel: reference results via the built-in fma().
__kernel void fma_builtin_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    uint index = get_global_id(0);
    out[index] = fma(in0[index], in1[index], in2[index]);
}
@ -0,0 +1,9 @@
|
||||
#define arg_t float16 |
||||
#define result_t float16 |
||||
#define int_t int16 |
||||
|
||||
// Copies every input element to the output unchanged (one element per work-item).
__kernel void identity_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = in[index];
}
@ -0,0 +1,256 @@
|
||||
#define arg_t float16 |
||||
#define result_t float16 |
||||
#define int_t int16 |
||||
|
||||
/* |
||||
* Helper, arithmetic-geometric-mean, |
||||
* |
||||
* https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean |
||||
*/ |
||||
/*
 * Helper: arithmetic-geometric mean of x and y, fixed 6 iterations.
 *
 * https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean
 */
result_t agm(arg_t x, arg_t y)
{
    arg_t a = x; // running arithmetic mean
    arg_t g = y; // running geometric mean
    for(unsigned i = 0; i < 6; ++i) // TODO can adjust number of iterations
    {
        // both updates must read the previous (a, g) pair
        arg_t a_next = (a + g) / (arg_t) 2.0;
        g = sqrt(a * g);
        a = a_next;
    }
    return a;
}
||||
|
||||
#define CONCAT(a, b) a##b |
||||
#define CAT(a, b) CONCAT(a, b) |
||||
|
||||
/*
 * Splits `val` = S * M * 2^E into a mantissa M in [0.5, 1) (sign preserved)
 * and the exponent contribution E * ln(2), by bit-manipulating the IEEE 754
 * single-precision representation.
 * Declares `mantissa` (arg_t) and `reduced` (result_t) in the expanding
 * scope; callers compute log(mantissa) and add `reduced`.
 */
#define REDUCE_ARGUMENT_TO_0_1 \
    /* log(S * M * 2^E) = log(S * M) + E log(2) */ \
    int_t bitcast = CAT(as_, int_t)(val); \
    /* deduct exponent offset, we use -126, to go into the range [0.5, 1) */ \
    int_t exponent = ((bitcast >> 23) & 0xFF) - 126; \
    /* mask off exponent and replace with exponent for range [0.5, 1) */ \
    int_t signedMantissaBits = (bitcast & (int_t) 0x807FFFFF) | (int_t) 0x3F000000; \
    arg_t mantissa = CAT(as_, result_t)(signedMantissaBits); \
    result_t reduced = CAT(convert_, result_t)(exponent) * M_LN2_F;
||||
|
||||
/* |
||||
* Taylor-series, |
||||
* |
||||
* https://en.wikipedia.org/wiki/Mercator_series |
||||
*/ |
||||
result_t log1p_taylor(arg_t val)
{
    // ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
    // converges for -1 < x <= 1 (requires argument reduction)

    // declares `mantissa` in [0.5, 1) and `reduced` = exponent * ln(2)
    REDUCE_ARGUMENT_TO_0_1

    // NOTE(review): the series below is evaluated at `mantissa` itself, i.e.
    // it sums to ln(1 + mantissa), whereas the reduction yields val =
    // mantissa * 2^exponent and would need ln(mantissa) (series in
    // mantissa - 1). Confirm against the calling kernel and expected results.

    // iteration 1
    result_t result = mantissa;
    arg_t power = mantissa; // mantissa^iteration
#pragma loop unroll
    for(unsigned iteration = 2; iteration <= 26; ++iteration) // TODO can adjust number of iterations
    {
        power *= mantissa;
        // alternating sign: + for odd, - for even iterations
        arg_t sign = iteration & 1 ? (arg_t) 1.0 : (arg_t) -1.0;
        result = result + sign * (arg_t) (1.0 / iteration) * power;
    }
    // add back the exponent contribution from the range reduction
    return result + reduced;
}
||||
|
||||
// Test kernel: since log1p(x) = log(1 + x), feed in[gid] - 1 to test log(in[gid]).
__kernel void log1p_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log1p_taylor(in[index] - (arg_t) 1.0f);
}
||||
|
||||
result_t log1p_taylor_unrolled(arg_t val)
{
    // ln (1 + x) = x - x^2/2 + x^3/3 - x^4/4
    // converges for -1 < x <= 1 (requires argument reduction)

    REDUCE_ARGUMENT_TO_0_1

    // Manually unrolled Mercator series: `term` tracks mantissa^k, `acc` the
    // alternating partial sum. Term 1 is mantissa itself; terms 2..26 follow
    // with signs -, +, -, ...
    result_t acc = mantissa;
    arg_t term = mantissa;

    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 2.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 3.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 4.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 5.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 6.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 7.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 8.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 9.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 10.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 11.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 12.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 13.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 14.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 15.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 16.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 17.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 18.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 19.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 20.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 21.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 22.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 23.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 24.0) * term;
    term *= mantissa;
    acc = acc + (arg_t) (1.0 / 25.0) * term;
    term *= mantissa;
    acc = acc - (arg_t) (1.0 / 26.0) * term;

    // TODO can adjust number of iterations

    // add back the exponent contribution from the range reduction
    return acc + reduced;
}
||||
|
||||
// Test kernel: since log1p(x) = log(1 + x), feed in[gid] - 1 to test log(in[gid]).
__kernel void log1p_taylor_unrolled_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log1p_taylor_unrolled(in[index] - (arg_t) 1.0f);
}
||||
|
||||
/* |
||||
* Taylor series with optimization, requires argument reduction, |
||||
* |
||||
* https://math.stackexchange.com/a/3383716 |
||||
*/ |
||||
result_t log_taylor(arg_t val)
{
    // declares `mantissa` in [0.5, 1) and `reduced` = exponent * ln(2)
    REDUCE_ARGUMENT_TO_0_1

    // artanh-based series: ln(m) = 2 * (t + t^3/3 + t^5/5 + ...) with
    // t = (m - 1) / (m + 1).
    // Fix: the factor 2 must scale the whole sum. The previous version folded
    // it into t before computing factor = tmp * tmp, so every term was
    // (2t)^k / k — too large by a power of 4 for all terms after the first.
    result_t result = 0;
    // iteration 1
    arg_t tmp = (mantissa - (arg_t) 1.0) / (mantissa + (arg_t) 1.0);
    arg_t factor = tmp * tmp; // advances tmp by two odd powers per round
#pragma loop unroll
    for(unsigned iteration = 1; iteration <= 26; iteration += 2) // TODO can adjust number of iterations
    {
        result += tmp / (arg_t) iteration;
        tmp *= factor;
    }
    // double the sum and add back the exponent contribution
    return (arg_t) 2.0 * result + reduced;
}
||||
|
||||
// Test kernel: one work-item evaluates log_taylor() for one element.
__kernel void log_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log_taylor(in[index]);
}
||||
|
||||
/* |
||||
* Using the arithmetic-geometric-mean, |
||||
* |
||||
* https://en.wikipedia.org/wiki/Natural_logarithm#High_precision |
||||
*/ |
||||
result_t log_agm(arg_t val)
{
    // ln(x) ~ pi / (2 * AGM(1, 4 / s)) - m * ln(2), with s = x * 2^m
    const unsigned m = 8; // TODO can adjust for precision
    arg_t s = val * (arg_t) (1 << m);
    // explicit cast of the scalar 1.0, consistent with the other literals here
    arg_t mean = agm((arg_t) 1.0, (arg_t) 4.0 / s);
    // Fix: the AGM formula's numerator is pi alone; the previous version
    // multiplied it by `val`, which skews the result for any argument != 1
    // (e.g. val = 2 previously gave ~6.93 instead of ln(2) ~ 0.693).
    return ((arg_t) M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2);
}
||||
|
||||
// Test kernel: one work-item evaluates log_agm() for one element.
__kernel void log_agm_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log_agm(in[index]);
}
||||
|
||||
result_t log_agm_reduced(arg_t val)
{
    // declares `mantissa` in [0.5, 1) and `reduced` = exponent * ln(2)
    REDUCE_ARGUMENT_TO_0_1

    // ln(mantissa) ~ pi / (2 * AGM(1, 4 / s)) - m * ln(2), with s = mantissa * 2^m
    const unsigned m = 8; // TODO can adjust for precision
    arg_t s = mantissa * (arg_t) (1 << m);
    // explicit cast of the scalar 1.0, consistent with the other literals here
    arg_t mean = agm((arg_t) 1.0, (arg_t) 4.0 / s);
    // Fix: the AGM formula's numerator is pi alone; the previous version
    // multiplied it by `mantissa`, which skews the result (same defect as in
    // log_agm above).
    return ((arg_t) M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2) + reduced;
}
||||
|
||||
// Test kernel: one work-item evaluates log_agm_reduced() for one element.
__kernel void log_agm_reduced_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log_agm_reduced(in[index]);
}
||||
|
||||
// Test kernel: reference results via the built-in log().
__kernel void log_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint index = get_global_id(0);
    out[index] = log(in[index]);
}
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* General header for the VC4CLStdlib implementation, contains all required headers |
||||
* |
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CLSTDLIB_H |
||||
#define VC4CLSTDLIB_H |
||||
|
||||
#ifdef __cplusplus |
||||
extern "C" |
||||
{ |
||||
#endif |
||||
|
||||
#include "_config.h" |
||||
#include "_extensions.h" |
||||
#include "_conversions.h" |
||||
#include "_common.h" |
||||
#include "_math.h" |
||||
#include "_integer.h" |
||||
#include "_geometric.h" |
||||
#include "_relational.h" |
||||
#include "_work_items.h" |
||||
#include "_vector.h" |
||||
#include "_synchronization.h" |
||||
#include "_async.h" |
||||
#include "_atomics.h" |
||||
#include "_images.h" |
||||
#include "_printf.h" |
||||
#include "_spir_mangling.h" |
||||
#include "_clcxx_mangling.h" |
||||
|
||||
#undef ALL_BITS_SET |
||||
#undef OVERLOADABLE |
||||
#undef CONST |
||||
#undef PURE |
||||
#undef INLINE |
||||
#undef FUNC_1 |
||||
#undef OVERLOAD_1 |
||||
#undef OVERLOAD_1_RETURN_SCALAR |
||||
#undef FUNC_2 |
||||
#undef OVERLOAD_2 |
||||
#undef OVERLOAD_2_SCALAR |
||||
#undef OVERLOAD_2_RETURN_SCALAR |
||||
#undef OVERLOAD_2_SCALAR_RETURN_SCALAR |
||||
#undef FUNC_3 |
||||
#undef OVERLOAD_3 |
||||
#undef OVERLOAD_3_SCALAR |
||||
#undef FUNC_4 |
||||
#undef FUNC_5 |
||||
#undef SIMPLE_1 |
||||
#undef SIMPLE_1_RETURN_SCALAR |
||||
#undef SIMPLE_2 |
||||
#undef SIMPLE_2_RETURN_SCALAR |
||||
#undef SIMPLE_2_SCALAR |
||||
#undef SIMPLE_3 |
||||
#undef SIMPLE_3_SCALAR |
||||
#undef SIMPLE_3_TWO_SCALAR |
||||
#undef COMPLEX_1 |
||||
#undef COMPLEX_1_RETURN_SCALAR |
||||
#undef COMPLEX_2 |
||||
#undef COMPLEX_3 |
||||
#undef COMPLEX_3_SCALAR |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_1 |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_2 |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_3 |
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_4 |
||||
|
||||
#ifdef __cplusplus |
||||
} |
||||
#endif |
||||
|
||||
#endif /* VC4CLSTDLIB_H */ |
||||
|
@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_ASYNC_H |
||||
#define VC4CL_ASYNC_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
|
||||
|
||||
/*
|
||||
* This is a synchronous/blocking implementation. |
||||
* The copy is "performed by all work-items in a work-group", so any work-item only has to copy a part of the area. |
||||
 * Or, since memory copies on different QPUs block each other, we can simply execute the copying only on the first work-item
||||
* (index 0, 0, 0). Idea taken from PoCL |
||||
*/ |
||||
|
||||
/*
 * Shared body of the async_work_group_copy() overloads: only the first
 * work-item (local id 0) performs the DMA copy, serialized by the hardware
 * mutex (see the file comment above). Expects `dst`, `src` and `num_elements`
 * to be visible in the expanding scope.
 */
#define ASYNC_COPY_INTERNAL \
    if(vc4cl_local_id(0) == 0) \
    { \
        vc4cl_mutex_lock(); \
        vc4cl_dma_copy(dst, src, num_elements); \
        vc4cl_mutex_unlock(); \
    }
||||
|
||||
/*
 * Instantiates all async_work_group_copy() overloads for the given element
 * type — scalar plus vector widths 2/3/4/8/16, in both directions
 * (__global -> __local and __local -> __global).
 * Every overload runs the blocking copy via ASYNC_COPY_INTERNAL and returns
 * the event produced by vc4cl_set_event().
 */
#define ASYNC_COPY(type) \
    INLINE event_t async_work_group_copy(__local type * dst, const __global type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type * dst, const __local type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
|
||||
/*
 * Gathers num_elements elements from `src` with stride `src_stride` into the
 * contiguous buffer `dst`. Note: unlike ASYNC_COPY_INTERNAL there is no
 * local-id-0 guard here, so every work-item executes the full loop.
 */
#define ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
    for (size_t i = 0; i < num_elements; ++i) \
        dst[i] = src[i * src_stride];
//TODO better way, e.g. via vc4cl_dma_copy and stride-parameter?

/*
 * Scatters num_elements contiguous elements from `src` into `dst` with
 * stride `dst_stride` (same caveat as above).
 */
#define ASYNC_STRIDED_DEST_COPY_INTERNAL \
    for (size_t i = 0; i < num_elements; ++i) \
        dst[i * dst_stride] = src[i];
|
||||
/*
 * Instantiates all async_work_group_strided_copy() overloads for the given
 * element type — scalar plus vector widths 2/3/4/8/16. Gather variants
 * (__global -> __local) take `src_stride`, scatter variants
 * (__local -> __global) take `dst_stride`. Each returns the event produced
 * by vc4cl_set_event().
 */
#define ASYNC_STRIDED_COPY(type) \
    INLINE event_t async_work_group_strided_copy(__local type * dst, const __global type * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type * dst, const __local type * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
||||
|
||||
/*
 * Instantiates all prefetch() overloads for the given element type (scalar
 * plus vector widths 2/3/4/8/16); each simply forwards to vc4cl_prefetch().
 */
#define PREFETCH(type) \
    INLINE void prefetch(const __global type * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##2 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##3 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##4 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##8 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##16 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    }
||||
|
||||
/*
|
||||
* OpenCL 1.2, page 278: |
||||
* "Perform an async copy of num_gentypes gentype elements from src to dst. |
||||
* The async copy is performed by all work-items in a work-group and this built-in |
||||
* function must therefore be encountered by all work-items in a work-group executing the kernel with the same argument values." |
||||
*/ |
||||
ASYNC_COPY(uchar) |
||||
ASYNC_COPY(char) |
||||
ASYNC_COPY(ushort) |
||||
ASYNC_COPY(short) |
||||
ASYNC_COPY(uint) |
||||
ASYNC_COPY(int) |
||||
ASYNC_COPY(float) |
||||
|
||||
ASYNC_STRIDED_COPY(uchar) |
||||
ASYNC_STRIDED_COPY(char) |
||||
ASYNC_STRIDED_COPY(ushort) |
||||
ASYNC_STRIDED_COPY(short) |
||||
ASYNC_STRIDED_COPY(uint) |
||||
ASYNC_STRIDED_COPY(int) |
||||
ASYNC_STRIDED_COPY(float) |
||||
|
||||
/*
|
||||
* OpenCL 1.2, page 279: |
||||
* "Wait for events that identify the async_work_group_copy operations to complete. |
||||
* The event objects specified in event_list will be released after the wait is performed." |
||||
*/ |
||||
// Both parameters are intentionally unused: the copies are synchronous, so
// there are no per-event completion states to wait on or release.
INLINE void wait_group_events(int num_events, event_t* event_list) OVERLOADABLE
{
    // async_work_group_copy is blocking, so we don't need to wait for any asynchronous operation to finish
    // But: Since the copy is only performed on the first work-item, we need to wait for it to finish
    // NOTE(review): copies targeting __local memory might additionally need
    // CLK_LOCAL_MEM_FENCE — confirm whether the global fence suffices on this
    // hardware.
    barrier(CLK_GLOBAL_MEM_FENCE);
}
||||
|
||||
/*
|
||||
* OpenCL 1.2, page 280: |
||||
* "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache. |
||||
* The prefetch instruction is applied to a work-item in a work-group and does not affect the functional behavior of the kernel." |
||||
* |
||||
* -> Since it doesn't affect the functional behavior, the implementation is a no-op |
||||
*/ |
||||
PREFETCH(uchar) |
||||
PREFETCH(char) |
||||
PREFETCH(ushort) |
||||
PREFETCH(short) |
||||
PREFETCH(uint) |
||||
PREFETCH(int) |
||||
PREFETCH(float) |
||||
|
||||
#undef ASYNC_COPY_INTERNAL |
||||
#undef ASYNC_COPY |
||||
#undef ASYNC_STRIDED_SOURCE_COPY_INTERNAL |
||||
#undef ASYNC_STRIDED_DEST_COPY_INTERNAL |
||||
#undef ASYNC_STRIDED_COPY |
||||
#undef PREFETCH |
||||
|
||||
#endif /* VC4CL_ASYNC_H */ |
||||
|
@ -0,0 +1,659 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_ATOMICS_H |
||||
#define VC4CL_ATOMICS_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
// atomic_add: atomically stores *ptr + val and returns the previous value.
// All atomics in this file follow the same scheme: serialize via the single
// hardware mutex, then do a DMA read-modify-write.
INLINE int atomic_add(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __global memory
INLINE unsigned int atomic_add(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}

// signed variant for __local memory
INLINE int atomic_add(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __local memory
INLINE unsigned int atomic_add(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old + val);
    vc4cl_mutex_unlock();
    return old;
}
||||
|
||||
// atom_add: aliases for atomic_add — presumably provided for the legacy
// cl_khr_*_int32_base_atomics naming (TODO confirm which extensions are
// advertised); all four simply forward to the corresponding overload above.
INLINE int atom_add(volatile __global int *ptr, int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE unsigned int atom_add(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE int atom_add(volatile __local int *ptr, int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE unsigned int atom_add(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}
||||
|
||||
// atomic_sub: atomically stores *ptr - val and returns the previous value
// (mutex-serialized DMA read-modify-write, as for atomic_add).
INLINE int atomic_sub(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __global memory
INLINE unsigned int atomic_sub(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}

// signed variant for __local memory
INLINE int atomic_sub(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __local memory
INLINE unsigned int atomic_sub(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, old - val);
    vc4cl_mutex_unlock();
    return old;
}
||||
|
||||
// atom_sub: legacy-named aliases forwarding to the atomic_sub overloads above.
INLINE int atom_sub(volatile __global int *ptr, int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE unsigned int atom_sub(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE int atom_sub(volatile __local int *ptr, int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE unsigned int atom_sub(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}
||||
|
||||
// atomic_xchg: atomically stores val and returns the previous value
// (mutex-serialized DMA read-then-write). Unlike add/sub, a float overload
// exists per the OpenCL atomic_xchg specification.
INLINE int atomic_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __global memory
INLINE unsigned int atomic_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// float variant for __global memory
INLINE float atomic_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    float old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// signed variant for __local memory
INLINE int atomic_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// unsigned variant for __local memory
INLINE unsigned int atomic_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}

// float variant for __local memory
INLINE float atomic_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    float old = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, val);
    vc4cl_mutex_unlock();
    return old;
}
||||
|
||||
// Legacy OpenCL 1.0 "atom_xchg" spellings; each forwards to the matching
// atomic_xchg overload (including the float variants).
INLINE int atom_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE unsigned int atom_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE float atom_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE int atom_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE unsigned int atom_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE float atom_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}
||||
|
||||
INLINE int atomic_inc(volatile __global int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_inc(volatile __global unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_inc(volatile __local int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_inc(volatile __local unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old + 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_inc" spellings; each forwards to the matching
// atomic_inc overload.
INLINE int atom_inc(volatile __global int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE unsigned int atom_inc(volatile __global unsigned int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE int atom_inc(volatile __local int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE unsigned int atom_inc(volatile __local unsigned int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}
||||
|
||||
INLINE int atomic_dec(volatile __global int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_dec(volatile __global unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_dec(volatile __local int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_dec(volatile __local unsigned int * ptr) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old - 1); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_dec" spellings; each forwards to the matching
// atomic_dec overload.
INLINE int atom_dec(volatile __global int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE unsigned int atom_dec(volatile __global unsigned int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE int atom_dec(volatile __local int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE unsigned int atom_dec(volatile __local unsigned int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}
||||
|
||||
INLINE int atomic_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_cmpxchg" spellings; each forwards to the matching
// atomic_cmpxchg overload.
INLINE int atom_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE unsigned int atom_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE int atom_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE unsigned int atom_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}
||||
|
||||
INLINE int atomic_min(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_min(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, min(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_min" spellings; each forwards to the matching
// atomic_min overload.
INLINE int atom_min(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE unsigned int atom_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE int atom_min(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE unsigned int atom_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}
||||
|
||||
INLINE int atomic_max(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_max(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, max(old, val)); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_max" spellings; each forwards to the matching
// atomic_max overload.
INLINE int atom_max(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE unsigned int atom_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE int atom_max(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE unsigned int atom_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}
||||
|
||||
INLINE int atomic_and(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_and(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old & val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_and" spellings; each forwards to the matching
// atomic_and overload.
INLINE int atom_and(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE unsigned int atom_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE int atom_and(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE unsigned int atom_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}
||||
|
||||
INLINE int atomic_or(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_or(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old | val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_or" spellings; each forwards to the matching
// atomic_or overload.
INLINE int atom_or(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE unsigned int atom_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE int atom_or(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE unsigned int atom_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}
||||
|
||||
INLINE int atomic_xor(volatile __global int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE int atomic_xor(volatile __local int * ptr, int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
INLINE unsigned int atomic_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE |
||||
{ |
||||
vc4cl_mutex_lock(); |
||||
unsigned int old = vc4cl_dma_read(ptr); |
||||
vc4cl_dma_write(ptr, old ^ val); |
||||
vc4cl_mutex_unlock(); |
||||
return old; |
||||
} |
||||
|
||||
// Legacy OpenCL 1.0 "atom_xor" spellings; each forwards to the matching
// atomic_xor overload.
INLINE int atom_xor(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE unsigned int atom_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE int atom_xor(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE unsigned int atom_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}
||||
|
||||
#endif /* VC4CL_ATOMICS_H */ |
||||
|
@ -0,0 +1,411 @@
|
||||
/*
|
||||
* OpenCL 2.0 introduces the __generic address space, which is also used by C++ for OpenCL C. |
||||
* |
||||
 * Since we do not actually care about address spaces (so far), we can just map those functions to one of the existing address spaces.
||||
* |
||||
* Base list of affected functions generated with: |
||||
* llvm-dis -o /dev/stdout ../VC4CLStdLib/include/VC4CLStdLib.bc | grep -oE 'spir_func .?* \S*AS1.*?\)' | sort |
||||
* |
||||
* This header contains wrapper for the SPIR-mangled functions to the real implementations |
||||
*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
#ifndef VC4CL_GENERIC_MANGLING |
||||
#define VC4CL_GENERIC_MANGLING |
||||
|
||||
#include "_config.h" |
||||
|
||||
/*
 * __generic (address space 4) entry points for the math built-ins that take an
 * output pointer: modf, fract, frexp, remquo, sincos, lgamma_r, for scalar
 * float and the float2/3/4/8/16 vector widths.
 *
 * Each SPIR-mangled AS4 symbol is declared as a weak alias of the already
 * implemented __global (AS1) variant, so the generic-address-space call simply
 * reuses the AS1 code. NOTE(review): this assumes pointer representation is
 * identical across address spaces on this target — confirm against the
 * backend.
 */
float _Z4modffPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z4modffPU3AS1f")));
float _Z5fractfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z5fractfPU3AS1f")));
float _Z5frexpfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z5frexpfPU3AS1i")));
float _Z6remquoffPU3AS4i(float, float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6remquoffPU3AS1i")));
float _Z6sincosfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6sincosfPU3AS1f")));
float _Z8lgamma_rfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8lgamma_rfPU3AS1i")));
float2 _Z4modfDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z4modfDv2_fPU3AS1S_")));
float2 _Z5fractDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z5fractDv2_fPU3AS1S_")));
float2 _Z5frexpDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z5frexpDv2_fPU3AS1Dv2_i")));
float2 _Z6remquoDv2_fS_PU3AS4Dv2_i(float2, float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z6remquoDv2_fS_PU3AS1Dv2_i")));
float2 _Z6sincosDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z6sincosDv2_fPU3AS1S_")));
float2 _Z8lgamma_rDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z8lgamma_rDv2_fPU3AS1Dv2_i")));
float3 _Z4modfDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z4modfDv3_fPU3AS1S_")));
float3 _Z5fractDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z5fractDv3_fPU3AS1S_")));
float3 _Z5frexpDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z5frexpDv3_fPU3AS1Dv3_i")));
float3 _Z6remquoDv3_fS_PU3AS4Dv3_i(float3, float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z6remquoDv3_fS_PU3AS1Dv3_i")));
float3 _Z6sincosDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z6sincosDv3_fPU3AS1S_")));
float3 _Z8lgamma_rDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z8lgamma_rDv3_fPU3AS1Dv3_i")));
float4 _Z4modfDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z4modfDv4_fPU3AS1S_")));
float4 _Z5fractDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z5fractDv4_fPU3AS1S_")));
float4 _Z5frexpDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z5frexpDv4_fPU3AS1Dv4_i")));
float4 _Z6remquoDv4_fS_PU3AS4Dv4_i(float4, float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z6remquoDv4_fS_PU3AS1Dv4_i")));
float4 _Z6sincosDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z6sincosDv4_fPU3AS1S_")));
float4 _Z8lgamma_rDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z8lgamma_rDv4_fPU3AS1Dv4_i")));
float8 _Z4modfDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z4modfDv8_fPU3AS1S_")));
float8 _Z5fractDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z5fractDv8_fPU3AS1S_")));
float8 _Z5frexpDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z5frexpDv8_fPU3AS1Dv8_i")));
float8 _Z6remquoDv8_fS_PU3AS4Dv8_i(float8, float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z6remquoDv8_fS_PU3AS1Dv8_i")));
float8 _Z6sincosDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z6sincosDv8_fPU3AS1S_")));
float8 _Z8lgamma_rDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z8lgamma_rDv8_fPU3AS1Dv8_i")));
float16 _Z4modfDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z4modfDv16_fPU3AS1S_")));
float16 _Z5fractDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z5fractDv16_fPU3AS1S_")));
float16 _Z5frexpDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z5frexpDv16_fPU3AS1Dv16_i")));
float16 _Z6remquoDv16_fS_PU3AS4Dv16_i(float16, float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z6remquoDv16_fS_PU3AS1Dv16_i")));
float16 _Z6sincosDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z6sincosDv16_fPU3AS1S_")));
float16 _Z8lgamma_rDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z8lgamma_rDv16_fPU3AS1Dv16_i")));
||||
|
||||
/*
 * __generic (address space 4) entry points for vload2/3/4/8/16 over all nine
 * element types (char..float). Each SPIR-mangled AS4 symbol is a weak alias
 * of the existing __global (AS1) implementation.
 */
char2 _Z6vload2jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kc")));
uchar2 _Z6vload2jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kh")));
short2 _Z6vload2jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ks")));
ushort2 _Z6vload2jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kt")));
int2 _Z6vload2jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ki")));
uint2 _Z6vload2jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kj")));
long2 _Z6vload2jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kl")));
ulong2 _Z6vload2jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload2jPU3AS1Km")));
float2 _Z6vload2jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kf")));
char3 _Z6vload3jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kc")));
uchar3 _Z6vload3jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kh")));
short3 _Z6vload3jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ks")));
ushort3 _Z6vload3jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kt")));
int3 _Z6vload3jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ki")));
uint3 _Z6vload3jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kj")));
long3 _Z6vload3jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kl")));
ulong3 _Z6vload3jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload3jPU3AS1Km")));
float3 _Z6vload3jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kf")));
char4 _Z6vload4jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kc")));
uchar4 _Z6vload4jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kh")));
short4 _Z6vload4jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ks")));
ushort4 _Z6vload4jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kt")));
int4 _Z6vload4jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ki")));
uint4 _Z6vload4jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kj")));
long4 _Z6vload4jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kl")));
ulong4 _Z6vload4jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload4jPU3AS1Km")));
float4 _Z6vload4jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kf")));
char8 _Z6vload8jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kc")));
uchar8 _Z6vload8jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kh")));
short8 _Z6vload8jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ks")));
ushort8 _Z6vload8jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kt")));
int8 _Z6vload8jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ki")));
uint8 _Z6vload8jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kj")));
long8 _Z6vload8jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kl")));
ulong8 _Z6vload8jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload8jPU3AS1Km")));
float8 _Z6vload8jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kf")));
char16 _Z7vload16jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kc")));
uchar16 _Z7vload16jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kh")));
short16 _Z7vload16jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ks")));
ushort16 _Z7vload16jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kt")));
int16 _Z7vload16jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ki")));
uint16 _Z7vload16jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kj")));
long16 _Z7vload16jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kl")));
ulong16 _Z7vload16jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vload16jPU3AS1Km")));
float16 _Z7vload16jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kf")));
||||
|
||||
/*
 * __generic (address space 4) entry points for vstore2/3/4/8 over all nine
 * element types, plus the beginning of the vstore16 set (the remaining
 * vstore16 declarations continue below this chunk). Each SPIR-mangled AS4
 * symbol is a weak alias of the existing __global (AS1) implementation.
 */
void _Z7vstore2Dv2_cjPU3AS4c(char2, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore2Dv2_cjPU3AS1c")));
void _Z7vstore2Dv2_hjPU3AS4h(uchar2, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore2Dv2_hjPU3AS1h")));
void _Z7vstore2Dv2_sjPU3AS4s(short2, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore2Dv2_sjPU3AS1s")));
void _Z7vstore2Dv2_tjPU3AS4t(ushort2, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore2Dv2_tjPU3AS1t")));
void _Z7vstore2Dv2_ijPU3AS4i(int2, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore2Dv2_ijPU3AS1i")));
void _Z7vstore2Dv2_jjPU3AS4j(uint2, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore2Dv2_jjPU3AS1j")));
void _Z7vstore2Dv2_ljPU3AS4l(long2, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore2Dv2_ljPU3AS1l")));
void _Z7vstore2Dv2_mjPU3AS4m(ulong2, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore2Dv2_mjPU3AS1m")));
void _Z7vstore2Dv2_fjPU3AS4f(float2, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore2Dv2_fjPU3AS1f")));
void _Z7vstore3Dv3_cjPU3AS4c(char3, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore3Dv3_cjPU3AS1c")));
void _Z7vstore3Dv3_hjPU3AS4h(uchar3, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore3Dv3_hjPU3AS1h")));
void _Z7vstore3Dv3_sjPU3AS4s(short3, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore3Dv3_sjPU3AS1s")));
void _Z7vstore3Dv3_tjPU3AS4t(ushort3, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore3Dv3_tjPU3AS1t")));
void _Z7vstore3Dv3_ijPU3AS4i(int3, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore3Dv3_ijPU3AS1i")));
void _Z7vstore3Dv3_jjPU3AS4j(uint3, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore3Dv3_jjPU3AS1j")));
void _Z7vstore3Dv3_ljPU3AS4l(long3, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore3Dv3_ljPU3AS1l")));
void _Z7vstore3Dv3_mjPU3AS4m(ulong3, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore3Dv3_mjPU3AS1m")));
void _Z7vstore3Dv3_fjPU3AS4f(float3, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore3Dv3_fjPU3AS1f")));
void _Z7vstore4Dv4_cjPU3AS4c(char4, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore4Dv4_cjPU3AS1c")));
void _Z7vstore4Dv4_hjPU3AS4h(uchar4, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore4Dv4_hjPU3AS1h")));
void _Z7vstore4Dv4_sjPU3AS4s(short4, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore4Dv4_sjPU3AS1s")));
void _Z7vstore4Dv4_tjPU3AS4t(ushort4, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore4Dv4_tjPU3AS1t")));
void _Z7vstore4Dv4_ijPU3AS4i(int4, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore4Dv4_ijPU3AS1i")));
void _Z7vstore4Dv4_jjPU3AS4j(uint4, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore4Dv4_jjPU3AS1j")));
void _Z7vstore4Dv4_ljPU3AS4l(long4, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore4Dv4_ljPU3AS1l")));
void _Z7vstore4Dv4_mjPU3AS4m(ulong4, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore4Dv4_mjPU3AS1m")));
void _Z7vstore4Dv4_fjPU3AS4f(float4, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore4Dv4_fjPU3AS1f")));
void _Z7vstore8Dv8_cjPU3AS4c(char8, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore8Dv8_cjPU3AS1c")));
void _Z7vstore8Dv8_hjPU3AS4h(uchar8, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore8Dv8_hjPU3AS1h")));
void _Z7vstore8Dv8_sjPU3AS4s(short8, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore8Dv8_sjPU3AS1s")));
void _Z7vstore8Dv8_tjPU3AS4t(ushort8, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore8Dv8_tjPU3AS1t")));
void _Z7vstore8Dv8_ijPU3AS4i(int8, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore8Dv8_ijPU3AS1i")));
void _Z7vstore8Dv8_jjPU3AS4j(uint8, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore8Dv8_jjPU3AS1j")));
void _Z7vstore8Dv8_ljPU3AS4l(long8, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore8Dv8_ljPU3AS1l")));
void _Z7vstore8Dv8_mjPU3AS4m(ulong8, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore8Dv8_mjPU3AS1m")));
void _Z7vstore8Dv8_fjPU3AS4f(float8, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore8Dv8_fjPU3AS1f")));
void _Z8vstore16Dv16_cjPU3AS4c(char16, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z8vstore16Dv16_cjPU3AS1c")));
||||
void _Z8vstore16Dv16_hjPU3AS4h(uchar16, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z8vstore16Dv16_hjPU3AS1h"))); |
||||
void _Z8vstore16Dv16_sjPU3AS4s(short16, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z8vstore16Dv16_sjPU3AS1s"))); |
||||
void _Z8vstore16Dv16_tjPU3AS4t(ushort16, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z8vstore16Dv16_tjPU3AS1t"))); |
||||
void _Z8vstore16Dv16_ijPU3AS4i(int16, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8vstore16Dv16_ijPU3AS1i"))); |
||||
void _Z8vstore16Dv16_jjPU3AS4j(uint16, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8vstore16Dv16_jjPU3AS1j"))); |
||||
void _Z8vstore16Dv16_ljPU3AS4l(long16, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z8vstore16Dv16_ljPU3AS1l"))); |
||||
void _Z8vstore16Dv16_mjPU3AS4m(ulong16, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z8vstore16Dv16_mjPU3AS1m"))); |
||||
void _Z8vstore16Dv16_fjPU3AS4f(float16, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z8vstore16Dv16_fjPU3AS1f"))); |
||||
|
||||
int _Z10atomic_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vii"))); |
||||
uint _Z10atomic_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vjj"))); |
||||
int _Z8atom_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_andPU3AS1Vii"))); |
||||
uint _Z8atom_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_andPU3AS1Vjj"))); |
||||
int _Z9atomic_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vii"))); |
||||
uint _Z9atomic_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vjj"))); |
||||
int _Z7atom_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z7atom_orPU3AS1Vii"))); |
||||
uint _Z7atom_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z7atom_orPU3AS1Vjj"))); |
||||
int _Z10atomic_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vii"))); |
||||
uint _Z10atomic_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vjj"))); |
||||
int _Z8atom_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vii"))); |
||||
uint _Z8atom_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vjj"))); |
||||
int _Z10atomic_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vi"))); |
||||
uint _Z10atomic_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vj"))); |
||||
int _Z8atom_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vi"))); |
||||
uint _Z8atom_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vj"))); |
||||
int _Z10atomic_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vi"))); |
||||
uint _Z10atomic_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vj"))); |
||||
int _Z8atom_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vi"))); |
||||
uint _Z8atom_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vj"))); |
||||
int _Z10atomic_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vii"))); |
||||
uint _Z10atomic_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vjj"))); |
||||
int _Z8atom_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vii"))); |
||||
uint _Z8atom_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vjj"))); |
||||
int _Z10atomic_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vii"))); |
||||
uint _Z10atomic_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vjj"))); |
||||
int _Z8atom_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_minPU3AS1Vii"))); |
||||
uint _Z8atom_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_minPU3AS1Vjj"))); |
||||
int _Z10atomic_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vii"))); |
||||
/*
 * Weak alias stubs for the address-space-4 overloads of the OpenCL 1.x atomic
 * built-ins (atomic_add/sub/xchg/cmpxchg and their legacy atom_* spellings).
 *
 * Each declaration uses the Itanium-mangled name of the AS4 overload
 * (…PU3AS4V…: pointer to address_space(4), volatile-qualified) and aliases it
 * to the already-present AS1 implementation (…PU3AS1V…), so both symbols
 * resolve to the same code. The "weak" attribute lets a real AS4 definition
 * elsewhere override these stubs at link time.
 *
 * NOTE(review): this assumes address-space-4 pointers can be serviced by the
 * address-space-1 (__global) implementations unchanged — confirm against
 * VC4CL's address-space mapping before relying on it.
 */
/* atomic_add / atom_add: int and uint variants */
uint _Z10atomic_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vjj"))); |
||||
int _Z8atom_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_addPU3AS1Vii"))); |
||||
uint _Z8atom_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_addPU3AS1Vjj"))); |
||||
/* atomic_sub / atom_sub: int and uint variants */
int _Z10atomic_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vii"))); |
||||
uint _Z10atomic_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vjj"))); |
||||
int _Z8atom_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_subPU3AS1Vii"))); |
||||
uint _Z8atom_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_subPU3AS1Vjj"))); |
||||
/* atomic_xchg / atom_xchg: int, uint and (per OpenCL spec) float variants */
int _Z11atomic_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vii"))); |
||||
uint _Z11atomic_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vjj"))); |
||||
float _Z11atomic_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vff"))); |
||||
int _Z9atom_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vii"))); |
||||
uint _Z9atom_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vjj"))); |
||||
float _Z9atom_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vff"))); |
||||
/* atom_cmpxchg / atomic_cmpxchg: three-argument compare-and-swap (ptr, expected, desired) */
int _Z12atom_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Viii"))); |
||||
uint _Z12atom_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Vjjj"))); |
||||
int _Z14atomic_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Viii"))); |
||||
uint _Z14atomic_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Vjjj"))); |
||||
|
||||
/*
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event"))); |
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event"))); |
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event"))); |
||||
|
||||
TODO missing wait_group_events function(s) |
||||
|
||||
void _Z8prefetchPU3AS1Kcj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kcj"))); |
||||
void _Z8prefetchPU3AS1KDv16_cj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_cj"))); |
||||
void _Z8prefetchPU3AS1KDv16_fj(__attribute__((address_space(4))) float16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_fj"))); |
||||
void _Z8prefetchPU3AS1KDv16_hj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_hj"))); |
||||
void _Z8prefetchPU3AS1KDv16_ij(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_ij"))); |
||||
void _Z8prefetchPU3AS1KDv16_jj(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_jj"))); |
||||
void _Z8prefetchPU3AS1KDv16_sj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_sj"))); |
||||
void _Z8prefetchPU3AS1KDv16_tj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_tj"))); |
||||
void _Z8prefetchPU3AS1KDv2_cj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_cj"))); |
||||
void _Z8prefetchPU3AS1KDv2_fj(__attribute__((address_space(4))) float2*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_fj"))); |
||||
void _Z8prefetchPU3AS1KDv2_hj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_hj"))); |
||||
void _Z8prefetchPU3AS1KDv2_ij(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_ij"))); |
||||
void _Z8prefetchPU3AS1KDv2_jj(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_jj"))); |
||||
void _Z8prefetchPU3AS1KDv2_sj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_sj"))); |
||||
void _Z8prefetchPU3AS1KDv2_tj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_tj"))); |
||||
void _Z8prefetchPU3AS1KDv3_cj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_cj"))); |
||||
void _Z8prefetchPU3AS1KDv3_fj(__attribute__((address_space(4))) float3*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_fj"))); |
||||
void _Z8prefetchPU3AS1KDv3_hj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_hj"))); |
||||
void _Z8prefetchPU3AS1KDv3_ij(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_ij"))); |
||||
void _Z8prefetchPU3AS1KDv3_jj(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_jj"))); |
||||
void _Z8prefetchPU3AS1KDv3_sj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_sj"))); |
||||
void _Z8prefetchPU3AS1KDv3_tj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_tj"))); |
||||
void _Z8prefetchPU3AS1KDv4_cj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_cj"))); |
||||
void _Z8prefetchPU3AS1KDv4_fj(__attribute__((address_space(4))) float4*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_fj"))); |
||||
void _Z8prefetchPU3AS1KDv4_hj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_hj"))); |
||||
void _Z8prefetchPU3AS1KDv4_ij(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_ij"))); |
||||
void _Z8prefetchPU3AS1KDv4_jj(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_jj"))); |
||||
void _Z8prefetchPU3AS1KDv4_sj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_sj"))); |
||||
void _Z8prefetchPU3AS1KDv4_tj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_tj"))); |
||||
void _Z8prefetchPU3AS1KDv8_cj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_cj"))); |
||||
void _Z8prefetchPU3AS1KDv8_fj(__attribute__((address_space(4))) float8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_fj"))); |
||||
void _Z8prefetchPU3AS1KDv8_hj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_hj"))); |
||||
void _Z8prefetchPU3AS1KDv8_ij(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_ij"))); |
||||
void _Z8prefetchPU3AS1KDv8_jj(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_jj"))); |
||||
void _Z8prefetchPU3AS1KDv8_sj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_sj"))); |
||||
void _Z8prefetchPU3AS1KDv8_tj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_tj"))); |
||||
void _Z8prefetchPU3AS1Kfj(__attribute__((address_space(4))) float*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kfj"))); |
||||
void _Z8prefetchPU3AS1Khj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Khj"))); |
||||
void _Z8prefetchPU3AS1Kij(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kij"))); |
||||
void _Z8prefetchPU3AS1Kjj(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kjj"))); |
||||
void _Z8prefetchPU3AS1Ksj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ksj"))); |
||||
void _Z8prefetchPU3AS1Ktj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ktj"))); |
||||
*/ |
||||
#endif /* VC4CL_GENERIC_MANGLING */ |
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_COMMON_H |
||||
#define VC4CL_COMMON_H |
||||
|
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
|
||||
/*
|
||||
* Common functions |
||||
* |
||||
* Some functions have no maximum error in the OpenCL specification, see here: https://github.com/KhronosGroup/OpenCL-Docs/issues/33
|
||||
* |
||||
* degrees -> 2 ULP |
||||
* radians -> 2 ULP |
||||
* mix -> "implementation defined" |
||||
* smoothstep -> "implementation defined" |
||||
* clamp, min, max, step, sign -> 0 ULP |
||||
*/ |
||||
|
||||
// clamp: limit x to the range [minval, maxval]; 0 ULP error required by the OpenCL spec (see list above)
SIMPLE_3(float, clamp, float, x, float, minval, float, maxval, fmin(fmax(x, minval), maxval)) |
||||
//TODO version with limits as scalar
|
||||
|
||||
// NOTE: using 0x1.ca5dc2p+5 (= 180/M_PI_F + 1 ULP) is slightly more accurate than using 0x1.ca5dcp+5 (180 / M_PI_F),
|
||||
// but both are accurate enough for 2 ULP maximum error
|
||||
// degrees(r) = (180 / pi) * r, using the precomputed hex-float constant discussed above (2 ULP allowed)
SIMPLE_1(float, degrees, float, radians, 0x1.ca5dc2p+5 * radians) |
||||
|
||||
// Results are undefined for one of the inputs NaN or Inf,
|
||||
// so we can directly call the intrinsic and don't need to handle these inputs explicitly
|
||||
// max: component-wise maximum via the intrinsic; results undefined for NaN/Inf inputs (see note above)
SIMPLE_2(float, max, float, x, float, y, vc4cl_fmax(x, y)) |
||||
// max with a scalar y broadcast across all vector components
SIMPLE_2_SCALAR(float, max, float, x, float, y, vc4cl_fmax(x, y)) |
||||
|
||||
// min: component-wise minimum via the intrinsic; results undefined for NaN/Inf inputs (see note above)
SIMPLE_2(float, min, float, x, float, y, vc4cl_fmin(x, y)) |
||||
// min with a scalar y broadcast across all vector components
SIMPLE_2_SCALAR(float, min, float, x, float, y, vc4cl_fmin(x, y)) |
||||
|
||||
//" Returns the linear blend of x and y implemented as:
|
||||
// x + (y - x) * a
|
||||
// a must be a value in the range 0.0 ... 1.0. If a is not in the range 0.0 ... 1.0, the return values are undefined. "
|
||||
|
||||
// mix: linear blend x + (y - x) * a; undefined if a lies outside [0.0, 1.0] (spec quote above)
SIMPLE_3(float, mix, float, x, float, y, float, a, x + (y - x) * a) |
||||
// mix with a scalar blend factor a broadcast across all vector components
SIMPLE_3_SCALAR(float, mix, float, x, float, y, float, a, x + (y - x) * a) |
||||
|
||||
// radians(d) = (pi / 180) * d; 2 ULP maximum error allowed (see list above)
SIMPLE_1(float, radians, float, degrees, (M_PI_F / 180) * degrees) |
||||
|
||||
// step: 0.0f where val < edge, 1.0f otherwise (component-wise)
SIMPLE_2(float, step, float, edge, float, val, val < edge ? 0.0f : 1.0f) |
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float2 step(float edge, float2 val) OVERLOADABLE
{
	const float2 edges = (float2) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float3 step(float edge, float3 val) OVERLOADABLE
{
	const float3 edges = (float3) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float4 step(float edge, float4 val) OVERLOADABLE
{
	const float4 edges = (float4) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float8 step(float edge, float8 val) OVERLOADABLE
{
	const float8 edges = (float8) edge;
	return step(edges, val);
}
||||
// Scalar-edge convenience overload: broadcast the edge and defer to the vector version.
INLINE float16 step(float edge, float16 val) OVERLOADABLE
{
	const float16 edges = (float16) edge;
	return step(edges, val);
}
||||
|
||||
COMPLEX_3(float, smoothstep, float, edge0, float, edge1, float, val, |
||||
{ |
||||
result_t tmp = clamp((result_t) (val - edge0) / (edge1 - edge0), (result_t)0.0f, (result_t)1.0f); |
||||
return tmp * tmp * (3 - 2 * tmp); |
||||
}) |
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float2 smoothstep(float edge0, float edge1, float2 val) OVERLOADABLE
{
	const float2 lower = (float2) edge0;
	const float2 upper = (float2) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float3 smoothstep(float edge0, float edge1, float3 val) OVERLOADABLE
{
	const float3 lower = (float3) edge0;
	const float3 upper = (float3) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float4 smoothstep(float edge0, float edge1, float4 val) OVERLOADABLE
{
	const float4 lower = (float4) edge0;
	const float4 upper = (float4) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float8 smoothstep(float edge0, float edge1, float8 val) OVERLOADABLE
{
	const float8 lower = (float8) edge0;
	const float8 upper = (float8) edge1;
	return smoothstep(lower, upper, val);
}
||||
// Scalar-edge convenience overload: broadcast both edges and defer to the vector version.
INLINE float16 smoothstep(float edge0, float edge1, float16 val) OVERLOADABLE
{
	const float16 lower = (float16) edge0;
	const float16 upper = (float16) edge1;
	return smoothstep(lower, upper, val);
}
||||
|
||||
// sign: -1/0/+1 by comparison; NaN fails both comparisons and thus yields 0.0f, matching the spec.
// NOTE(review): sign(-0.0f) also falls through to +0.0f here, but the OpenCL spec requires -0.0f for
// a -0.0f input - confirm whether this deviation is intentional.
SIMPLE_1(float, sign, float, val, val > 0.0f ? 1.0f : val < 0.0f ? -1.0f : 0.0f) |
||||
|
||||
#endif /* VC4CL_COMMON_H */ |
||||
|
@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_CONFIG_H |
||||
#define VC4CL_CONFIG_H |
||||
|
||||
#include "defines.h" |
||||
|
||||
#include "opencl-c.h" |
||||
|
||||
#ifndef NULL |
||||
#define NULL ((void *)0) |
||||
#endif |
||||
|
||||
/*
|
||||
* Math constants |
||||
*/ |
||||
/* NOTE(review): 3.01029995663981195214 equals 10*log10(2), not log_2(10) (~3.3219) - confirm the intended value and name */
#define M_LOG210 3.01029995663981195214f /* log_2(10) */ |
||||
#undef NAN |
||||
#define NAN 0x7fffffffU /* same as defined in OpenCL C, but as integer */ |
||||
#undef INF |
||||
#define INF 0x7f800000U |
||||
|
||||
#define ALL_BITS_SET 0xFFFFFFFFU |
||||
|
||||
#endif /* VC4CL_CONFIG_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,173 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_EXTENSIONS_H |
||||
#define VC4CL_EXTENSIONS_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
|
||||
/*
|
||||
* Loop unroll pragma extension |
||||
* |
||||
* Defines "#pragma unroll <factor>" |
||||
* |
||||
* Clang supports this natively, so we do not need to do anything |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/nv/cl_nv_pragma_unroll.txt
|
||||
* See https://clang.llvm.org/docs/AttributeReference.html#pragma-unroll-pragma-nounroll
|
||||
*/ |
||||
#ifndef cl_nv_pragma_unroll |
||||
#define cl_nv_pragma_unroll 1 |
||||
#endif |
||||
|
||||
/*
|
||||
* ARM core-ID extension |
||||
* |
||||
* Adds function |
||||
* uint arm_get_core_id( void ) |
||||
* which returns the ID of the OpenCL Computation Unit, which is always zero |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_get_core_id.txt
|
||||
*/ |
||||
#ifndef cl_arm_core_id |
||||
#define cl_arm_core_id 1 |
||||
#endif |
||||
uint arm_get_core_id(void); //prototype, prevents warning
|
||||
// cl_arm_core_id: this implementation exposes a single OpenCL Computation Unit,
// so the core ID is constantly zero (see the extension comment above).
uint arm_get_core_id(void)
{
	return 0;
}
||||
|
||||
/*
|
||||
* 32-bit atomic counters |
||||
* |
||||
* Adds type |
||||
* counter32_t |
||||
* which is a 32-bit type for atomic counters. counter32_t can only be passed as kernel parameter and cannot be read/assigned. |
||||
* |
||||
* Adds functions |
||||
* uint atomic_inc(counter32_t counter) |
||||
* uint atomic_dec(counter32_t counter) |
||||
* increments/decrements the given counter32_t value atomically. |
||||
* |
||||
* NOTE: Since the syntax/semantics is exactly the same as for the uint version of the standard atomic_inc/atomic_dec functions, counter32_t is used as typedef to an uint pointer. |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/ext/cl_ext_atomic_counters_32.txt
|
||||
*/ |
||||
#ifndef cl_ext_atomic_counters_32
#define cl_ext_atomic_counters_32 1
#endif
// Since the syntax/semantics match the uint overloads of the standard
// atomic_inc/atomic_dec functions, counter32_t is a plain pointer typedef
// to a volatile __global uint (see extension comment above).
typedef volatile __global uint* counter32_t;
// Just the prototypes - the implementations reside in _atomics.h.
uint atomic_inc(counter32_t counter) OVERLOADABLE;
uint atomic_dec(counter32_t counter) OVERLOADABLE;
||||
|
||||
/*
|
||||
* Integer dot products |
||||
* |
||||
* Adds functions |
||||
* int arm_dot(char4 a, char4 b) |
||||
* uint arm_dot(uchar4 a, uchar4 b) |
||||
* int arm_dot_acc(char4 a, char4 b, int acc) |
||||
* uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) |
||||
* int arm_dot_acc(short2 a, short2 b, int acc) |
||||
* uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) |
||||
* int arm_dot_acc_sat(char4 a, char4 b, int acc) |
||||
* uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) |
||||
* calculate integer dot product (and additionally adds the scalar value). |
||||
* For the functions xxx_sat, the final addition is saturating. |
||||
* |
||||
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_integer_dot_product.txt
|
||||
*/ |
||||
#ifndef cl_arm_integer_dot_product_int8 |
||||
#define cl_arm_integer_dot_product_int8 1 |
||||
#endif |
||||
#ifndef cl_arm_integer_dot_product_accumulate_int8 |
||||
#define cl_arm_integer_dot_product_accumulate_int8 1 |
||||
#endif |
||||
#ifndef cl_arm_integer_dot_product_accumulate_int16 |
||||
#define cl_arm_integer_dot_product_accumulate_int16 1 |
||||
#endif |
||||
#ifndef cl_arm_integer_dot_product_accumulate_saturate_int8 |
||||
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1 |
||||
#endif |
||||
|
||||
// prototypes to prevent warnings
|
||||
int arm_dot(char4 a, char4 b) OVERLOADABLE; |
||||
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE; |
||||
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE; |
||||
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE; |
||||
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE; |
||||
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE; |
||||
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE; |
||||
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE; |
||||
|
||||
/**
|
||||
* (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) |
||||
*/ |
||||
int arm_dot(char4 a, char4 b) OVERLOADABLE CONST |
||||
{ |
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3; |
||||
} |
||||
/**
 * Unsigned 8-bit integer dot product:
 * (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w)
 */
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return products.x + products.y + products.z + products.w;
}
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ] |
||||
*/ |
||||
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE CONST |
||||
{ |
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return acc + tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3; |
||||
} |
||||
|
||||
/**
 * Unsigned 8-bit dot product with accumulator:
 * acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
 */
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return acc + products.x + products.y + products.z + products.w;
}
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) ] |
||||
*/ |
||||
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE CONST |
||||
{ |
||||
int2 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return acc + tmp.s0 + tmp.s1; |
||||
} |
||||
|
||||
/**
 * Unsigned 16-bit dot product with accumulator:
 * acc + [ (a.x * b.x) + (a.y * b.y) ]
 */
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE CONST
{
    const uint2 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return acc + products.x + products.y;
}
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ] |
||||
* |
||||
* The final accumulation is saturating. |
||||
*/ |
||||
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE CONST |
||||
{ |
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED); |
||||
return add_sat(acc, tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3); |
||||
} |
||||
|
||||
/**
 * Unsigned 8-bit dot product with saturating accumulation:
 * acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
 *
 * Only the final addition of the accumulator saturates.
 */
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    const uint sum = products.x + products.y + products.z + products.w;
    return add_sat(acc, sum);
}
||||
|
||||
#endif /* VC4CL_EXTENSIONS_H */ |
||||
|
@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
/*
|
||||
* Implements a float-float floating point type providing improved accuracy over float32. |
||||
* |
||||
* Algorithms and ideas taken from: |
||||
* - Guillaume da Gracca, David Defour. Implementation of float-float operators on graphics hardware. Real Numbers and |
||||
* Computers 7, Jul 2006, Nancy, France. pp.23-32. hal-00021443 |
||||
* https://hal.archives-ouvertes.fr/hal-00021443 (https://hal.archives-ouvertes.fr/hal-00021443/document)
|
||||
* - https://andrewthall.org/papers/df64_qf128.pdf
|
||||
*/ |
||||
#ifndef VC4CL_FLOAT_FLOAT_H |
||||
#define VC4CL_FLOAT_FLOAT_H |
||||
|
||||
#include "_intrinsics.h" |
||||
|
||||
/**
 * Type for extended precision floating point values.
 *
 * By combining two 32-bit floats, greatly increases accuracy. Value range is not increased!
 *
 * The "real" value calculates as UPPER + LOWER part.
 *
 * Packing: the UPPER float's bits occupy the low 32 bits of the ulong, the
 * LOWER float's bits the high 32 bits (see vc4cl_combine below).
 *
 * Using a native 64-bit type implicitly provides vector versions (and proper handling by compiler)
 */
typedef ulong FloatFloat;
typedef ulong2 FloatFloat2;
typedef ulong3 FloatFloat3;
typedef ulong4 FloatFloat4;
typedef ulong8 FloatFloat8;
typedef ulong16 FloatFloat16;

// Extracts the high-magnitude component (stored in the low 32 bits).
SIMPLE_1(float, vc4cl_upper, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val)))
// Extracts the low-magnitude correction component (stored in the high 32 bits).
SIMPLE_1(float, vc4cl_lower, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val >> 32)))
// Collapses the pair back to a single float, losing the extra precision.
SIMPLE_1(float, vc4cl_lossy, FloatFloat, val, vc4cl_upper(val) + vc4cl_lower(val))
||||
|
||||
/**
 * Packs two floats into one FloatFloat: upper into the low 32 bits, lower
 * into the high 32 bits of the underlying ulong.
 */
COMPLEX_2(FloatFloat, vc4cl_combine, float, upper, float, lower, {
    result_t upper_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(upper));
    result_t lower_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(lower));
    return upper_extended | (lower_extended << 32);
})

// faster version of vc4cl_combine(val, 0): the lower component's bits are
// simply left zero (bit pattern of +0.0f)
SIMPLE_1(FloatFloat, vc4cl_extend, float, val, vc4cl_int_to_ulong(vc4cl_bitcast_uint(val)))

// Splits a float into high/low parts so high * high products are exact.
// TODO avoid using this, since it runs against Inf, due to calculating val * 2^15
COMPLEX_1(FloatFloat, vc4cl_split, float, val, {
    // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
    const float split = (float) (1u << 15); // TODO can be modified for precision
    arg_t c = (split + 1) * val;
    // high gets the upper mantissa bits, low the (exact) remainder
    arg_t high = c - (c - val);
    arg_t low = val - high;
    return vc4cl_combine(high, low);
})
||||
|
||||
// COMPLEX_1(FloatFloat, vc4cl_split, double, val, {
|
||||
// // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
|
||||
// const double split = (double) (1u << 29); // TODO can be modified for precision
|
||||
// arg_t c = (split + 1) * val;
|
||||
// arg_t high = c - (c - val);
|
||||
// arg_t low = val - high;
|
||||
// return vc4cl_combine(high, low);
|
||||
// })
|
||||
|
||||
/**
 * Exact sum of two floats: s is the rounded sum, e the rounding error,
 * so that s + e == a + b exactly (two-sum; statement order is significant,
 * the compiler must not re-associate these operations).
 */
COMPLEX_2(FloatFloat, vc4cl_add, float, a, float, b, {
    float_t s = a + b;
    float_t v = s - a;
    float_t e = (a - (s - v)) + (b - v);
    return vc4cl_combine(s, e);
})

/**
 * Float-float addition (add22 from the da Graca/Defour paper referenced in
 * the file header). Both correction orderings are computed and the one
 * matching the larger-magnitude upper component is selected branch-free.
 */
COMPLEX_2(FloatFloat, vc4cl_add, FloatFloat, a, FloatFloat, b, {
    float_t r = vc4cl_upper(a) + vc4cl_upper(b);
    float_t s0 = (((vc4cl_upper(a) - r) + vc4cl_upper(b)) + vc4cl_lower(b)) + vc4cl_lower(a);
    float_t s1 = (((vc4cl_upper(b) - r) + vc4cl_upper(a)) + vc4cl_lower(a)) + vc4cl_lower(b);
    float_t s = fabs(vc4cl_upper(a)) >= fabs(vc4cl_upper(b)) ? s0 : s1;
    return vc4cl_add(r, s);
})

// Subtraction as addition of the component-wise negated value.
SIMPLE_2(FloatFloat, vc4cl_sub, FloatFloat, a, FloatFloat, b, vc4cl_add(a, vc4cl_combine(-vc4cl_upper(b), -vc4cl_lower(b))))
||||
|
||||
/**
 * Exact product of two floats: x is the rounded product, y the rounding
 * error recovered via Dekker-style splitting (the subtraction order of the
 * partial products is significant and must be preserved).
 */
COMPLEX_2(FloatFloat, vc4cl_mul, float, a, float, b, {
    float_t x = a * b;
    result_t a_split = vc4cl_split(a);
    result_t b_split = vc4cl_split(b);
    float_t error1 = x - (vc4cl_upper(a_split) * vc4cl_upper(b_split));
    float_t error2 = error1 - (vc4cl_lower(a_split) * vc4cl_upper(b_split));
    float_t error3 = error2 - (vc4cl_upper(a_split) * vc4cl_lower(b_split));
    float_t y = vc4cl_lower(a_split) * vc4cl_lower(b_split) - error3;
    return vc4cl_combine(x, y);
})

/**
 * Float-float multiplication: exact product of the upper parts plus the
 * cross terms folded into the correction component.
 */
COMPLEX_2(FloatFloat, vc4cl_mul, FloatFloat, a, FloatFloat, b, {
    result_t t = vc4cl_mul(vc4cl_upper(a), vc4cl_upper(b));
    float_t t1 = vc4cl_upper(a) * vc4cl_lower(b) + vc4cl_lower(a) * vc4cl_upper(b) + vc4cl_lower(t);
    return vc4cl_add(vc4cl_upper(t), t1);
})

/**
 * Float-float division: initial single-precision quotient estimate yn,
 * refined by one Newton-style correction step using the residual a - b*yn.
 */
COMPLEX_2(FloatFloat, vc4cl_div, FloatFloat, a, FloatFloat, b, {
    float_t xn = 1.0f / vc4cl_upper(b);
    float_t yn = vc4cl_upper(a) * xn;
    result_t y = vc4cl_extend(yn);
    float_t diff = vc4cl_upper(vc4cl_sub(a, vc4cl_mul(b, y)));
    result_t prod = vc4cl_mul(xn, diff);
    return vc4cl_add(y, prod);
})
||||
|
||||
/**
 * Float-float square root: initial estimate yn = a * rsqrt(a), refined by
 * one Newton-style correction step, y + (a - y^2) * xn / 2.
 */
COMPLEX_1(FloatFloat, vc4cl_sqrt, FloatFloat, a, {
    float_t xn = rsqrt(vc4cl_upper(a));
    float_t yn = vc4cl_upper(a) * xn;
    result_t y = vc4cl_extend(yn);
    result_t ynsqr = vc4cl_mul(y, y); // yn^2
    float_t diff = vc4cl_upper(vc4cl_sub(a, ynsqr));
    // FIX: halve the float residual BEFORE the exact multiplication. The
    // previous code divided the result of vc4cl_mul(xn, diff) - a packed
    // FloatFloat (ulong) - by 2, which integer-shifts the packed bit
    // pattern and corrupts both float components. Dividing the float
    // 'diff' by 2 is an exact exponent decrement (except for denormals)
    // and yields the intended halved correction term.
    result_t prod = vc4cl_mul(xn, diff / 2);
    return vc4cl_add(y, prod);
})
||||
|
||||
#endif /* VC4CL_FLOAT_FLOAT_H */ |
@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_GEOMETRY_H |
||||
#define VC4CL_GEOMETRY_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
|
||||
/* a0 b0 a2 * b3 - a3 * b2
|
||||
* a x b = a1 x b1 = a3 * b1 - a1 * b3 |
||||
* a2 b2 a1 * b2 - a2 * b1 |
||||
*/ |
||||
INLINE float3 cross(float3 p0, float3 p1) OVERLOADABLE CONST |
||||
{ |
||||
return (float3) (p0.y * p1.z - p0.z * p1.y, p0.z * p1.x - p0.x * p1.z, p0.x * p1.y - p0.y * p1.x); |
||||
} |
||||
|
||||
/* 4-component variant: cross product of the xyz parts, w is set to 0. */
INLINE float4 cross(float4 p0, float4 p1) OVERLOADABLE CONST
{
    const float x = p0.y * p1.z - p0.z * p1.y;
    const float y = p0.z * p1.x - p0.x * p1.z;
    const float z = p0.x * p1.y - p0.y * p1.x;
    return (float4) (x, y, z, 0.0f);
}
||||
|
||||
/* a0 b0
|
||||
* a * b = a1 * b1 = a1 * b1 + a2 * b2 + a3 * b3 |
||||
* a2 b2 |
||||
*/ |
||||
INLINE float dot(float p0, float p1) OVERLOADABLE CONST |
||||
{ |
||||
return p0 * p1; |
||||
} |
||||
|
||||
/* 2-component dot product: p0.x * p1.x + p0.y * p1.y */
INLINE float dot(float2 p0, float2 p1) OVERLOADABLE CONST
{
    const float2 products = p0 * p1;
    return products.x + products.y;
}
||||
|
||||
/* 3-component dot product: p0.x * p1.x + p0.y * p1.y + p0.z * p1.z */
INLINE float dot(float3 p0, float3 p1) OVERLOADABLE CONST
{
    const float3 products = p0 * p1;
    return products.x + products.y + products.z;
}
||||
|
||||
/* 4-component dot product: sum over all four element-wise products */
INLINE float dot(float4 p0, float4 p1) OVERLOADABLE CONST
{
    const float4 products = p0 * p1;
    return products.x + products.y + products.z + products.w;
}
||||
|
||||
// Prototypes for the wide-vector overloads (implemented elsewhere).
float dot(float8 p0, float8 p1) OVERLOADABLE CONST;
float dot(float16 p0, float16 p1) OVERLOADABLE CONST;

/**
 * length(p) = sqrt(dot(p, p)), with pre-/post-scaling to keep the squared
 * intermediate representable in float.
 */
COMPLEX_1_RETURN_SCALAR(float, length, float, p, {
    float tmp = dot(p, p);

    // To mitigate overflow errors for edge-cases, reduce large/increase small numbers, this is taken from LLVM libclc
    // E.g. since dot(x, x) calculates element-wise x^2, every exponent >= 64 goes to Infinity and every exponent <= -64 to zero!
    float inputFactor = 1.0f;
    float outputFactor = 1.0f;
    // select the scaling factors branch-free, based on the trial dot product
    outputFactor = tmp == INFINITY ? 0x1.0p+65f : outputFactor;
    inputFactor = tmp == INFINITY ? 0x1.0p-65f : inputFactor;
    outputFactor = vc4cl_is_zero(tmp) ? 0x1.0p-86f : outputFactor;
    inputFactor = vc4cl_is_zero(tmp) ? 0x1.0p+86f : inputFactor;

    return sqrt(dot(p * inputFactor, p * inputFactor)) * outputFactor;
})
||||
|
||||
//"Returns the distance between p0 and p1.
// This is calculated as length(p0 - p1).
SIMPLE_2_RETURN_SCALAR(float, distance, float, p0, float, p1, length(p0 - p1))

/**
 * normalize(p) = p / length(p)
 *
 * Expected behavior:
 *
 * normalize(v) = v for all elements in v = 0
 * normalize(v) = vector of NaNs for all elements in v = NaN
 * TODO special case for Inf elements
 */
SIMPLE_1(float, normalize, float, p, p / length(p))


// Faster, reduced-accuracy variants built on the half_* built-ins.
// Note: unlike length(), fast_length() performs no overflow mitigation.
SIMPLE_1_RETURN_SCALAR(float, fast_length, float, p, half_sqrt(dot(p, p)))

SIMPLE_2_RETURN_SCALAR(float, fast_distance, float, p0, float, p1, fast_length(p0 - p1))

SIMPLE_1(float, fast_normalize, float, p, p * half_rsqrt(dot(p, p)))
||||
|
||||
#endif /* VC4CL_GEOMETRY_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,233 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_INTEGER_H |
||||
#define VC4CL_INTEGER_H |
||||
|
||||
#include "_config.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
// Expands a 2-argument built-in over all 8/16/32-bit integer types
// (64-bit overloads are declared separately where supported).
// NOTE: no comments inside the macro bodies - line-continuation backslashes
// would splice a trailing comment into the next line.
#define SIMPLE_INTEGER_2(func, argName0, argName1, content) \
SIMPLE_2(uchar, func, uchar, argName0, uchar, argName1, content) \
SIMPLE_2(char, func, char, argName0, char, argName1, content) \
SIMPLE_2(ushort, func, ushort, argName0, ushort, argName1, content) \
SIMPLE_2(short, func, short, argName0, short, argName1, content) \
SIMPLE_2(uint, func, uint, argName0, uint, argName1, content) \
SIMPLE_2(int, func, int, argName0, int, argName1, content) \

// Same as SIMPLE_INTEGER_2, for 3-argument built-ins.
#define SIMPLE_INTEGER_3(func, argName0, argName1, argName2, content) \
SIMPLE_3(uchar, func, uchar, argName0, uchar, argName1, uchar, argName2, content) \
SIMPLE_3(char, func, char, argName0, char, argName1, char, argName2, content) \
SIMPLE_3(ushort, func, ushort, argName0, ushort, argName1, ushort, argName2, content) \
SIMPLE_3(short, func, short, argName0, short, argName1, short, argName2, content) \
SIMPLE_3(uint, func, uint, argName0, uint, argName1, uint, argName2, content) \
SIMPLE_3(int, func, int, argName0, int, argName1, int, argName2, content) \

||||
|
||||
// abs: |x| as the corresponding unsigned type. Signed variants compute
// max(x, -x) in a wider type; unsigned variants are the identity.
SIMPLE_1(uchar, abs, char, val, vc4cl_bitcast_uchar(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(uchar, abs, uchar, val, val)
SIMPLE_1(ushort, abs, short, val, vc4cl_bitcast_ushort(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(ushort, abs, ushort, val, val)
SIMPLE_1(uint, abs, int, val, vc4cl_bitcast_uint(max(val, -val)))
SIMPLE_1(uint, abs, uint, val, val)
SIMPLE_1(ulong, abs, long, val, vc4cl_bitcast_ulong(max(val, -val)))
SIMPLE_1(ulong, abs, ulong, val, val)
||||
|
||||
// abs_diff: |x - y| without modulo overflow in the intermediate difference.
// based on pocl (pocl/lib/kernel/abs_diff.cl)
SIMPLE_2(uchar, abs_diff, uchar, x, uchar, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(uchar, abs_diff, char, x, char, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = (result_t)abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ushort, abs_diff, ushort, x, ushort, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(ushort, abs_diff, short, x, short, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = (result_t)abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(uint, abs_diff, uint, x, uint, y, abs(x > y ? x - y : y - x))
COMPLEX_2(uint, abs_diff, int, x, int, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ulong, abs_diff, ulong, x, ulong, y, abs(x > y ? x - y : y - x))
COMPLEX_2(ulong, abs_diff, long, x, long, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
||||
|
||||
// add_sat: saturating addition. uchar uses the hardware v8adds instruction,
// 8/16-bit signed/unsigned go through a wider type and clamp.
SIMPLE_2(uchar, add_sat, uchar, x, uchar, y, vc4cl_v8adds(x, y))
SIMPLE_2(char, add_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) + vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, add_sat, ushort, x, ushort, y, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) + vc4cl_extend(y), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_2(short, add_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) + vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
// based on pocl (pocl/lib/kernel/add_sat.cl)
SIMPLE_2(uint, add_sat, uint, x, uint, y, x > ((result_t)UINT_MAX) - y ? UINT_MAX : x + y)
SIMPLE_2(int, add_sat, int, x, int, y, vc4cl_saturated_add(x, y))

//"Returns (x + y) >> 1. The intermediate sum does not modulo overflow."
SIMPLE_2(uchar, hadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(char, hadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
SIMPLE_2(ushort, hadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(short, hadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
// 32/64 bit: halve both operands first, then add back the carry bit that
// is set when both operands are odd. based on pocl (pocl/lib/kernel/hadd.cl)
SIMPLE_2(uint, hadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(int, hadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(ulong, hadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(long, hadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))

//"Returns (x + y + 1) >> 1. The intermediate sum does not modulo overflow."
SIMPLE_2(uchar, rhadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(char, rhadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
SIMPLE_2(ushort, rhadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(short, rhadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
// rounding-up variant: carry is set when at least one operand is odd.
// based on pocl (pocl/lib/kernel/rhadd.cl)
SIMPLE_2(uint, rhadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(int, rhadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(ulong, rhadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(long, rhadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
||||
|
||||
// clamp(val, lo, hi) = min(max(val, lo), hi) for all integer types;
// the *_TWO_SCALAR variants additionally accept scalar bounds with a
// vector value.
SIMPLE_INTEGER_3(clamp, val, minval, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uchar, clamp, uchar, val, uchar, minval, uchar, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(char, clamp, char, val, char, minval, char, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ushort, clamp, ushort, val, ushort, minval, ushort, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(short, clamp, short, val, short, minval, short, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uint, clamp, uint, val, uint, minval, uint, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(int, clamp, int, val, int, minval, int, maxval, min(max(val, minval), maxval))
SIMPLE_3(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
||||
|
||||
// clz: count leading zeroes. The hardware instruction always operates on
// 32 bits, so 8/16-bit values are shifted to the top and the vacated low
// bits are filled with ones, limiting the count to the type's width
// (e.g. clz((uchar)0) = 8).
SIMPLE_1(uchar, clz, uchar, x, vc4cl_bitcast_uchar(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(char, clz, char, x, vc4cl_bitcast_char(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(ushort, clz, ushort, x, vc4cl_bitcast_ushort(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(short, clz, short, x, vc4cl_bitcast_short(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(uint, clz, uint, x, vc4cl_bitcast_uint(vc4cl_clz(x)))
SIMPLE_1(int, clz, int, x, vc4cl_bitcast_int(vc4cl_clz(x)))
||||
|
||||
// mad_hi(x, y, z) = mul_hi(x, y) + z
SIMPLE_INTEGER_3(mad_hi, x, y, z, mul_hi(x, y) + z)

// mad_sat: x * y + z with the result saturated to the type's range.
// 8/16-bit versions compute in 32 bit and clamp; 32-bit versions compute
// the full 64-bit product before saturating back to 32 bit.
SIMPLE_3(uchar, mad_sat, uchar, x, uchar, y, uchar, z, vc4cl_bitcast_uchar(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) UCHAR_MAX)))
SIMPLE_3(char, mad_sat, char, x, char, y, char, z, vc4cl_bitcast_char(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) CHAR_MIN, (int) CHAR_MAX)))
SIMPLE_3(ushort, mad_sat, ushort, x, ushort, y, ushort, z, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_3(short, mad_sat, short, x, short, y, short, z, vc4cl_bitcast_short(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) SHRT_MIN, (int) SHRT_MAX)))
SIMPLE_3(uint, mad_sat, uint, x, uint, y, uint, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_UNSIGNED) + vc4cl_int_to_ulong(z), VC4CL_UNSIGNED))
SIMPLE_3(int, mad_sat, int, x, int, y, int, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_SIGNED) + vc4cl_int_to_long(z), VC4CL_SIGNED))
||||
|
||||
// max: element-wise maximum. uchar maps to the hardware v8max instruction,
// other narrow types are widened to 32 bit first; ulong compares the upper
// and lower halves separately since there is no native 64-bit comparison.
SIMPLE_2(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2_SCALAR(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2_SCALAR(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
COMPLEX_2(ulong, max, ulong, x, ulong, y,
{
    uint_t upX = vc4cl_long_to_int(x >> 32);
    uint_t upY = vc4cl_long_to_int(y >> 32);
    uint_t lowX = vc4cl_long_to_int(x);
    uint_t lowY = vc4cl_long_to_int(y);

    /* can't directly use this condition in return value, since for ?: operator, the condition and return value needs to have the same type */
    int_t selection = upX > upY ? 0 : (upX < upY ? 1 : (lowX > lowY ? 0 : 1));
    return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, max, ulong, x, ulong, y, max(x, (arg0_t) y))
SIMPLE_2(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
||||
|
||||
// min: element-wise minimum, mirroring the max implementations above.
SIMPLE_2(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2_SCALAR(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2_SCALAR(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
// 64-bit unsigned minimum via separate comparison of upper/lower halves.
COMPLEX_2(ulong, min, ulong, x, ulong, y,
{
    uint_t upX = vc4cl_long_to_int(x >> 32);
    uint_t upY = vc4cl_long_to_int(y >> 32);
    uint_t lowX = vc4cl_long_to_int(x);
    uint_t lowY = vc4cl_long_to_int(y);

    /* can't directly use this condition in return value, since for ?: operator, the condition and return value needs to have the same type */
    int_t selection = upX < upY ? 0 : (upX > upY ? 1 : (lowX < lowY ? 0 : 1));
    return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, min, ulong, x, ulong, y, min(x, (arg0_t) y))
SIMPLE_2(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
||||
|
||||
// mul_hi: high half of the full-width product. 8/16-bit versions multiply
// in 32 bit and shift the upper half back down.
SIMPLE_2(uchar, mul_hi, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 8))
SIMPLE_2(char, mul_hi, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_mul24(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED), 8)))
SIMPLE_2(ushort, mul_hi, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 16))
SIMPLE_2(short, mul_hi, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_sign_extend(x) * vc4cl_sign_extend(y), 16)))
SIMPLE_2(uint, mul_hi, uint, x, uint, y, vc4cl_mul_hi(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul_hi, int, x, int, y, vc4cl_mul_hi(x, y, VC4CL_SIGNED))

// Since the rotation is over all 32-bits, for smaller types we need to replicate the value, rotate it and truncate/sign extend the result afterwards.
// The rotation offset is negated because vc4cl_ror rotates right while rotate() rotates left.
SIMPLE_2(uchar, rotate, uchar, x, uchar, y, vc4cl_pack_lsb(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(char, rotate, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_extend(y)), 24)))
SIMPLE_2(ushort, rotate, ushort, x, ushort, y, vc4cl_pack_truncate(vc4cl_ror(vc4cl_zero_extend(x) | (vc4cl_zero_extend(x) << 16), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(short, rotate, short, x, short, y, vc4cl_bitcast_short(vc4cl_extend(vc4cl_bitcast_short(vc4cl_ror((vc4cl_sign_extend(x) & (int) 0xFFFF) | (vc4cl_sign_extend(x) << 16), -vc4cl_sign_extend(y))))))
SIMPLE_2(uint, rotate, uint, x, uint, y, vc4cl_bitcast_uint(vc4cl_ror(x, -vc4cl_bitcast_int(y))))
SIMPLE_2(int, rotate, int, x, int, y, vc4cl_bitcast_int(vc4cl_ror(x, -y)))

// sub_sat: saturating subtraction. uchar maps to the hardware v8subs
// instruction; unsigned variants clamp at zero.
SIMPLE_2(uchar, sub_sat, uchar, x, uchar, y, vc4cl_v8subs(x, y))
SIMPLE_2(char, sub_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) - vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, sub_sat, ushort, x, ushort, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(short, sub_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) - vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
// based on pocl (pocl/lib/kernel/sub_sat.cl)
SIMPLE_2(uint, sub_sat, uint, x, uint, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(int, sub_sat, int, x, int, y, vc4cl_saturated_sub(x, y))

// upsample: (hi << width(lo)) | lo, combined into the next wider type.
SIMPLE_2(short, upsample, char, hi, uchar, lo, vc4cl_bitcast_short((vc4cl_sign_extend(hi) << 8) | vc4cl_bitcast_int(vc4cl_zero_extend(lo))))
SIMPLE_2(ushort, upsample, uchar, hi, uchar, lo, vc4cl_bitcast_ushort((vc4cl_zero_extend(hi) << 8) | vc4cl_zero_extend(lo)))
SIMPLE_2(int, upsample, short, hi, ushort, lo, (vc4cl_sign_extend(hi) << 16) | vc4cl_bitcast_int(vc4cl_zero_extend(lo)))
SIMPLE_2(uint, upsample, ushort, hi, ushort, lo, (vc4cl_zero_extend(hi) << 16) | vc4cl_zero_extend(lo))
SIMPLE_2(long, upsample, int, hi, uint, lo, (vc4cl_int_to_long(hi) << 32) | vc4cl_bitcast_long(vc4cl_int_to_ulong(lo)))
SIMPLE_2(ulong, upsample, uint, hi, uint, lo, (vc4cl_int_to_ulong(hi) << 32) | vc4cl_int_to_ulong(lo))

//" Returns the number of non-zero bits in x. "
SIMPLE_1(uchar, popcount, uchar, val, vc4cl_popcount(val))
SIMPLE_1(char, popcount, char, val, vc4cl_popcount(val))
SIMPLE_1(ushort, popcount, ushort, val, vc4cl_popcount(val))
SIMPLE_1(short, popcount, short, val, vc4cl_popcount(val))
SIMPLE_1(uint, popcount, uint, val, vc4cl_popcount(val))
SIMPLE_1(int, popcount, int, val, vc4cl_popcount(val))
SIMPLE_1(ulong, popcount, ulong, val, vc4cl_popcount(val))
SIMPLE_1(long, popcount, long, val, vc4cl_popcount(val))

// mul24: multiplication that only considers the low 24 bits of the
// operands, mapping directly to the hardware mul24 instruction.
SIMPLE_2(uchar, mul24, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(char, mul24, char, x, char, y, vc4cl_bitcast_char(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(ushort, mul24, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(short, mul24, short, x, short, y, vc4cl_bitcast_short(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(uint, mul24, uint, x, uint, y, vc4cl_mul24(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul24, int, x, int, y, vc4cl_mul24(x, y, VC4CL_SIGNED))
// mad24(a, b, c) = mul24(a, b) + c
SIMPLE_INTEGER_3(mad24, a, b, c, mul24(a, b) + c)
||||
|
||||
#undef SIMPLE_INTEGER_2 |
||||
#undef SIMPLE_INTEGER_3 |
||||
|
||||
#endif /* VC4CL_INTEGER_H */ |
||||
|
@ -0,0 +1,436 @@
|
||||
/* Declares interfaces for all intrinsic functions
|
||||
* |
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
|
||||
#ifndef VC4CL_INTRINSICS_H |
||||
#define VC4CL_INTRINSICS_H |
||||
|
||||
#include "_overloads.h" |
||||
|
||||
#define VC4CL_SIGNED 0 |
||||
#define VC4CL_UNSIGNED 1 |
||||
|
||||
/*
|
||||
* ALU operations |
||||
* |
||||
* NOTE: These operations directly map to the machine instructions and do not |
||||
* heed other data-types (e.g. vc4cl_clz will always return the leading zeroes to |
||||
* full 32-bit width) |
||||
*/ |
||||
OVERLOAD_2(float, vc4cl_fmax, float, x, float, y) |
||||
OVERLOAD_2(float, vc4cl_fmin, float, x, float, y) |
||||
OVERLOAD_2(float, vc4cl_fmaxabs, float, x, float, y) |
||||
OVERLOAD_2(float, vc4cl_fminabs, float, x, float, y) |
||||
OVERLOAD_1(int, vc4cl_ftoi, float, val) |
||||
OVERLOAD_1(float, vc4cl_itof, int, val) |
||||
|
||||
OVERLOAD_2(int, vc4cl_asr, uint, val, int, offset) |
||||
OVERLOAD_2(int, vc4cl_asr, int, val, int, offset) |
||||
OVERLOAD_2(uint, vc4cl_ror, uint, val, int, offset) |
||||
OVERLOAD_2(int, vc4cl_ror, int, val, int, offset) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_min, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_max, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(long, vc4cl_min, long, x, long, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(long, vc4cl_max, long, x, long, y, uchar, sign) |
||||
OVERLOAD_2(uint, vc4cl_and, uchar, x, uchar, y) |
||||
OVERLOAD_2(int, vc4cl_and, char, x, char, y) |
||||
OVERLOAD_2(uint, vc4cl_and, ushort, x, ushort, y) |
||||
OVERLOAD_2(int, vc4cl_and, short, x, short, y) |
||||
SIMPLE_2(uint, vc4cl_and, uint, x, uint, y, x & y) |
||||
SIMPLE_2(int, vc4cl_and, int, x, int, y, x & y) |
||||
OVERLOAD_1(uint, vc4cl_clz, uint, val) |
||||
OVERLOAD_1(int, vc4cl_clz, int, val) |
||||
|
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uchar, x, uchar, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul24, char, x, char, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, ushort, x, ushort, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul24, short, x, short, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uint, x, uint, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul24, int, x, int, y, uchar, sign) |
||||
|
||||
OVERLOAD_2(uchar, vc4cl_v8adds, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8adds, uint, x, uint, y) |
||||
OVERLOAD_2(uchar, vc4cl_v8subs, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8subs, uint, x, uint, y) |
||||
OVERLOAD_2(uchar, vc4cl_v8min, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8min, uint, x, uint, y) |
||||
OVERLOAD_2(uchar, vc4cl_v8max, uchar, x, uchar, y) |
||||
OVERLOAD_2(uint, vc4cl_v8max, uint, x, uint, y) |
||||
|
||||
/*
|
||||
* Pack/unpack modes |
||||
*/ |
||||
//TODO ALU needs to consume float for this to work
|
||||
//unpacks half to float (UNPACK 1: 16a -> 32)
|
||||
//OVERLOAD_1(float, vc4cl_unpack_half, half, val)
|
||||
//sign-extends short to int (UNPACK 1: 16a -> 32)
|
||||
OVERLOAD_1(int, vc4cl_unpack_sext, short, val) |
||||
//unpacks first byte [0, 1] to float (UNPACK 4: 8a -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte0, uchar, val) |
||||
//unpacks second byte [0, 1] to float (UNPACK 5: 8b -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte1, uchar, val) |
||||
//unpacks third byte [0, 1] to float (UNPACK 6: 8c -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte2, uchar, val) |
||||
//unpacks fourth byte [0, 1] to float (UNPACK 7: 8d -> 32)
|
||||
OVERLOAD_1(float, vc4cl_unpack_color_byte3, uchar, val) |
||||
//zero-extend first byte to uint (UNPACK 4: 8a -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte0, uchar, val) |
||||
//zero-extend second byte to uint (UNPACK 5: 8b -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte1, uchar, val) |
||||
//zero-extend third byte to uint (UNPACK 6: 8c -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte2, uchar, val) |
||||
//zero-extend fourth byte to uint (UNPACK 7: 8d -> 32)
|
||||
OVERLOAD_1(uint, vc4cl_unpack_byte3, uchar, val) |
||||
|
||||
//TODO ALU needs to consume float for this to work
|
||||
//packs float into half (PACK 1: 32 -> 16a)
|
||||
//OVERLOAD_1(half, vc4cl_pack_half, float, val)
|
||||
//converts to unsigned 16-bit integer, truncates the result (PACK 1: 32 -> 16a)
|
||||
OVERLOAD_1(ushort, vc4cl_pack_truncate, int, val) |
||||
OVERLOAD_1(ushort, vc4cl_pack_truncate, uint, val) |
||||
//replicates the LSB into all four bytes (PACK 3: 32 -> 8888)
|
||||
OVERLOAD_1(uint, vc4cl_replicate_lsb, char, val) |
||||
OVERLOAD_1(uint, vc4cl_replicate_lsb, uchar, val) |
||||
OVERLOAD_1(uint, vc4cl_replicate_lsb, uint, val) |
||||
//takes the LSB and writes it into LSB (PACK 4: 32 -> 8a)
|
||||
OVERLOAD_1(uchar, vc4cl_pack_lsb, char, val) |
||||
OVERLOAD_1(uchar, vc4cl_pack_lsb, uchar, val) |
||||
OVERLOAD_1(uchar, vc4cl_pack_lsb, uint, val) |
||||
//calculates addition, but saturates the result afterwards (depending on signed integer over-/underflow of addition) (uses PACK 8: 32 -> 32)
|
||||
OVERLOAD_2(int, vc4cl_saturated_add, int, x, int, y) |
||||
//NOTE: Since the 32 -> 32 saturation pack mode works differently for sub, the intrinsic is implemented differently than saturated_add
|
||||
OVERLOAD_2(int, vc4cl_saturated_sub, int, x, int, y) |
||||
//saturates to unsigned byte (PACK 12: 32 -> 8a)
|
||||
OVERLOAD_1(uchar, vc4cl_saturate_lsb, uint, val) |
||||
|
||||
|
||||
/*
|
||||
* SFU calls |
||||
*/ |
||||
OVERLOAD_1(float, vc4cl_sfu_recip, float, val) |
||||
OVERLOAD_1(float, vc4cl_sfu_rsqrt, float, val) |
||||
OVERLOAD_1(float, vc4cl_sfu_log2, float, val) |
||||
OVERLOAD_1(float, vc4cl_sfu_exp2, float, val) |
||||
|
||||
/*
|
||||
* Periphery access |
||||
*/ |
||||
void vc4cl_mutex_lock(void); |
||||
void vc4cl_mutex_unlock(void); |
||||
//read DMA without locking the mutex
|
||||
OVERLOAD_1(int, vc4cl_dma_read, volatile __global int, * ptr) |
||||
OVERLOAD_1(uint, vc4cl_dma_read, volatile __global uint, * ptr) |
||||
OVERLOAD_1(float, vc4cl_dma_read, volatile __global float, * ptr) |
||||
OVERLOAD_1(int, vc4cl_dma_read, volatile __local int, * ptr) |
||||
OVERLOAD_1(uint, vc4cl_dma_read, volatile __local uint, * ptr) |
||||
OVERLOAD_1(float, vc4cl_dma_read, volatile __local float, * ptr) |
||||
//write DMA without locking the mutex
|
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global int, * ptr, int, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global uint, * ptr, uint, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global float, * ptr, float, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local int, * ptr, int, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local uint, * ptr, uint, val) |
||||
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local float, * ptr, float, val) |
||||
//copy DMA without locking the mutex
|
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uchar, *dest, const __local uchar, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global char, *dest, const __local char, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global ushort, *dest, const __local ushort, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global short, *dest, const __local short, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uint, *dest, const __local uint, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global int, *dest, const __local int, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global float, *dest, const __local float, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uchar, *dest, const __global uchar, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local char, *dest, const __global char, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local ushort, *dest, const __global ushort, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local short, *dest, const __global short, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uint, *dest, const __global uint, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local int, *dest, const __global int, *src, size_t, num_elements) |
||||
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local float, *dest, const __global float, *src, size_t, num_elements) |
||||
//load into VPM without locking the mutex
|
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uchar, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global char, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global ushort, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global short, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uint, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global int, *ptr, size_t, num_elements) |
||||
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global float, *ptr, size_t, num_elements) |
||||
// special handling of 3-element load/store, since LLVM (compliant with the OpenCL standard) by default generates 4-element load/store
|
||||
char3 vc4cl_vload3(const __global char* ptr) OVERLOADABLE; |
||||
char3 vc4cl_vload3(const __local char* ptr) OVERLOADABLE; |
||||
char3 vc4cl_vload3(const __private char* ptr) OVERLOADABLE; |
||||
char3 vc4cl_vload3(const __constant char* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __global uchar* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __local uchar* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __private uchar* ptr) OVERLOADABLE; |
||||
uchar3 vc4cl_vload3(const __constant uchar* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __global short* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __local short* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __private short* ptr) OVERLOADABLE; |
||||
short3 vc4cl_vload3(const __constant short* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __global ushort* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __local ushort* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __private ushort* ptr) OVERLOADABLE; |
||||
ushort3 vc4cl_vload3(const __constant ushort* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __global int* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __local int* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __private int* ptr) OVERLOADABLE; |
||||
int3 vc4cl_vload3(const __constant int* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __global uint* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __local uint* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __private uint* ptr) OVERLOADABLE; |
||||
uint3 vc4cl_vload3(const __constant uint* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __global float* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __local float* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __private float* ptr) OVERLOADABLE; |
||||
float3 vc4cl_vload3(const __constant float* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __global long* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __local long* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __private long* ptr) OVERLOADABLE; |
||||
long3 vc4cl_vload3(const __constant long* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __global ulong* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __local ulong* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __private ulong* ptr) OVERLOADABLE; |
||||
ulong3 vc4cl_vload3(const __constant ulong* ptr) OVERLOADABLE; |
||||
|
||||
void vc4cl_vstore3(__global char* ptr, char3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local char* ptr, char3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private char* ptr, char3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global uchar* ptr, uchar3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local uchar* ptr, uchar3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private uchar* ptr, uchar3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global short* ptr, short3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local short* ptr, short3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private short* ptr, short3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global ushort* ptr, ushort3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local ushort* ptr, ushort3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private ushort* ptr, ushort3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global int* ptr, int3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local int* ptr, int3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private int* ptr, int3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global uint* ptr, uint3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local uint* ptr, uint3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private uint* ptr, uint3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global float* ptr, float3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local float* ptr, float3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private float* ptr, float3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global long* ptr, long3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local long* ptr, long3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private long* ptr, long3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__global ulong* ptr, ulong3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__local ulong* ptr, ulong3 val) OVERLOADABLE; |
||||
void vc4cl_vstore3(__private ulong* ptr, ulong3 val) OVERLOADABLE; |
||||
/*
|
||||
* Work-item functions |
||||
* Mapped to UNIFORM reads |
||||
* |
||||
* local values are stored in the a UNIFORM in this fashion: |
||||
* | 0 | dim2 | dim1 | dim0 | |
||||
* -> to read value of dimension x, calculate: (UNIFORM >> (dim * 8)) & 0xFF |
||||
* |
||||
* This can be compacted in such way, since for a maximum value of 12, the local ID and size fits into 1 Byte |
||||
*/ |
||||
PURE uchar vc4cl_work_dimensions(void); |
||||
PURE uchar vc4cl_local_size(uint dim); |
||||
PURE uchar vc4cl_local_id(uint dim); |
||||
PURE uint vc4cl_num_groups(uint dim); |
||||
PURE uint vc4cl_group_id(uint dim); |
||||
PURE uint vc4cl_global_offset(uint dim); |
||||
PURE uint vc4cl_global_size(uint dim); |
||||
PURE uint vc4cl_global_id(uint dim); |
||||
PURE uchar vc4cl_local_linear_id(void); |
||||
PURE uint vc4cl_global_linear_id(void); |
||||
|
||||
/*
|
||||
* Image functions |
||||
* In CLang, read_only and write_only image-types are separate types. |
||||
* Also in CLang, OpenCL image-types are built-in opaque types |
||||
*/ |
||||
#ifdef __IMAGE_SUPPORT__ |
||||
/*
|
||||
* Texture Config Parameter 0 |
||||
* Broadcom specification, table 15 |
||||
* |
||||
* 0 - 3 | 4 bits | Number of mipmap levels minus 1 |
||||
* 4 - 7 | 4 bits | texture data type (high bit is on config parameter 1) |
||||
* 8 | 1 bit | flip texture Y axis |
||||
* 9 | 1 bit | cube map mode |
||||
* 10 - 11 | 2 bits | cache swizzle |
||||
* 12 - 31 | 20 bits | texture base pointer (multiple of 4KB) |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_basic_setup) |
||||
/*
|
||||
* Texture Config Parameter 1 |
||||
* Broadcom specification, table 16 |
||||
* |
||||
* 0 - 1 | 2 bits | S (x-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border) |
||||
* 2 - 3 | 2 bits | T (y-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border) |
||||
* 4 - 6 | 3 bits | minification filter (interpolation) |
||||
* 7 | 1 bit | magnification filter |
||||
* 8 - 18 | 11 bits | image width (0 = 2048) |
||||
* 19 | 1 bit | flip ETC Y (per block) |
||||
* 20 - 30 | 11 bits | image height (0 = 248) |
||||
* 31 | 1 bit | high bit of texture type |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_access_setup) |
||||
/*
|
||||
* Texture Config Parameters 2 and 3 |
||||
* Broadcom specification, table 17 |
||||
* |
||||
* Cube map stride: |
||||
* 0 | 1 bit | disable automatic LOD, use bias only |
||||
* 12 - 29 | 18 bits | cube map stride (in multiples of 4KB) |
||||
* 30 - 31 | 2 bits | value 1 for cube map stride |
||||
* |
||||
* Child image dimensions: |
||||
* 0 - 10 | 11 bits | child image width (0 = 2048, does not work, see errata HW-2753) |
||||
* 12 - 22 | 11 bits | child image height (0 = 2048, does not work, see errata HW-2753) |
||||
* 30 - 31 | 2 bits | value 2 for child image dimensions |
||||
* |
||||
* Child image offsets: |
||||
* 0 - 10 | 11 bits | child image X offset |
||||
* 12 - 22 | 11 bits | child image Y offset |
||||
* 30 - 31 | 2 bits | value 3 for child image offsets |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_extended_setup) |
||||
/*
|
||||
* To apply a sampler to an image, we need to override the image-access setup UNIFORM before a read with the magnification/minification filters and wrap modes to use |
||||
*/ |
||||
OVERLOAD_ALL_IMAGE_TYPES_1(void, vc4cl_set_image_access_setup, uint, val) |
||||
CONST uint vc4cl_sampler_get_normalized_coords(sampler_t sampler); |
||||
CONST uint vc4cl_sampler_get_addressing_mode(sampler_t sampler); |
||||
CONST uint vc4cl_sampler_get_filter_mode(sampler_t sampler); |
||||
/*
|
||||
* Image read functions |
||||
* |
||||
* The coordinates need to be floating-values in the range [0, 1] and are scaled to the width/height of the image. |
||||
* The returned data is not necessarily <4 x int32>, but up to 4 components with up to 32 bits each, loaded according to the byte-sizes and number of components specified in the channel_type_size and channel_order_size. |
||||
* |
||||
* So, this functions return the data in the native format (as stored in the image-buffer), but correctly distributed across the 4 components. |
||||
*/ |
||||
int4 vc4cl_image_read(read_only image1d_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image1d_buffer_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image1d_array_t image, float coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image2d_t image, float2 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image2d_array_t image, float2 coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
int4 vc4cl_image_read(read_only image3d_t image, float4 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE; |
||||
#endif |
||||
|
||||
/*
|
||||
* Type conversions |
||||
*/ |
||||
// TODO use __builtin_convertvector ?? https://clang.llvm.org/docs/LanguageExtensions.html#builtin-convertvector
|
||||
// check available on all compiler versions, generated LLVM IR code!
|
||||
//component-wise bitcasts
|
||||
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, uint, val) |
||||
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, int, val) |
||||
OVERLOAD_1(char, vc4cl_bitcast_char, uint, val) |
||||
OVERLOAD_1(char, vc4cl_bitcast_char, int, val) |
||||
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, uint, val) |
||||
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, int, val) |
||||
OVERLOAD_1(short, vc4cl_bitcast_short, uint, val) |
||||
OVERLOAD_1(short, vc4cl_bitcast_short, int, val) |
||||
SIMPLE_1(uint, vc4cl_bitcast_uint, uint, val, val) |
||||
OVERLOAD_1(uint, vc4cl_bitcast_uint, int, val) |
||||
OVERLOAD_1(int, vc4cl_bitcast_int, uint, val) |
||||
SIMPLE_1(int, vc4cl_bitcast_int, int, val, val) |
||||
|
||||
OVERLOAD_1(uint, vc4cl_bitcast_uint, float, val) |
||||
OVERLOAD_1(float, vc4cl_bitcast_float, uint, val) |
||||
OVERLOAD_1(int, vc4cl_bitcast_int, float, val) |
||||
OVERLOAD_1(float, vc4cl_bitcast_float, int, val) |
||||
|
||||
SIMPLE_1(int, vc4cl_sign_extend, char, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFF) << 24, 24)) |
||||
//SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFFFF) << 16, 16))
|
||||
SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_unpack_sext(val)) |
||||
|
||||
SIMPLE_1(uint, vc4cl_zero_extend, uchar, val, vc4cl_and(val, (arg_t) (0xFFU))) |
||||
SIMPLE_1(uint, vc4cl_zero_extend, ushort, val, vc4cl_and(val, (arg_t) (0xFFFFU))) |
||||
|
||||
SIMPLE_1(uint, vc4cl_extend, uchar, val, vc4cl_zero_extend(val)) |
||||
SIMPLE_1(int, vc4cl_extend, char, val, vc4cl_sign_extend(val)) |
||||
SIMPLE_1(uint, vc4cl_extend, ushort, val, vc4cl_zero_extend(val)) |
||||
SIMPLE_1(int, vc4cl_extend, short, val, vc4cl_sign_extend(val)) |
||||
SIMPLE_1(uint, vc4cl_extend, uint, val, val) |
||||
SIMPLE_1(int, vc4cl_extend, int, val, val) |
||||
SIMPLE_1(ulong, vc4cl_extend, ulong, val, val) |
||||
SIMPLE_1(long, vc4cl_extend, long, val, val) |
||||
|
||||
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, long, val) |
||||
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, ulong, val) |
||||
OVERLOAD_1(long, vc4cl_bitcast_long, ulong, val) |
||||
OVERLOAD_1(long, vc4cl_bitcast_long, long, val) |
||||
OVERLOAD_1(uint, vc4cl_long_to_int, ulong, val) |
||||
OVERLOAD_1(int, vc4cl_long_to_int, long, val) |
||||
OVERLOAD_1(ulong, vc4cl_int_to_ulong, uint, val) |
||||
OVERLOAD_1(long, vc4cl_int_to_long, int, val) |
||||
SIMPLE_1(ulong, vc4cl_extend_to_long, uint, val, vc4cl_int_to_ulong(val)) |
||||
SIMPLE_1(long, vc4cl_extend_to_long, int, val, vc4cl_int_to_long(val)) |
||||
OVERLOAD_2_SCALAR(int, vc4cl_long_to_int_sat, long, val, uchar, sign) |
||||
OVERLOAD_2_SCALAR(uint, vc4cl_long_to_int_sat, ulong, val, uchar, sign) |
||||
OVERLOAD_1(float, vc4cl_long_to_float, long, val) |
||||
OVERLOAD_1(float, vc4cl_ulong_to_float, ulong, val) |
||||
|
||||
/*
|
||||
* Other functions |
||||
*/ |
||||
SIMPLE_1(uchar, vc4cl_msb_set, uchar, val, vc4cl_bitcast_uchar(vc4cl_extend(val >> 7 == (arg_t)1))) |
||||
SIMPLE_1(char, vc4cl_msb_set, char, val, vc4cl_bitcast_char(vc4cl_and((arg_t)(val >> 7), (arg_t)1)) == (arg_t)1) |
||||
SIMPLE_1(ushort, vc4cl_msb_set, ushort, val, vc4cl_bitcast_ushort(vc4cl_extend(val >> 15 == (arg_t)1))) |
||||
SIMPLE_1(short, vc4cl_msb_set, short, val, vc4cl_bitcast_short(vc4cl_and((arg_t)(val >> 15), (arg_t)1)) == (arg_t)1) |
||||
SIMPLE_1(uint, vc4cl_msb_set, uint, val, vc4cl_bitcast_uint(val >> 31 == 1)) |
||||
SIMPLE_1(int, vc4cl_msb_set, int, val, (val < (arg_t)0)) |
||||
SIMPLE_1(long, vc4cl_msb_set, ulong, val, (val >> 63 == 1)) |
||||
SIMPLE_1(long, vc4cl_msb_set, long, val, (val < (arg_t)0)) |
||||
|
||||
OVERLOAD_1(int, vc4cl_is_nan, float, val) |
||||
OVERLOAD_1(int, vc4cl_is_inf_nan, float, val) |
||||
OVERLOAD_1(int, vc4cl_is_zero, float, val) |
||||
|
||||
OVERLOAD_3_SCALAR(int, vc4cl_mul_hi, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(uint, vc4cl_mul_hi, uint, x, uint, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(long, vc4cl_mul_full, int, x, int, y, uchar, sign) |
||||
OVERLOAD_3_SCALAR(ulong, vc4cl_mul_full, uint, x, uint, y, uchar, sign) |
||||
|
||||
OVERLOAD_1(uchar, vc4cl_popcount, uchar, val) |
||||
OVERLOAD_1(char, vc4cl_popcount, char, val) |
||||
OVERLOAD_1(ushort, vc4cl_popcount, ushort, val) |
||||
OVERLOAD_1(short, vc4cl_popcount, short, val) |
||||
OVERLOAD_1(uint, vc4cl_popcount, uint, val) |
||||
OVERLOAD_1(int, vc4cl_popcount, int, val) |
||||
OVERLOAD_1(ulong, vc4cl_popcount, ulong, val) |
||||
OVERLOAD_1(long, vc4cl_popcount, long, val) |
||||
|
||||
event_t vc4cl_set_event(event_t ev) CONST; |
||||
|
||||
void vc4cl_barrier(cl_mem_fence_flags); |
||||
|
||||
/*
|
||||
* Vector functions |
||||
*/ |
||||
//Rotates the vector-elements according to the offset (-15 .. +15)
|
||||
//an offset of 5 means rotate up 5 positions (e.g. x.s0 -> y.s5, x.s10 -> y.15, x.s12 -> y.s1
|
||||
//NOTE: the rotation is always all 16 elements!! So functions with vector-size of less than 16 MUST not use the positions shifted in from the remaining vector-elements
|
||||
OVERLOAD_2_SCALAR(uchar, vc4cl_vector_rotate, uchar, val, char, offset) |
||||
OVERLOAD_2_SCALAR(char, vc4cl_vector_rotate, char, val, char, offset) |
||||
OVERLOAD_2_SCALAR(ushort, vc4cl_vector_rotate, ushort, val, char, offset) |
||||
OVERLOAD_2_SCALAR(short, vc4cl_vector_rotate, short, val, char, offset) |
||||
OVERLOAD_2_SCALAR(uint, vc4cl_vector_rotate, uint, val, char, offset) |
||||
OVERLOAD_2_SCALAR(int, vc4cl_vector_rotate, int, val, char, offset) |
||||
OVERLOAD_2_SCALAR(ulong, vc4cl_vector_rotate, ulong, val, char, offset) |
||||
OVERLOAD_2_SCALAR(long, vc4cl_vector_rotate, long, val, char, offset) |
||||
OVERLOAD_2_SCALAR(float, vc4cl_vector_rotate, float, val, char, offset) |
||||
|
||||
/*
|
||||
* For debugging purposes |
||||
*/ |
||||
//The vector element number (0 .. 15)
|
||||
CONST uchar16 vc4cl_element_number(void); |
||||
//the ID of the QPU (the processor)
|
||||
CONST uchar vc4cl_qpu_number(void); |
||||
|
||||
#endif /* VC4CL_INTRINSICS_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,819 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_OVERLOADS_H |
||||
#define VC4CL_OVERLOADS_H |
||||
|
||||
#include "_config.h" |
||||
|
||||
#ifndef OVERLOADABLE |
||||
#define OVERLOADABLE __attribute__((overloadable)) |
||||
#endif |
||||
/*
|
||||
* "__attribute__((const)) function attribute |
||||
* Many functions examine only the arguments passed to them, and have no effects except for the return value. |
||||
* This is a much stricter class than __attribute__((pure)), because a function is not permitted to read global memory. |
||||
* If a function is known to operate only on its arguments then it can be subject to common sub-expression elimination and loop optimizations." |
||||
* |
||||
* http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/Cacgigch.html
|
||||
*/ |
||||
#ifndef CONST |
||||
#define CONST __attribute__((const)) //tells the compiler, that result won't change
|
||||
#endif |
||||
/*
|
||||
* "__attribute__((pure)) function attribute |
||||
* Many functions have no effects except to return a value, and their return value depends only on the parameters and global variables. |
||||
* Functions of this kind can be subject to data flow analysis and might be eliminated." |
||||
* |
||||
* http://infocenter.arm.com/help/topic/com.arm.doc.dui0491c/Cacigdac.html
|
||||
*/ |
||||
#define PURE __attribute__((pure)) |
||||
#define INLINE __attribute__((always_inline)) __attribute__((flatten)) inline //flatten inlines all call within this function
|
||||
#define FUNC_1(ret, func, argType, argName) ret func(argType argName) OVERLOADABLE |
||||
#ifndef OVERLOAD_1 |
||||
#define OVERLOAD_1(ret, func, argType, argName) \ |
||||
FUNC_1(ret##16, func, argType##16, argName); \
|
||||
FUNC_1(ret##8, func, argType##8, argName); \
|
||||
FUNC_1(ret##4, func, argType##4, argName); \
|
||||
FUNC_1(ret##3, func, argType##3, argName); \
|
||||
FUNC_1(ret##2, func, argType##2, argName); \
|
||||
FUNC_1(ret, func, argType, argName); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_1_RETURN_SCALAR |
||||
#define OVERLOAD_1_RETURN_SCALAR(ret, func, argType, argName) \ |
||||
FUNC_1(ret, func, argType##16, argName); \
|
||||
FUNC_1(ret, func, argType##8, argName); \
|
||||
FUNC_1(ret, func, argType##4, argName); \
|
||||
FUNC_1(ret, func, argType##3, argName); \
|
||||
FUNC_1(ret, func, argType##2, argName); \
|
||||
FUNC_1(ret, func, argType, argName); |
||||
#endif |
||||
|
||||
#define FUNC_2(ret, func, argType0, argName0, argType1, argName1) ret func(argType0 argName0, argType1 argName1) OVERLOADABLE |
||||
#ifndef OVERLOAD_2 |
||||
#define OVERLOAD_2(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1); \
|
||||
FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1); \
|
||||
FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1); \
|
||||
FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1); \
|
||||
FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_2_SCALAR |
||||
#define OVERLOAD_2_SCALAR(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1); \
|
||||
FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_2_RETURN_SCALAR |
||||
#define OVERLOAD_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1); \
|
||||
FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1); \
|
||||
FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1); \
|
||||
FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1); \
|
||||
FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_2_SCALAR_RETURN_SCALAR |
||||
#define OVERLOAD_2_SCALAR_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \ |
||||
FUNC_2(ret, func, argType0##16, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##8, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##4, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##3, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0##2, argName0, argType1, argName1); \
|
||||
FUNC_2(ret, func, argType0, argName0, argType1, argName1); |
||||
#endif |
||||
|
||||
#define FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) ret func(argType0 argName0, argType1 argName1, argType2 argName2) OVERLOADABLE |
||||
#ifndef OVERLOAD_3 |
||||
#define OVERLOAD_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \ |
||||
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2); \
|
||||
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2); \
|
||||
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2); \
|
||||
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2); \
|
||||
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2); \
|
||||
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2); |
||||
#endif |
||||
|
||||
#ifndef OVERLOAD_3_SCALAR |
||||
#define OVERLOAD_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \ |
||||
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2); \
|
||||
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2); \
|
||||
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2); |
||||
#endif |
||||
|
||||
// Declares an overloadable 4-argument function prototype (see FUNC_1..FUNC_3).
#define FUNC_4(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3) OVERLOADABLE

// Declares an overloadable 5-argument function prototype.
#define FUNC_5(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3, arg4Type, arg4Name) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3, arg4Type arg4Name) OVERLOADABLE
||||
|
||||
#ifndef SIMPLE_1
/*
 * Defines a unary function 'func' for all vector widths (16, 8, 4, 3, 2) and
 * the scalar type, each body being "return content;". Inside 'content', the
 * typedefs arg_t / result_t name the per-width argument and result types so
 * the same expression works for every width.
 */
#define SIMPLE_1(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret##16, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##8, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##4, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##3, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret##2, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_1_RETURN_SCALAR
/*
 * Like SIMPLE_1, but the return type stays scalar ('ret') for every vector
 * width of the argument (e.g. reductions of a vector to a single value).
 */
#define SIMPLE_1_RETURN_SCALAR(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_2
/*
 * Defines a binary function 'func' for all vector widths and the scalar type;
 * both arguments and the result get the same width. Inside 'content' the
 * typedefs arg0_t / arg1_t / result_t name the per-width types.
 */
#define SIMPLE_2(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_2_RETURN_SCALAR
/*
 * Like SIMPLE_2, but the return type stays scalar ('ret') for every vector
 * width of the two arguments.
 */
#define SIMPLE_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef ret result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_2_SCALAR
/*
 * Like SIMPLE_2, but the SECOND argument keeps its scalar type for every
 * vector width of the first argument and the result. The all-scalar overload
 * is deliberately not generated (see trailing comment).
 */
#define SIMPLE_2_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    //scalar part is skipped, since it is too often already defined for e.g. a version taking two vectors
#endif
||||
|
||||
#ifndef SIMPLE_3
/*
 * Defines a ternary function 'func' for all vector widths and the scalar
 * type; all three arguments and the result share the width. Inside 'content'
 * the typedefs arg0_t / arg1_t / arg2_t / result_t name the per-width types.
 */
#define SIMPLE_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2##16 arg2_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2##8 arg2_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2##4 arg2_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2##3 arg2_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2##2 arg2_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret result_t;\
        return content; \
    }
#endif
||||
|
||||
#ifndef SIMPLE_3_SCALAR
/*
 * Like SIMPLE_3, but the THIRD argument keeps its scalar type for every
 * vector width of the first two arguments and the result. The all-scalar
 * overload is deliberately not generated (see trailing comment).
 */
#define SIMPLE_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    //scalar version is skipped, since it is already defined by the vector-vector-vector version with "vector" of 1 element
#endif
||||
|
||||
#ifndef SIMPLE_3_TWO_SCALAR
/*
 * Like SIMPLE_3, but the SECOND and THIRD arguments keep their scalar types
 * for every vector width of the first argument and the result. The all-scalar
 * overload is deliberately not generated (see trailing comment).
 */
#define SIMPLE_3_TWO_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##16 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##8 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##4 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##3 result_t;\
        return content; \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##2 result_t;\
        return content; \
    } \
    //scalar version is skipped, since it is already defined by the vector-vector-vector version with "vector" of 1 element
#endif
||||
|
||||
#ifndef COMPLEX_1
/*
 * Like SIMPLE_1, but 'content' is a full statement block (no implicit
 * "return"), and additionally provides the width-matched helper typedefs
 * int_t and float_t for intermediate computations.
 */
#define COMPLEX_1(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret##16, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        typedef float##16 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##8, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        typedef float##8 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##4, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        typedef float##4 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##3, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        typedef float##3 float_t; \
        content \
    } \
    INLINE FUNC_1(ret##2, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        typedef float##2 float_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef ret result_t;\
        typedef int int_t; \
        typedef float float_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_1_RETURN_SCALAR
/*
 * Like COMPLEX_1, but the return type stays scalar ('ret') for every vector
 * width of the argument. Only arg_t and int_t helper typedefs are provided
 * here (no result_t / float_t).
 */
#define COMPLEX_1_RETURN_SCALAR(ret, func, argType, argName, content) \
    INLINE FUNC_1(ret, func, argType##16, argName) \
    { \
        typedef argType##16 arg_t;\
        typedef int##16 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##8, argName) \
    { \
        typedef argType##8 arg_t;\
        typedef int##8 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##4, argName) \
    { \
        typedef argType##4 arg_t;\
        typedef int##4 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##3, argName) \
    { \
        typedef argType##3 arg_t;\
        typedef int##3 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType##2, argName) \
    { \
        typedef argType##2 arg_t;\
        typedef int##2 int_t; \
        content \
    } \
    INLINE FUNC_1(ret, func, argType, argName) \
    { \
        typedef argType arg_t;\
        typedef int int_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_2
/*
 * Binary variant of COMPLEX_1: 'content' is a full statement block, with
 * width-matched helper typedefs arg0_t, arg1_t, result_t, int_t, uint_t and
 * float_t available inside it.
 */
#define COMPLEX_2(ret, func, argType0, argName0, argType1, argName1, content) \
    INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        typedef uint##16 uint_t; \
        typedef float##16 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        typedef uint##8 uint_t; \
        typedef float##8 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        typedef uint##4 uint_t; \
        typedef float##4 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        typedef uint##3 uint_t; \
        typedef float##3 float_t; \
        content \
    } \
    INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        typedef uint##2 uint_t; \
        typedef float##2 float_t; \
        content \
    } \
    INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef ret result_t;\
        typedef int int_t; \
        typedef uint uint_t; \
        typedef float float_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_3
/*
 * Ternary variant of COMPLEX_1: 'content' is a full statement block, with
 * width-matched helper typedefs arg0_t, arg1_t, arg2_t, result_t and int_t
 * available inside it (no uint_t/float_t here, unlike COMPLEX_2).
 */
#define COMPLEX_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2##16 arg2_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2##8 arg2_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2##4 arg2_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2##3 arg2_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2##2 arg2_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        content \
    } \
    INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret result_t;\
        typedef int int_t; \
        content \
    }
#endif
||||
|
||||
#ifndef COMPLEX_3_SCALAR
/*
 * Like COMPLEX_3, but the THIRD argument keeps its scalar type for every
 * vector width of the first two arguments and the result. Unlike the
 * SIMPLE_*_SCALAR macros, this one DOES generate the all-scalar overload.
 */
#define COMPLEX_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
    INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
    { \
        typedef argType0##16 arg0_t;\
        typedef argType1##16 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##16 result_t;\
        typedef int##16 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
    { \
        typedef argType0##8 arg0_t;\
        typedef argType1##8 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##8 result_t;\
        typedef int##8 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
    { \
        typedef argType0##4 arg0_t;\
        typedef argType1##4 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##4 result_t;\
        typedef int##4 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
    { \
        typedef argType0##3 arg0_t;\
        typedef argType1##3 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##3 result_t;\
        typedef int##3 int_t; \
        content \
    } \
    INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
    { \
        typedef argType0##2 arg0_t;\
        typedef argType1##2 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret##2 result_t;\
        typedef int##2 int_t; \
        content \
    } \
    INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
    { \
        typedef argType0 arg0_t;\
        typedef argType1 arg1_t;\
        typedef argType2 arg2_t;\
        typedef ret result_t;\
        typedef int int_t; \
        content \
    }
#endif
||||
|
||||
/*
 * Declares a CONST 1-argument prototype of 'func' for every supported image
 * type (read-only and write-only). The write_only image3d_t overload is
 * intentionally commented out — presumably because it requires the
 * cl_khr_3d_image_writes extension; confirm against the VC4CL runtime.
 */
#define OVERLOAD_ALL_IMAGE_TYPES(ret, func) \
    CONST FUNC_1(ret, func, read_only image1d_t, image); \
    CONST FUNC_1(ret, func, write_only image1d_t, image); \
    CONST FUNC_1(ret, func, read_only image2d_t, image); \
    CONST FUNC_1(ret, func, write_only image2d_t, image); \
    CONST FUNC_1(ret, func, read_only image3d_t, image); \
    /* XXX CONST FUNC_1(ret, func, write_only image3d_t, image); */ \
    CONST FUNC_1(ret, func, read_only image1d_buffer_t, image); \
    CONST FUNC_1(ret, func, write_only image1d_buffer_t, image); \
    CONST FUNC_1(ret, func, read_only image1d_array_t, image); \
    CONST FUNC_1(ret, func, write_only image1d_array_t, image); \
    CONST FUNC_1(ret, func, read_only image2d_array_t, image); \
    CONST FUNC_1(ret, func, write_only image2d_array_t, image);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, argType argName) for every
 * supported image type. write_only image3d_t is skipped, as in
 * OVERLOAD_ALL_IMAGE_TYPES.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_1(ret, func, argType, argName) \
    FUNC_2(ret, func, read_only image1d_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image1d_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image2d_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image2d_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image3d_t, image, argType, argName); \
    /* XXX FUNC_2(ret, func, write_only image3d_t, image, argType, argName); */ \
    FUNC_2(ret, func, read_only image1d_buffer_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image1d_buffer_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image1d_array_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image1d_array_t, image, argType, argName); \
    FUNC_2(ret, func, read_only image2d_array_t, image, argType, argName); \
    FUNC_2(ret, func, write_only image2d_array_t, image, argType, argName);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, arg0, arg1) for every
 * supported image type. write_only image3d_t is skipped, as above.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_2(ret, func, arg0Type, arg0Name, arg1Type, arg1Name) \
    FUNC_3(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    /* XXX FUNC_3(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); */ \
    FUNC_3(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
    FUNC_3(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, arg0, arg1, arg2) for every
 * supported image type. write_only image3d_t is skipped, as above.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_3(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name) \
    FUNC_4(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    /* XXX FUNC_4(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); */ \
    FUNC_4(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
    FUNC_4(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name);
||||
|
||||
/*
 * Declares a prototype of 'func' taking (image, arg0, arg1, arg2, arg3) for
 * every supported image type. write_only image3d_t is skipped, as above.
 */
#define OVERLOAD_ALL_IMAGE_TYPES_4(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name) \
    FUNC_5(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    /* XXX FUNC_5(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); */ \
    FUNC_5(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
    FUNC_5(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name);
||||
|
||||
#endif /* VC4CL_OVERLOADS_H */ |
||||
|
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_PRINTF
#define VC4CL_PRINTF

#include "_config.h"

// NOTE(review): printf is NOT implemented for VC4CL; the sketch below is an
// unfinished draft kept commented out by the author (missing 'c' fall-through
// break, empty 's' case, and it advances formatPtr but tests *format in the
// loop condition). Consider deleting it or finishing it — confirm upstream.
//void* vc4cl_get_param(uint);
//void vc4cl_print_char(char);
//
//INLINE int printf(__constant const char * restrict format, ...)
//{
//    __constant const char* formatPtr = format;
//    uint paramIndex = 1;
//    while(*format != '\0')
//    {
//        if(*format == '%')
//        {
//            ++formatPtr;
//            switch(*formatPtr)
//            {
//            case '%':
//                vc4cl_print_char('%');
//                break;
//            case 'c':
//                vc4cl_print_char(*vc4cl_get_param(paramIndex));
//            case 's':
//
//            }
//        }
//        else
//            vc4cl_print_char(*formatPtr);
//        ++formatPtr;
//    }
//    //TODO
//    return -1;
//}

#endif /* VC4CL_PRINTF */
@ -0,0 +1,341 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_RELATIONAL_H |
||||
#define VC4CL_RELATIONAL_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
#include "_intrinsics.h" |
||||
|
||||
#ifndef COMPARISON_1
/*
 * Defines a unary float comparison function for all vector widths and the
 * scalar type. Vector overloads return -1 (all bits set) per true component,
 * the scalar overload returns 1 — matching OpenCL relational-function
 * semantics. 'content' is the boolean expression over 'val'.
 */
#define COMPARISON_1(func, content) \
    INLINE FUNC_1(int##16, func, float##16, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##8, func, float##8, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##4, func, float##4, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##3, func, float##3, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int##2, func, float##2, val) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_1(int, func, float, val) CONST \
    { /* 1 instead of -1 here on purpose! */ \
        return (content) ? 1 : 0; \
    }
#endif
||||
|
||||
#ifndef COMPARISON_2
/*
 * Defines a binary float comparison function over arguments 'x' and 'y' for
 * all vector widths and the scalar type. As with COMPARISON_1, vector
 * overloads return -1 per true component, the scalar overload returns 1.
 */
#define COMPARISON_2(func, content) \
    INLINE FUNC_2(int##16, func, float##16, x, float##16, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##8, func, float##8, x, float##8, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##4, func, float##4, x, float##4, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##3, func, float##3, x, float##3, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int##2, func, float##2, x, float##2, y) CONST \
    { \
        return (content) ? -1 : 0; \
    } \
    INLINE FUNC_2(int, func, float, x, float, y) CONST \
    { /* 1 instead of -1 here on purpose! */ \
        return (content) ? 1 : 0; \
    }
#endif
||||
|
||||
#ifndef FOR_ALL_ELEMENTS
/*
 * Horizontally folds all elements of a vector with the binary operator 'op'
 * (after applying 'conv' to the input) and returns whether the folded value
 * is non-zero. Wide vectors are folded in log2(width) steps via
 * vc4cl_vector_rotate; narrow vectors (2, 3) fold element-wise directly.
 * Used to generate any()/all()-style reductions.
 */
#define FOR_ALL_ELEMENTS(func, type, op, conv) \
    INLINE FUNC_1(int, func, type##16, x) CONST \
    { \
        /* (s0, s1, s2, ..., sf) */ \
        type##16 val0 = conv(x); \
        /* element i becomes s_i op s_(i+1) (rotation wraps mod 16) */ \
        val0 = val0 op vc4cl_vector_rotate(val0, -1); \
        /* element i becomes the fold of 4 consecutive source elements */ \
        const type##16 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
        /* element i becomes the fold of 8 consecutive source elements */ \
        const type##16 val2 = val1 op vc4cl_vector_rotate(val1, -4); \
        /* fold of all 16 elements; NOTE(review): the extra "op val1" looks
           redundant (val2 already includes val1's fold) — harmless for the
           idempotent &/| this is used with, but confirm intent */ \
        return (val2 op val1 op vc4cl_vector_rotate(val2, -8)).x != 0; \
    } \
    INLINE FUNC_1(int, func, type##8, x) CONST \
    { \
        /* (s0, s1, ..., s7) */ \
        type##8 val0 = conv(x); \
        /* element i becomes s_i op s_(i+1) */ \
        val0 = val0 op vc4cl_vector_rotate(val0, -1); \
        /* element i becomes the fold of 4 consecutive source elements */ \
        const type##8 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
        /* fold of all 8 elements */ \
        return (val1 op vc4cl_vector_rotate(val1, -4)).x != 0; \
    } \
    INLINE FUNC_1(int, func, type##4, x) CONST \
    { \
        /* (x, y, z, w) */ \
        type##4 val0 = conv(x); \
        /* (x op y, y op z, z op w, w op ?) */ \
        val0 = val0 op vc4cl_vector_rotate(val0, -1); \
        /* (z op w, w op ?, ? op ?, ? op ?) */ \
        const type##4 val1 = vc4cl_vector_rotate(val0, -2); \
        /* (x op y op z op w, ...) */ \
        return (val0 op val1).x != 0; \
    } \
    INLINE FUNC_1(int, func, type##3, x) CONST \
    { \
        /* 3-element vector: fold directly, no rotation needed */ \
        type##3 val = conv(x); \
        return (val.x op val.y op val.z) != 0; \
    } \
    INLINE FUNC_1(int, func, type##2, x) CONST \
    { \
        type##2 val = conv(x); \
        return (val.x op val.y) != 0; \
    } \
    INLINE FUNC_1(int, func, type, x) CONST \
    { \
        /* scalar: just test the converted value */ \
        type val = conv(x); \
        return val != 0; \
    }
#endif
||||
|
||||
#ifndef SELECT_SCALAR
/*
 * Generates one scalar select(a, b, c) overload.
 * "content" is the selecting expression (evaluated with parameters a, b and c in scope).
 */
#define SELECT_SCALAR(type, maskType, content) \
	INLINE FUNC_3(type, select, type, a, type, b, maskType, c) CONST \
	{ \
		return content; \
	}
#endif
||||
|
||||
#ifndef SELECT_VECTOR
/*
 * Generates the vector select(a, b, c) overloads for all widths (2, 3, 4, 8, 16).
 * "content" must be a complete statement block (including the return); inside it,
 * int_t is typedef'ed to the int vector of the matching width, so the block can be
 * written once, width-agnostically.
 */
#define SELECT_VECTOR(type, maskType, content) \
	INLINE FUNC_3(type##2, select, type##2, a, type##2, b, maskType##2, c) CONST \
	{ \
		typedef int##2 int_t; \
		content \
	} \
	INLINE FUNC_3(type##3, select, type##3, a, type##3, b, maskType##3, c) CONST \
	{ \
		typedef int##3 int_t; \
		content \
	} \
	INLINE FUNC_3(type##4, select, type##4, a, type##4, b, maskType##4, c) CONST \
	{ \
		typedef int##4 int_t; \
		content \
	} \
	INLINE FUNC_3(type##8, select, type##8, a, type##8, b, maskType##8, c) CONST \
	{ \
		typedef int##8 int_t; \
		content \
	} \
	INLINE FUNC_3(type##16, select, type##16, a, type##16, b, maskType##16, c) CONST \
	{ \
		typedef int##16 int_t; \
		content \
	}
#endif
||||
|
||||
/*
 * The checks for NaNs as defined in the specification are done in the intrinsic of the comparison operators:
 *
 * "The relational functions isequal, isgreater, isgreaterequal, isless, islessequal, and islessgreater
 * always return 0 if either argument is not a number (NaN). isnotequal returns 1 if one or both
 * arguments are not a number (NaN) and the argument type is a scalar [...]"
 * - OpenCL 1.2, section 6.12.6 Relational Functions
 */
COMPARISON_2(isequal, x == y)
COMPARISON_2(isnotequal, x != y)
COMPARISON_2(isgreater, x > y)
COMPARISON_2(isgreaterequal, x >= y)
COMPARISON_2(isless, x < y)
COMPARISON_2(islessequal, x <= y)
/* expressed as two ordered comparisons (instead of !=), so NaN operands yield false in both */
COMPARISON_2(islessgreater, (x < y) || (x > y))
||||
|
||||
// From <cmath>: "A finite value is any floating-point value that is neither infinite nor NaN (Not-A-Number)."
COMPARISON_1(isfinite, !vc4cl_is_inf_nan(val))
/* NOTE(review): this uses NAN and INF as integer bit patterns on the raw float bits - presumably project-defined masks, verify against _config.h */
COMPARISON_1(isinf, (vc4cl_bitcast_uint(val) & NAN) == INF)
COMPARISON_1(isnan, vc4cl_is_nan(val))
// From <cmath>: "Returns whether x is a normal value: i.e., whether it is neither infinity, NaN, zero or subnormal."
COMPARISON_1(isnormal, !isinf(val) && !isnan(val) && ((vc4cl_bitcast_uint(val) & 0x7F800000) != 0) /* neither zero nor denormal */)
/* ordered: both operands compare equal to themselves, i.e. neither is NaN */
COMPARISON_2(isordered, isequal(x, x) && isequal(y, y))
COMPARISON_2(isunordered, isnan(x) || isnan(y))
||||
|
||||
// for vector, directly use asr, for scalar shr. This is way more efficient than everything else (1 instruction)
// (OpenCL 1.2 section 6.12.6: vector relational functions return -1 per true component, the scalar version returns 1)
INLINE FUNC_1(int16, signbit, float16, val) CONST
{
	/* arithmetic shift replicates the sign bit into all 32 bits -> 0 or -1 per element */
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int8, signbit, float8, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int4, signbit, float4, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int3, signbit, float3, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int2, signbit, float2, val) CONST
{
	return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
}
INLINE FUNC_1(int, signbit, float, val) CONST
{
	/* logical shift of the unsigned bits -> 0 or 1, as required for the scalar overload */
	return vc4cl_bitcast_uint(val) >> 31;
}
||||
|
||||
/* any(x): non-zero iff the most significant (sign) bit is set in any component (fold with |) */
FOR_ALL_ELEMENTS(any, char, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, short, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, int, |, vc4cl_msb_set)
FOR_ALL_ELEMENTS(any, long, |, vc4cl_msb_set)

/* all(x): non-zero iff the most significant (sign) bit is set in all components (fold with &) */
FOR_ALL_ELEMENTS(all, char, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, short, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, int, &, vc4cl_msb_set)
FOR_ALL_ELEMENTS(all, long, &, vc4cl_msb_set)
||||
|
||||
|
||||
//"Each bit of the result is the corresponding bit of a if the corresponding bit of c is 0.
// Otherwise it is the corresponding bit of b."
//based on pocl (pocl/lib/kernel/bitselect.cl)
SIMPLE_3(uchar, bitselect, uchar, a, uchar, b, uchar, c, (~c & a) | (c & b))
SIMPLE_3(char, bitselect, char, a, char, b, char, c, (~c & a) | (c & b))
SIMPLE_3(ushort, bitselect, ushort, a, ushort, b, ushort, c, (~c & a) | (c & b))
SIMPLE_3(short, bitselect, short, a, short, b, short, c, (~c & a) | (c & b))
SIMPLE_3(uint, bitselect, uint, a, uint, b, uint, c, (~c & a) | (c & b))
SIMPLE_3(int, bitselect, int, a, int, b, int, c, (~c & a) | (c & b))
SIMPLE_3(ulong, bitselect, ulong, a, ulong, b, ulong, c, (~c & a) | (c & b))
SIMPLE_3(long, bitselect, long, a, long, b, long, c, (~c & a) | (c & b))
/* floats have no bitwise operators, so the selection is done on the reinterpreted integer bits */
SIMPLE_3(float, bitselect, float, a, float, b, float, c, vc4cl_bitcast_float((~vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(a)) | (vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(b))))
||||
|
||||
//"For a scalar type, result = c ? b : a."
/* one instantiation per (value type, mask type) combination required by the OpenCL specification */
SELECT_SCALAR(uchar, uchar, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uchar, char, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(char, uchar, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(char, char, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ushort, ushort, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ushort, short, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(short, ushort, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(short, short, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uint, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(uint, int, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(int, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(int, int, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ulong, ulong, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(ulong, long, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(long, ulong, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(long, long, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(float, uint, vc4cl_extend(c) ? b : a)
SELECT_SCALAR(float, int, vc4cl_extend(c) ? b : a)
||||
|
||||
//"For each component of a vector type, result[i] = if MSB of c[i] is set ? b[i] : a[i]"
SELECT_VECTOR(uchar, uchar,
{
	/* move the char's sign bit (bit 7) into bit 31, then arithmetic-shift to replicate it -> per-element 0/-1 mask */
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
	/* blend on the extended 32-bit lanes: mask selects b, ~mask selects a */
	return vc4cl_bitcast_uchar(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(uchar, char,
{
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
	return vc4cl_bitcast_uchar(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(char, char,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(char, uchar,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ushort, ushort,
{
	/* sign bit of the short (bit 15) shifted into bit 31, then replicated via asr */
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
	return vc4cl_bitcast_ushort(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(ushort, short,
{
	int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
	return vc4cl_bitcast_ushort(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
})
SELECT_VECTOR(short, short,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(short, ushort,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(uint, uint,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(uint, int,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(int, int,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(int, uint,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ulong, ulong,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(ulong, long,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(long, long,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(long, ulong,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(float, uint,
{
	return vc4cl_msb_set(c) ? b : a;
})
SELECT_VECTOR(float, int,
{
	return vc4cl_msb_set(c) ? b : a;
})
||||
|
||||
#undef COMPARISON_1 |
||||
#undef COMPARISON_2 |
||||
#undef FOR_ALL_ELEMENTS |
||||
#undef SELECT_SCALAR |
||||
#undef SELECT_VECTOR |
||||
|
||||
#endif /* VC4CL_RELATIONAL_H */ |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_SYNCHRONIZATION_H |
||||
#define VC4CL_SYNCHRONIZATION_H |
||||
|
||||
#include "_config.h" |
||||
#include "_work_items.h" |
||||
|
||||
/*
 * OpenCL work-group barrier: delegates directly to the VC4CL intrinsic,
 * which receives the memory-fence flags (CLK_LOCAL_MEM_FENCE / CLK_GLOBAL_MEM_FENCE) unchanged.
 */
INLINE void barrier(cl_mem_fence_flags flags) OVERLOADABLE
{
	vc4cl_barrier(flags);
}
||||
|
||||
/*
|
||||
* We do not declare read_mem_fence() and write_mem_fence(), since: |
||||
* - The SPIRV-LLVM-Translator (in older versions, e.g. 7.0) can't handle them passing a non-const flags to the mem_fence() function |
||||
* - We anyway handle mem_fence(), read_mem_fence() and write_mem_fence() in both front-ends the exact same way |
||||
*/ |
||||
#endif /* VC4CL_SYNCHRONIZATION_H */ |
||||
|
@ -0,0 +1,265 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_VECTOR_H |
||||
#define VC4CL_VECTOR_H |
||||
|
||||
#include "_config.h" |
||||
#include "_overloads.h" |
||||
|
||||
#ifndef VECTOR_LOAD
/*
 * Generates the vloadN overloads (N = 2, 3, 4, 8, 16) for one element type in all
 * address spaces (__global, __local, __constant, __private).
 * Data is read starting at element (ptr + offset * N), per the OpenCL vloadN contract.
 * NOTE(review): vload3 goes through the vc4cl_vload3 helper instead of a pointer cast,
 * presumably because a 3-element vector has the size/alignment of 4 elements and a
 * reinterpret-cast load would read one element too many - verify against _intrinsics.
 */
#define VECTOR_LOAD(type) \
	INLINE type##2 vload2(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __global type * ptr) OVERLOADABLE \
	{ \
		return *((const __global type##16 *)(ptr + offset * 16)); \
	} \
	/* __local overloads */ \
	INLINE type##2 vload2(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __local type * ptr) OVERLOADABLE \
	{ \
		return *((const __local type##16 *)(ptr + offset * 16)); \
	} \
	/* __constant overloads */ \
	INLINE type##2 vload2(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __constant type * ptr) OVERLOADABLE \
	{ \
		return *((const __constant type##16 *)(ptr + offset * 16)); \
	} \
	/* __private overloads */ \
	INLINE type##2 vload2(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##2 *)(ptr + offset * 2)); \
	} \
	INLINE type##3 vload3(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return vc4cl_vload3(ptr + offset * 3); \
	} \
	INLINE type##4 vload4(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##4 *)(ptr + offset * 4)); \
	} \
	INLINE type##8 vload8(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##8 *)(ptr + offset * 8)); \
	} \
	INLINE type##16 vload16(size_t offset, const __private type * ptr) OVERLOADABLE \
	{ \
		return *((const __private type##16 *)(ptr + offset * 16)); \
	}
#endif
||||
|
||||
#ifndef VECTOR_STORE
/*
 * Generates the vstoreN overloads (N = 2, 3, 4, 8, 16) for one element type in the
 * writable address spaces (__global, __local, __private).
 * Data is written starting at element (ptr + offset * N), per the OpenCL vstoreN contract.
 * NOTE(review): vstore3 goes through vc4cl_vstore3 - presumably to avoid writing the
 * padding element a plain type##3 pointer store would touch; verify against _intrinsics.
 */
#define VECTOR_STORE(type) \
	INLINE void vstore2(type##2 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##2 *)(ptr + offset * 2)) = data; \
	} \
	INLINE void vstore3(type##3 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		vc4cl_vstore3(ptr + offset * 3, data); \
	} \
	INLINE void vstore4(type##4 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##4 *)(ptr + offset * 4)) = data; \
	} \
	INLINE void vstore8(type##8 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##8 *)(ptr + offset * 8)) = data; \
	} \
	INLINE void vstore16(type##16 data, size_t offset, __global type * ptr) OVERLOADABLE \
	{ \
		*((__global type##16 *)(ptr + offset * 16)) = data; \
	} \
	/* __local overloads */ \
	INLINE void vstore2(type##2 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##2 *)(ptr + offset * 2)) = data; \
	} \
	INLINE void vstore3(type##3 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		vc4cl_vstore3(ptr + offset * 3, data); \
	} \
	INLINE void vstore4(type##4 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##4 *)(ptr + offset * 4)) = data; \
	} \
	INLINE void vstore8(type##8 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##8 *)(ptr + offset * 8)) = data; \
	} \
	INLINE void vstore16(type##16 data, size_t offset, __local type * ptr) OVERLOADABLE \
	{ \
		*((__local type##16 *)(ptr + offset * 16)) = data; \
	} \
	/* __private overloads */ \
	INLINE void vstore2(type##2 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##2 *)(ptr + offset * 2)) = data; \
	} \
	INLINE void vstore3(type##3 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		vc4cl_vstore3(ptr + offset * 3, data); \
	} \
	INLINE void vstore4(type##4 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##4 *)(ptr + offset * 4)) = data; \
	} \
	INLINE void vstore8(type##8 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##8 *)(ptr + offset * 8)) = data; \
	} \
	INLINE void vstore16(type##16 data, size_t offset, __private type * ptr) OVERLOADABLE \
	{ \
		*((__private type##16 *)(ptr + offset * 16)) = data; \
	}
#endif
||||
|
||||
#ifndef VECTOR_SHUFFLE_2
/*
 * Generates shuffle2(x, y, mask) overloads for one input width "num" and all result
 * widths, via clang's __builtin_shufflevector.
 * NOTE(review): currently only referenced from the commented-out section below -
 * the builtin requires compile-time constant indices, which a mask parameter is not.
 */
#define VECTOR_SHUFFLE_2_INTERNAL(type, maskType, num) \
	INLINE type##2 shuffle2(type##num x, type##num y, maskType##2 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.x, mask.y); \
	} \
	INLINE type##4 shuffle2(type##num x, type##num y, maskType##4 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.x, mask.y, mask.z, mask.w); \
	} \
	INLINE type##8 shuffle2(type##num x, type##num y, maskType##8 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7); \
	} \
	INLINE type##16 shuffle2(type##num x, type##num y, maskType##16 mask) OVERLOADABLE \
	{ \
		return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7, mask.s8, mask.s9, mask.sa, mask.sb, mask.sc, mask.sd, mask.se, mask.sf); \
	} \

/* instantiates the above for all input widths of one (type, maskType) pair */
#define VECTOR_SHUFFLE_2(type, maskType) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 2) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 4) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 8) \
	VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 16)
#endif
||||
|
||||
#ifndef VECTOR_SHUFFLE
/*
 * Generates single-input shuffle(val, mask) overloads by forwarding to shuffle2
 * with the same vector for both inputs (see VECTOR_SHUFFLE_2 above; likewise only
 * used from the commented-out section below).
 */
#define VECTOR_SHUFFLE_INTERNAL(type, maskType, num) \
	INLINE type##2 shuffle(type##num val, maskType##2 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \
	INLINE type##4 shuffle(type##num val, maskType##4 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \
	INLINE type##8 shuffle(type##num val, maskType##8 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \
	INLINE type##16 shuffle(type##num val, maskType##16 mask) OVERLOADABLE \
	{ \
		return shuffle2(val, val, mask); \
	} \

/* instantiates the above for all input widths of one (type, maskType) pair */
#define VECTOR_SHUFFLE(type, maskType) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 2) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 4) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 8) \
	VECTOR_SHUFFLE_INTERNAL(type, maskType, 16)
#endif
||||
|
||||
/* instantiate the vloadN overloads for every supported element type */
VECTOR_LOAD(uchar)
VECTOR_LOAD(char)
VECTOR_LOAD(ushort)
VECTOR_LOAD(short)
VECTOR_LOAD(uint)
VECTOR_LOAD(int)
VECTOR_LOAD(float)
VECTOR_LOAD(ulong)
VECTOR_LOAD(long)

/* instantiate the vstoreN overloads for every supported element type */
VECTOR_STORE(uchar)
VECTOR_STORE(char)
VECTOR_STORE(ushort)
VECTOR_STORE(short)
VECTOR_STORE(uint)
VECTOR_STORE(int)
VECTOR_STORE(float)
VECTOR_STORE(ulong)
VECTOR_STORE(long)

//TODO vload(a)_half, vload(a)_halfn (+rounding) (load half and return converted to float, possible with unpack-modes)
//TODO vstore(a)_half, vstore(a)_halfn (+rounding) (store float as half in memory, possible with pack modes)
||||
|
||||
/*
|
||||
* TODO shuffle2, but LLVM fails, since the indices for the __builtin intrinsic need to be constant integers! |
||||
VECTOR_SHUFFLE_2(uchar, uchar) |
||||
VECTOR_SHUFFLE_2(char, uchar) |
||||
VECTOR_SHUFFLE_2(ushort, ushort) |
||||
VECTOR_SHUFFLE_2(short, ushort) |
||||
VECTOR_SHUFFLE_2(uint, uint) |
||||
VECTOR_SHUFFLE_2(int, uint) |
||||
VECTOR_SHUFFLE_2(float, uint) |
||||
|
||||
VECTOR_SHUFFLE(uchar, uchar) |
||||
VECTOR_SHUFFLE(char, uchar) |
||||
VECTOR_SHUFFLE(ushort, ushort) |
||||
VECTOR_SHUFFLE(short, ushort) |
||||
VECTOR_SHUFFLE(uint, uint) |
||||
VECTOR_SHUFFLE(int, uint) |
||||
VECTOR_SHUFFLE(float, uint) |
||||
*/ |
||||
|
||||
//shuffle/shuffle2 are handled via intrinsifying the OpenCL function
|
||||
|
||||
#undef VECTOR_LOAD |
||||
#undef VECTOR_STORE |
||||
#undef VECTOR_SHUFFLE_2_INTERNAL |
||||
#undef VECTOR_SHUFFLE_2 |
||||
#undef VECTOR_SHUFFLE_INTERNAL |
||||
#undef VECTOR_SHUFFLE |
||||
|
||||
#endif /* VC4CL_VECTOR_H */ |
||||
|
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_WORK_ITEMS_H |
||||
#define VC4CL_WORK_ITEMS_H |
||||
|
||||
#include "_intrinsics.h" |
||||
#include "_overloads.h" |
||||
|
||||
/* number of dimensions the kernel was enqueued with */
INLINE uint get_work_dim(void) OVERLOADABLE CONST
{
	return vc4cl_work_dimensions();
}

/* total number of global work-items in the given dimension */
INLINE size_t get_global_size(uint dim) OVERLOADABLE CONST
{
	return vc4cl_global_size(dim);
}

/* unique global index of this work-item in the given dimension */
INLINE size_t get_global_id(uint dim) OVERLOADABLE CONST
{
	return vc4cl_global_id(dim);
}

/* number of work-items per work-group in the given dimension */
INLINE size_t get_local_size(uint dim) OVERLOADABLE CONST
{
	return vc4cl_local_size(dim);
}

INLINE size_t get_enqueued_local_size(uint dimindx) OVERLOADABLE CONST
{
	// "Returns the same value as that returned by get_local_size(dimindx) if the kernel is executed with a uniform
	// work-group size."
	return vc4cl_local_size(dimindx);
}

/* index of this work-item within its work-group in the given dimension */
INLINE size_t get_local_id(uint dim) OVERLOADABLE CONST
{
	return vc4cl_local_id(dim);
}

/* number of work-groups in the given dimension */
INLINE size_t get_num_groups(uint dim) OVERLOADABLE CONST
{
	return vc4cl_num_groups(dim);
}

/* index of this work-item's work-group in the given dimension */
INLINE size_t get_group_id(uint dim) OVERLOADABLE CONST
{
	return vc4cl_group_id(dim);
}

/* global offset the kernel was enqueued with in the given dimension */
INLINE size_t get_global_offset(uint dim) OVERLOADABLE CONST
{
	return vc4cl_global_offset(dim);
}

/* work-item's global index linearized over all dimensions */
INLINE size_t get_global_linear_id() OVERLOADABLE CONST
{
	return vc4cl_global_linear_id();
}

/* work-item's index within its work-group, linearized over all dimensions */
INLINE size_t get_local_linear_id() OVERLOADABLE CONST
{
	return vc4cl_local_linear_id();
}
||||
|
||||
#endif /* VC4CL_WORK_ITEMS_H */ |
@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Author: doe300 |
||||
* |
||||
* See the file "LICENSE" for the full license governing this code. |
||||
*/ |
||||
|
||||
#ifndef VC4CL_DEFINES_H
#define VC4CL_DEFINES_H

/* OpenCL version constants (match the values of the host-side CL_VERSION_x_y macros) */
#ifndef CL_VERSION_1_0
#define CL_VERSION_1_0 100
#endif
#ifndef CL_VERSION_1_1
#define CL_VERSION_1_1 110
#endif
#ifndef CL_VERSION_1_2
#define CL_VERSION_1_2 120
#endif
#ifndef CL_VERSION_2_0
#define CL_VERSION_2_0 200
#endif
#ifndef CL_VERSION_2_1
#define CL_VERSION_2_1 210
#endif
#ifndef CL_VERSION_2_2
#define CL_VERSION_2_2 220
#endif

/* this implementation provides an OpenCL 1.2 embedded-profile, little-endian device */
#undef __OPENCL_VERSION__
#define __OPENCL_VERSION__ CL_VERSION_1_2
#undef __OPENCL_C_VERSION__
#define __OPENCL_C_VERSION__ CL_VERSION_1_2
#ifndef __ENDIAN_LITTLE__
#define __ENDIAN_LITTLE__ 1
#endif
#ifndef __EMBEDDED_PROFILE__
#define __EMBEDDED_PROFILE__ 1
#endif
//#ifndef __IMAGE_SUPPORT__
//#define __IMAGE_SUPPORT__ 1
//#endif
#undef __IMAGE_SUPPORT__

/* supported KHR extensions */
#ifndef cl_khr_global_int32_base_atomics
#define cl_khr_global_int32_base_atomics
#endif
#ifndef cl_khr_local_int32_base_atomics
#define cl_khr_local_int32_base_atomics
#endif
#ifndef cl_khr_global_int32_extended_atomics
#define cl_khr_global_int32_extended_atomics
#endif
#ifndef cl_khr_local_int32_extended_atomics
#define cl_khr_local_int32_extended_atomics
#endif
#ifndef cl_khr_byte_addressable_store
#define cl_khr_byte_addressable_store
#endif
#ifndef cl_khr_initialize_memory
#define cl_khr_initialize_memory
#endif

/* image-related extensions only make sense when image support is enabled above */
#ifdef __IMAGE_SUPPORT__
#ifndef cl_khr_3d_image_writes
#define cl_khr_3d_image_writes
#endif
#ifndef cl_intel_packed_yuv
#define cl_intel_packed_yuv
#endif
#else
#undef cl_khr_3d_image_writes
#undef cl_intel_packed_yuv
#endif

// additional supported extensions (need to set flag here, since the module is loaded too late)
#define cl_nv_pragma_unroll 1
#define cl_arm_core_id 1
#define cl_ext_atomic_counters_32 1
#define cl_arm_integer_dot_product_int8 1
#define cl_arm_integer_dot_product_accumulate_int8 1
#define cl_arm_integer_dot_product_accumulate_int16 1
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1

// unsupported extensions or optional core features
#undef cl_khr_fp16
#undef cl_khr_fp64
#undef cl_khr_int64_base_atomics
#undef cl_khr_int64_extended_atomics
#undef cl_khr_depth_images
#undef cl_khr_gl_depth_images
#undef cl_khr_gl_msaa_sharing
#undef cl_amd_media_ops
#undef cl_amd_media_ops2
// unsupported host-only extensions (disable for safety)
#undef cl_khr_gl_sharing
#undef cl_khr_gl_event
#undef cl_khr_d3d10_sharing
#undef cl_khr_dx9_media_sharing
#undef cl_khr_d3d11_sharing
#undef cl_khr_image2d_from_buffer
#undef cl_khr_terminate_context
#undef cl_khr_egl_image
#undef cl_khr_egl_event

#endif /* VC4CL_DEFINES_H */
Loading…
Reference in new issue