forked from Qortal/Brooklyn
174 lines
5.0 KiB
C
174 lines
5.0 KiB
C
|
/*
 * Author: doe300
 *
 * See the file "LICENSE" for the full license governing this code.
 */
|
||
|
|
||
|
#ifndef VC4CL_EXTENSIONS_H
|
||
|
#define VC4CL_EXTENSIONS_H
|
||
|
|
||
|
#include "_config.h"
|
||
|
#include "_overloads.h"
|
||
|
#include "_intrinsics.h"
|
||
|
|
||
|
|
||
|
/*
 * Loop unroll pragma extension
 *
 * Defines "#pragma unroll <factor>"
 *
 * Clang supports this natively, so we do not need to do anything
 *
 * See https://www.khronos.org/registry/OpenCL/extensions/nv/cl_nv_pragma_unroll.txt
 * See https://clang.llvm.org/docs/AttributeReference.html#pragma-unroll-pragma-nounroll
 */
|
||
|
/* Advertise cl_nv_pragma_unroll unless the macro has already been defined. */
#if !defined(cl_nv_pragma_unroll)
#define cl_nv_pragma_unroll 1
#endif
|
||
|
|
||
|
/*
 * ARM core-ID extension
 *
 * Adds function
 *   uint arm_get_core_id( void )
 * which returns the ID of the OpenCL compute unit, which is always zero
 *
 * See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_get_core_id.txt
 */
|
||
|
#ifndef cl_arm_core_id
|
||
|
#define cl_arm_core_id 1
|
||
|
#endif
|
||
|
uint arm_get_core_id(void); //prototype, prevents warning
|
||
|
uint arm_get_core_id(void)
|
||
|
{
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/*
 * 32-bit atomic counters
 *
 * Adds type
 *   counter32_t
 * which is a 32-bit type for atomic counters. counter32_t can only be passed as a kernel parameter and cannot be read/assigned.
 *
 * Adds functions
 *   uint atomic_inc(counter32_t counter)
 *   uint atomic_dec(counter32_t counter)
 * which increment/decrement the given counter32_t value atomically.
 *
 * NOTE: Since the syntax/semantics are exactly the same as for the uint version of the standard atomic_inc/atomic_dec functions, counter32_t is defined as a typedef for a uint pointer.
 *
 * See https://www.khronos.org/registry/OpenCL/extensions/ext/cl_ext_atomic_counters_32.txt
 */
|
||
|
#ifndef cl_ext_atomic_counters_32
|
||
|
#define cl_ext_atomic_counters_32 1
|
||
|
#endif
|
||
|
typedef volatile __global uint* counter32_t;
|
||
|
//just the prototypes, the implementations reside in _atomics.h
|
||
|
uint atomic_inc(counter32_t counter) OVERLOADABLE;
|
||
|
uint atomic_dec(counter32_t counter) OVERLOADABLE;
|
||
|
|
||
|
/*
 * Integer dot products
 *
 * Adds functions
 *   int arm_dot(char4 a, char4 b)
 *   uint arm_dot(uchar4 a, uchar4 b)
 *   int arm_dot_acc(char4 a, char4 b, int acc)
 *   uint arm_dot_acc(uchar4 a, uchar4 b, uint acc)
 *   int arm_dot_acc(short2 a, short2 b, int acc)
 *   uint arm_dot_acc(ushort2 a, ushort2 b, uint acc)
 *   int arm_dot_acc_sat(char4 a, char4 b, int acc)
 *   uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc)
 * which calculate the integer dot product (the xxx_acc variants additionally add the scalar value acc).
 * For the functions xxx_sat, the final addition is saturating.
 *
 * See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_integer_dot_product.txt
 */
|
||
|
#ifndef cl_arm_integer_dot_product_int8
|
||
|
#define cl_arm_integer_dot_product_int8 1
|
||
|
#endif
|
||
|
#ifndef cl_arm_integer_dot_product_accumulate_int8
|
||
|
#define cl_arm_integer_dot_product_accumulate_int8 1
|
||
|
#endif
|
||
|
#ifndef cl_arm_integer_dot_product_accumulate_int16
|
||
|
#define cl_arm_integer_dot_product_accumulate_int16 1
|
||
|
#endif
|
||
|
#ifndef cl_arm_integer_dot_product_accumulate_saturate_int8
|
||
|
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1
|
||
|
#endif
|
||
|
|
||
|
// prototypes to prevent warnings
|
||
|
int arm_dot(char4 a, char4 b) OVERLOADABLE;
|
||
|
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE;
|
||
|
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE;
|
||
|
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE;
|
||
|
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE;
|
||
|
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE;
|
||
|
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE;
|
||
|
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE;
|
||
|
|
||
|
/**
|
||
|
* (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w)
|
||
|
*/
|
||
|
int arm_dot(char4 a, char4 b) OVERLOADABLE CONST
|
||
|
{
|
||
|
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||
|
return tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
|
||
|
}
|
||
|
/**
 * Dot product of two unsigned 4-component vectors:
 *
 *   (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w)
 *
 * The operands are passed to vc4cl_mul24() with VC4CL_UNSIGNED and the four
 * components of its result are summed in x, y, z, w order.
 */
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    uint sum = products.x;
    sum += products.y;
    sum += products.z;
    sum += products.w;
    return sum;
}
|
||
|
|
||
|
/**
|
||
|
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
|
||
|
*/
|
||
|
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE CONST
|
||
|
{
|
||
|
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||
|
return acc + tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
|
||
|
}
|
||
|
|
||
|
/**
 * Accumulating dot product of two unsigned 4-component vectors:
 *
 *   acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
 *
 * The components returned by vc4cl_mul24() (VC4CL_UNSIGNED) are added onto acc
 * one at a time, in x, y, z, w order.
 */
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    uint result = acc;
    result += products.x;
    result += products.y;
    result += products.z;
    result += products.w;
    return result;
}
|
||
|
|
||
|
/**
|
||
|
* acc + [ (a.x * b.x) + (a.y * b.y) ]
|
||
|
*/
|
||
|
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE CONST
|
||
|
{
|
||
|
int2 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||
|
return acc + tmp.s0 + tmp.s1;
|
||
|
}
|
||
|
|
||
|
/**
 * Accumulating dot product of two unsigned 2-component vectors:
 *
 *   acc + [ (a.x * b.x) + (a.y * b.y) ]
 *
 * The components returned by vc4cl_mul24() (VC4CL_UNSIGNED) are added onto acc
 * one at a time, x first, then y.
 */
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE CONST
{
    const uint2 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    uint result = acc;
    result += products.x;
    result += products.y;
    return result;
}
|
||
|
|
||
|
/**
|
||
|
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
|
||
|
*
|
||
|
* The final accumulation is saturating.
|
||
|
*/
|
||
|
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE CONST
|
||
|
{
|
||
|
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||
|
return add_sat(acc, tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3);
|
||
|
}
|
||
|
|
||
|
/**
 * Saturating accumulating dot product of two unsigned 4-component vectors:
 *
 *   acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
 *
 * The bracketed sum is computed first from the vc4cl_mul24() (VC4CL_UNSIGNED)
 * result; only the final addition of acc goes through add_sat().
 */
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    const uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    const uint dot = products.x + products.y + products.z + products.w;
    return add_sat(acc, dot);
}
|
||
|
|
||
|
#endif /* VC4CL_EXTENSIONS_H */
|
||
|
|