/* Forked from Qortal/Brooklyn. */
/*
 * Author: doe300
 *
 * See the file "LICENSE" for the full license governing this code.
 */
|
|
|
|
#ifndef VC4CL_ASYNC_H
|
|
#define VC4CL_ASYNC_H
|
|
|
|
#include "_config.h"
|
|
#include "_overloads.h"
|
|
|
|
|
|
/*
 * This is a synchronous/blocking implementation.
 * The copy is "performed by all work-items in a work-group", so any work-item only has to copy a part of the area.
 * Or, since copies of memory issued from different QPUs block each other, we can simply execute the copy only on the
 * first work-item (index 0, 0, 0). Idea taken from PoCL.
 */
|
|
|
|
/*
 * Statement shared by every async_work_group_copy overload.
 *
 * Expects dst, src and num_elements to be in scope where it is expanded.
 * Only the work-item whose local ID is 0 in all three dimensions issues the copy; the call to vc4cl_dma_copy is
 * bracketed by vc4cl_mutex_lock()/vc4cl_mutex_unlock(). Checking only dimension 0 would let every work-item with
 * x == 0 of a multi-dimensional work-group repeat the same copy.
 *
 * Expands to a complete if-statement, so it is used without a trailing semicolon.
 */
#define ASYNC_COPY_INTERNAL \
    if(vc4cl_local_id(0) == 0 && vc4cl_local_id(1) == 0 && vc4cl_local_id(2) == 0) \
    { \
        vc4cl_mutex_lock(); \
        vc4cl_dma_copy(dst, src, num_elements); \
        vc4cl_mutex_unlock(); \
    }
|
|
|
|
/*
 * Defines the async_work_group_copy overloads for one scalar element type.
 *
 * For the given type, this emits one overload per vector width (scalar, 2, 3, 4, 8, 16) and per direction
 * (__global -> __local and __local -> __global), i.e. twelve functions in total.
 * Every overload runs ASYNC_COPY_INTERNAL on its dst/src/num_elements arguments and returns the result of
 * vc4cl_set_event(event).
 */
#define ASYNC_COPY(type) \
    /* __global -> __local */ \
    INLINE event_t async_work_group_copy(__local type * dst, const __global type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    /* __local -> __global */ \
    INLINE event_t async_work_group_copy(__global type * dst, const __local type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
|
|
|
|
/*
 * Statement shared by the async_work_group_strided_copy overloads that take a src_stride.
 *
 * Expects dst, src, num_elements and src_stride to be in scope where it is expanded.
 * Gathers num_elements elements: element i of dst is read from index i * src_stride of src.
 * There is no work-item check here, so every work-item that reaches the expansion runs the whole loop.
 *
 * Expands to a complete (braced) for-statement, so it is used without a trailing semicolon.
 *
 * TODO better way, e.g. via vc4cl_dma_copy and stride-parameter?
 */
#define ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
    for(size_t i = 0; i < num_elements; ++i) \
    { \
        dst[i] = src[i * src_stride]; \
    }
|
|
|
|
/*
 * Statement shared by the async_work_group_strided_copy overloads that take a dst_stride.
 *
 * Expects dst, src, num_elements and dst_stride to be in scope where it is expanded.
 * Scatters num_elements elements: element i of src is written to index i * dst_stride of dst.
 * There is no work-item check here, so every work-item that reaches the expansion runs the whole loop.
 *
 * Expands to a complete (braced) for-statement, so it is used without a trailing semicolon.
 */
#define ASYNC_STRIDED_DEST_COPY_INTERNAL \
    for(size_t i = 0; i < num_elements; ++i) \
    { \
        dst[i * dst_stride] = src[i]; \
    }
|
|
|
|
/*
 * Defines the async_work_group_strided_copy overloads for one scalar element type.
 *
 * For the given type, this emits one overload per vector width (scalar, 2, 3, 4, 8, 16) and per direction:
 *  - __global -> __local: the stride applies to the source (ASYNC_STRIDED_SOURCE_COPY_INTERNAL)
 *  - __local -> __global: the stride applies to the destination (ASYNC_STRIDED_DEST_COPY_INTERNAL)
 * Every overload returns the result of vc4cl_set_event(event).
 */
#define ASYNC_STRIDED_COPY(type) \
    /* __global -> __local, strided source */ \
    INLINE event_t async_work_group_strided_copy(__local type * dst, const __global type * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    /* __local -> __global, strided destination */ \
    INLINE event_t async_work_group_strided_copy(__global type * dst, const __local type * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
|
|
|
|
/*
 * Defines the prefetch overloads for one scalar element type.
 *
 * For the given type, this emits one overload per vector width (scalar, 2, 3, 4, 8, 16).
 * Every overload forwards its ptr and num_entries arguments unchanged to vc4cl_prefetch and returns nothing.
 */
#define PREFETCH(type) \
    INLINE void prefetch(const __global type * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##2 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##3 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##4 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##8 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##16 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    }
|
|
|
|
/*
 * OpenCL 1.2, page 278:
 * "Perform an async copy of num_gentypes gentype elements from src to dst.
 * The async copy is performed by all work-items in a work-group and this built-in
 * function must therefore be encountered by all work-items in a work-group executing the kernel with the same argument values."
 */
|
|
ASYNC_COPY(uchar)
|
|
ASYNC_COPY(char)
|
|
ASYNC_COPY(ushort)
|
|
ASYNC_COPY(short)
|
|
ASYNC_COPY(uint)
|
|
ASYNC_COPY(int)
|
|
ASYNC_COPY(float)
|
|
|
|
ASYNC_STRIDED_COPY(uchar)
|
|
ASYNC_STRIDED_COPY(char)
|
|
ASYNC_STRIDED_COPY(ushort)
|
|
ASYNC_STRIDED_COPY(short)
|
|
ASYNC_STRIDED_COPY(uint)
|
|
ASYNC_STRIDED_COPY(int)
|
|
ASYNC_STRIDED_COPY(float)
|
|
|
|
/*
 * OpenCL 1.2, page 279:
 * "Wait for events that identify the async_work_group_copy operations to complete.
 * The event objects specified in event_list will be released after the wait is performed."
 */
|
|
/*
 * Waits for the copies started by async_work_group_copy / async_work_group_strided_copy.
 *
 * num_events and event_list are accepted for interface compatibility but are not inspected:
 * the wait is a single work-group barrier, independent of which events are passed.
 */
INLINE void wait_group_events(int num_events, event_t* event_list) OVERLOADABLE
{
    // async_work_group_copy is blocking, so we don't need to wait for any asynchronous operation to finish
    // But: Since the copy is only performed on the first work-item, we need to wait for it to finish
    // The destination of a copy can be __local as well as __global memory, so both address spaces are fenced
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
}
|
|
|
|
/*
 * OpenCL 1.2, page 280:
 * "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache.
 * The prefetch instruction is applied to a work-item in a work-group and does not affect the functional behavior of the kernel."
 *
 * -> Since it doesn't affect the functional behavior, the implementation is allowed to be a no-op.
 *    The overloads generated here simply forward to vc4cl_prefetch.
 */
|
|
PREFETCH(uchar)
|
|
PREFETCH(char)
|
|
PREFETCH(ushort)
|
|
PREFETCH(short)
|
|
PREFETCH(uint)
|
|
PREFETCH(int)
|
|
PREFETCH(float)
|
|
|
|
// The generator macros are only needed inside this header, so remove them again
#undef ASYNC_COPY_INTERNAL
#undef ASYNC_COPY
#undef ASYNC_STRIDED_SOURCE_COPY_INTERNAL
#undef ASYNC_STRIDED_DEST_COPY_INTERNAL
#undef ASYNC_STRIDED_COPY
#undef PREFETCH
|
|
|
|
#endif /* VC4CL_ASYNC_H */
|
|
|