/*
 * Author: doe300
 *
 * See the file "LICENSE" for the full license governing this code.
 */

#ifndef VC4CL_ASYNC_H
#define VC4CL_ASYNC_H

#include "_config.h"
#include "_overloads.h"


/*
 * This is a synchronous/blocking implementation.
 * The copy is "performed by all work-items in a work-group", so any work-item only has to copy a part of the area.
 * Or, since the copying of memory on different QPUs block each other, we can simply only execute the copying on the
 * first work-item (local index 0, 0, 0). Idea taken from PoCL.
 *
 * The guard tests all three dimensions of the local id, so that exactly one work-item of a multi-dimensional
 * work-group issues the copy. Testing dimension 0 only would let every work-item with local id (0, y, z) repeat it.
 *
 * Expects "dst", "src" and "num_elements" to be in scope at the expansion site.
 * The copy itself is enclosed by vc4cl_mutex_lock()/vc4cl_mutex_unlock().
 * Since only one work-item copies, the remaining work-items have to synchronize (see the barrier in
 * wait_group_events()) before they access the copied data.
 */

#define ASYNC_COPY_INTERNAL \
		if(vc4cl_local_id(0) == 0 && vc4cl_local_id(1) == 0 && vc4cl_local_id(2) == 0) \
		{ \
			vc4cl_mutex_lock(); \
			vc4cl_dma_copy(dst, src, num_elements); \
			vc4cl_mutex_unlock(); \
		}

/*
 * Generates the async_work_group_copy overloads for the scalar type "type" and for its vector variants with
 * 2, 3, 4, 8 and 16 elements, one overload per copy direction (__global -> __local and __local -> __global).
 *
 * Every overload expands ASYNC_COPY_INTERNAL for the actual copy and returns the result of vc4cl_set_event(event).
 * The overloads are grouped by vector width, the __global -> __local direction comes first.
 */
#define ASYNC_COPY(type) \
	INLINE event_t async_work_group_copy(__local type *dst, const __global type *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__global type *dst, const __local type *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__local type##2 *dst, const __global type##2 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__global type##2 *dst, const __local type##2 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__local type##3 *dst, const __global type##3 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__global type##3 *dst, const __local type##3 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__local type##4 *dst, const __global type##4 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__global type##4 *dst, const __local type##4 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__local type##8 *dst, const __global type##8 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__global type##8 *dst, const __local type##8 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__local type##16 *dst, const __global type##16 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_copy(__global type##16 *dst, const __local type##16 *src, size_t num_elements, event_t event) OVERLOADABLE \
	{ \
		ASYNC_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	}

/*
 * Gathers num_elements elements from "src" (reading every src_stride-th element) into the densely packed "dst".
 *
 * Expects "dst", "src", "num_elements" and "src_stride" to be in scope at the expansion site.
 * Like the non-strided copy, the loop is only executed by the first work-item (local index 0, 0, 0) instead of
 * being repeated by every work-item of the work-group. The remaining work-items have to synchronize
 * (see the barrier in wait_group_events()) before they access the copied data.
 */
#define ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		if(vc4cl_local_id(0) == 0 && vc4cl_local_id(1) == 0 && vc4cl_local_id(2) == 0) \
		{ \
			for (size_t i = 0; i < num_elements; ++i) \
			{ \
				dst[i] = src[i * src_stride]; \
			} \
		}
//TODO better way, e.g. via vc4cl_dma_copy and stride-parameter?

/*
 * Scatters num_elements densely packed elements from "src" into "dst" (writing every dst_stride-th element).
 *
 * Expects "dst", "src", "num_elements" and "dst_stride" to be in scope at the expansion site.
 * Like the non-strided copy, the loop is only executed by the first work-item (local index 0, 0, 0) instead of
 * being repeated by every work-item of the work-group. The remaining work-items have to synchronize
 * (see the barrier in wait_group_events()) before they access the copied data.
 */
#define ASYNC_STRIDED_DEST_COPY_INTERNAL \
		if(vc4cl_local_id(0) == 0 && vc4cl_local_id(1) == 0 && vc4cl_local_id(2) == 0) \
		{ \
			for (size_t i = 0; i < num_elements; ++i) \
			{ \
				dst[i * dst_stride] = src[i]; \
			} \
		}

/*
 * Generates the async_work_group_strided_copy overloads for the scalar type "type" and for its vector variants
 * with 2, 3, 4, 8 and 16 elements.
 *
 * __global -> __local overloads take a src_stride and expand ASYNC_STRIDED_SOURCE_COPY_INTERNAL,
 * __local -> __global overloads take a dst_stride and expand ASYNC_STRIDED_DEST_COPY_INTERNAL.
 * Every overload returns the result of vc4cl_set_event(event).
 * The overloads are grouped by vector width, the __global -> __local direction comes first.
 */
#define ASYNC_STRIDED_COPY(type) \
	INLINE event_t async_work_group_strided_copy(__local type *dst, const __global type *src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__global type *dst, const __local type *src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_DEST_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__local type##2 *dst, const __global type##2 *src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__global type##2 *dst, const __local type##2 *src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_DEST_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__local type##3 *dst, const __global type##3 *src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__global type##3 *dst, const __local type##3 *src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_DEST_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__local type##4 *dst, const __global type##4 *src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__global type##4 *dst, const __local type##4 *src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_DEST_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__local type##8 *dst, const __global type##8 *src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__global type##8 *dst, const __local type##8 *src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_DEST_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__local type##16 *dst, const __global type##16 *src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	} \
	INLINE event_t async_work_group_strided_copy(__global type##16 *dst, const __local type##16 *src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
	{ \
		ASYNC_STRIDED_DEST_COPY_INTERNAL \
		return vc4cl_set_event(event); \
	}

/*
 * Generates the prefetch overloads for the scalar type "type" and for its vector variants with
 * 2, 3, 4, 8 and 16 elements.
 *
 * Every overload simply forwards its arguments to vc4cl_prefetch(ptr, num_entries).
 */
#define PREFETCH(type) \
	INLINE void prefetch(const __global type *ptr, size_t num_entries) OVERLOADABLE \
	{ \
		vc4cl_prefetch(ptr, num_entries); \
	} \
	INLINE void prefetch(const __global type##2 *ptr, size_t num_entries) OVERLOADABLE \
	{ \
		vc4cl_prefetch(ptr, num_entries); \
	} \
	INLINE void prefetch(const __global type##3 *ptr, size_t num_entries) OVERLOADABLE \
	{ \
		vc4cl_prefetch(ptr, num_entries); \
	} \
	INLINE void prefetch(const __global type##4 *ptr, size_t num_entries) OVERLOADABLE \
	{ \
		vc4cl_prefetch(ptr, num_entries); \
	} \
	INLINE void prefetch(const __global type##8 *ptr, size_t num_entries) OVERLOADABLE \
	{ \
		vc4cl_prefetch(ptr, num_entries); \
	} \
	INLINE void prefetch(const __global type##16 *ptr, size_t num_entries) OVERLOADABLE \
	{ \
		vc4cl_prefetch(ptr, num_entries); \
	}

/*
 * OpenCL 1.2, page 278:
 * "Perform an async copy of num_gentypes gentype elements from src to dst.
 *  The async copy is performed by all work-items in a work-group and this built-in
 *  function must therefore be encountered by all work-items in a work-group executing the kernel with the same argument values."
 */
ASYNC_COPY(uchar)
ASYNC_COPY(char)
ASYNC_COPY(ushort)
ASYNC_COPY(short)
ASYNC_COPY(uint)
ASYNC_COPY(int)
ASYNC_COPY(float)

ASYNC_STRIDED_COPY(uchar)
ASYNC_STRIDED_COPY(char)
ASYNC_STRIDED_COPY(ushort)
ASYNC_STRIDED_COPY(short)
ASYNC_STRIDED_COPY(uint)
ASYNC_STRIDED_COPY(int)
ASYNC_STRIDED_COPY(float)

/*
 * OpenCL 1.2, page 279:
 * "Wait for events that identify the async_work_group_copy operations to complete.
 *  The event objects specified in event_list will be released after the wait is performed."
 *
 * num_events and event_list are not inspected, the function only synchronizes the work-group.
 */
INLINE void wait_group_events(int num_events, event_t* event_list) OVERLOADABLE
{
	// the arguments are intentionally unused
	(void) num_events;
	(void) event_list;
	// async_work_group_copy is blocking, so we don't need to wait for any asynchronous operation to finish
	// But: Since the copy is only performed on the first work-item, all work-items need to wait for it to finish.
	// The copies write to __local as well as to __global destinations, so both address spaces need to be fenced
	// for the copied data to be visible to the whole work-group.
	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
}

/*
 * OpenCL 1.2, page 280:
 * "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache.
 * The prefetch instruction is applied to a work-item in a work-group and does not affect the functional behavior of the kernel."
 *
 * -> Since it doesn't affect the functional behavior, the implementation is allowed to be a no-op.
 *    The overloads instantiated here forward to vc4cl_prefetch, which decides what actually happens.
 */
PREFETCH(char)
PREFETCH(uchar)
PREFETCH(short)
PREFETCH(ushort)
PREFETCH(int)
PREFETCH(uint)
PREFETCH(float)

/* The generator macros are only required within this header, so remove them again (generators first, then their helpers) */
#undef PREFETCH
#undef ASYNC_STRIDED_COPY
#undef ASYNC_COPY
#undef ASYNC_STRIDED_DEST_COPY_INTERNAL
#undef ASYNC_STRIDED_SOURCE_COPY_INTERNAL
#undef ASYNC_COPY_INTERNAL

#endif /* VC4CL_ASYNC_H */