diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl
index cc22595444..3b434f3095 100644
--- a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl
+++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl
@@ -18,7 +18,7 @@ namespace accessors
 #define NBL_CONCEPT_PARAM_0 (accessor, T)
 #define NBL_CONCEPT_PARAM_1 (val, V)
 #define NBL_CONCEPT_PARAM_2 (index, I)
-NBL_CONCEPT_BEGIN(3)
+
 #define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
 #define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
 #define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
diff --git a/include/nbl/builtin/hlsl/workgroup/spd.hlsl b/include/nbl/builtin/hlsl/workgroup/spd.hlsl
new file mode 100644
index 0000000000..62dd99189b
--- /dev/null
+++ b/include/nbl/builtin/hlsl/workgroup/spd.hlsl
@@ -0,0 +1,178 @@
+#include
+#include
+#include
+
+#ifndef _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_
+#define _NBL_BUILTIN_HLSL_WORKGROUP_SPD_INCLUDED_
+
+// ------------------------------- COMMON -----------------------------------------
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace workgroup
+{
+namespace spd
+{
+namespace impl
+{
+    // gather the four values of the invocation's 2x2 quad via subgroup quad swaps, then reduce them
+    template<typename Reducer>
+    float32_t4 subgroupQuadReduce(NBL_CONST_REF_ARG(Reducer) reducer, float32_t4 v)
+    {
+        const float32_t4 v0 = v;
+        const float32_t4 v1 = glsl::subgroupQuadSwapHorizontal(v);
+        const float32_t4 v2 = glsl::subgroupQuadSwapVertical(v);
+        const float32_t4 v3 = glsl::subgroupQuadSwapDiagonal(v);
+        return reducer.reduce(v0, v1, v2, v3);
+    }
+
+    // each invocation covers four locations of the workgroup's 64x64 source tile and writes them to mip 0;
+    // the subgroup quad-reduced results are then written to mip 1 and to shared memory
+    template<typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
+    void downsampleMips_0_1(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
+    {
+        float32_t4 v[4];
+
+        uint32_t x = coord.x;
+        uint32_t y = coord.y;
+
+        int32_t2 tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2, y * 2);
+        int32_t2 pix = int32_t2(workGroupID.xy * 32) + int32_t2(x, y);
+        v[0] = srcImage.reduce(tex, slice);
+        dstImage.set(pix, v[0], 0, slice);
+
+        tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2 + 32, y * 2);
+        pix = int32_t2(workGroupID.xy * 32) + int32_t2(x + 16, y);
+        v[1] = srcImage.reduce(tex, slice);
+        dstImage.set(pix, v[1], 0, slice);
+
+        tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2, y * 2 + 32);
+        pix = int32_t2(workGroupID.xy * 32) + int32_t2(x, y + 16);
+        v[2] = srcImage.reduce(tex, slice);
+        dstImage.set(pix, v[2], 0, slice);
+
+        tex = int32_t2(workGroupID.xy * 64) + int32_t2(x * 2 + 32, y * 2 + 32);
+        pix = int32_t2(workGroupID.xy * 32) + int32_t2(x + 16, y + 16);
+        v[3] = srcImage.reduce(tex, slice);
+        dstImage.set(pix, v[3], 0, slice);
+
+        if (mip <= 1)
+            return;
+
+        v[0] = subgroupQuadReduce(reducer, v[0]);
+        v[1] = subgroupQuadReduce(reducer, v[1]);
+        v[2] = subgroupQuadReduce(reducer, v[2]);
+        v[3] = subgroupQuadReduce(reducer, v[3]);
+
+        // quad index 0 stores the reduced results
+        if ((localInvocationIndex % 4) == 0)
+        {
+            dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2, y / 2), v[0], 1, slice);
+            sharedMem.set(int32_t2(x / 2, y / 2), v[0]);
+
+            dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2 + 8, y / 2), v[1], 1, slice);
+            sharedMem.set(int32_t2(x / 2 + 8, y / 2), v[1]);
+
+            dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2, y / 2 + 8), v[2], 1, slice);
+            sharedMem.set(int32_t2(x / 2, y / 2 + 8), v[2]);
+
+            dstImage.set(int32_t2(workGroupID.xy * 16) + int32_t2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
+            sharedMem.set(int32_t2(x / 2 + 8, y / 2 + 8), v[3]);
+        }
+    }
+
+    template<typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
+    void downsampleMip_2(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
+    {
+        float32_t4 v = sharedMem.get(coord);
+        v = subgroupQuadReduce(reducer, v);
+        // quad index 0 stores result
+        if (localInvocationIndex % 4 == 0)
+        {
+            dstImage.set(int32_t2(workGroupID.xy * 8) + int32_t2(coord.x / 2, coord.y / 2), v, mip, slice);
+
+            // store to LDS, try to reduce bank conflicts
+            // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+            // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+            // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
+            // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+            // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+            // ...
+            // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+            sharedMem.set(int32_t2(coord.x + (coord.y / 2) % 2, coord.y), v);
+        }
+    }
+
+    template<typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
+    void downsampleMip_3(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
+    {
+        if (localInvocationIndex < 64)
+        {
+            const uint32_t x = coord.x;
+            const uint32_t y = coord.y;
+
+            float32_t4 v = sharedMem.get(int32_t2(x * 2 + y % 2, y * 2));
+            v = subgroupQuadReduce(reducer, v);
+            // quad index 0 stores result
+            if (localInvocationIndex % 4 == 0)
+            {
+                dstImage.set(int32_t2(workGroupID.xy * 4) + int32_t2(x / 2, y / 2), v, mip, slice);
+                // store to LDS
+                // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
+                // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
+                // ...
+                // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
+                // ...
+                // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
+                // ...
+                sharedMem.set(int32_t2(x * 2 + y / 2, y * 2), v);
+            }
+        }
+    }
+
+    template<typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
+    void downsampleMip_4(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
+    {
+        if (localInvocationIndex < 16)
+        {
+            const uint32_t x = coord.x;
+            const uint32_t y = coord.y;
+
+            float32_t4 v = sharedMem.get(int32_t2(x * 4 + y, y * 4));
+            v = subgroupQuadReduce(reducer, v);
+            // quad index 0 stores result
+            if (localInvocationIndex % 4 == 0)
+            {
+                dstImage.set(int32_t2(workGroupID.xy * 2) + int32_t2(x / 2, y / 2), v, mip, slice);
+                // store to LDS
+                // x x x x 0 ...
+                // 0 ...
+                sharedMem.set(int32_t2(x / 2 + y, 0), v);
+            }
+        }
+    }
+
+    template<typename Reducer, typename SrcImageAccessor, typename DstImageAccessor, typename SharedMemoryAccessor>
+    void downsampleMip_5(uint32_t2 coord, uint32_t2 workGroupID, uint32_t localInvocationIndex, uint32_t mip, uint32_t slice, NBL_CONST_REF_ARG(Reducer) reducer, NBL_CONST_REF_ARG(SrcImageAccessor) srcImage, NBL_REF_ARG(DstImageAccessor) dstImage, NBL_REF_ARG(SharedMemoryAccessor) sharedMem)
+    {
+        if (localInvocationIndex < 4)
+        {
+            float32_t4 v = sharedMem.get(int32_t2(localInvocationIndex, 0));
+            v = subgroupQuadReduce(reducer, v);
+            // quad index 0 stores result
+            if (localInvocationIndex % 4 == 0)
+            {
+                dstImage.set(int32_t2(workGroupID.xy), v, mip, slice);
+            }
+        }
+    }
+}
+
+struct SPD
+{
+    static void __call()
+    {
+    }
+};
+
+}
+}
+}
+}
+
+#endif
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index bbadd5f6fc..e29ad84cfe 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -169,6 +169,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/raytracing.h
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_arithmetic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_ballot.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_basic.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_quad.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_shuffle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/subgroup_vote.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/glsl.std.450.hlsl")
@@ -187,6 +188,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/core.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_arithmetic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_ballot.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_basic.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_quad.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/glsl_compat/subgroup_shuffle.hlsl")
 #stdlib
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/algorithm.hlsl")
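Reviewer note: the impl:: helpers in workgroup/spd.hlsl above only assume a handful of methods on their template parameters: reducer.reduce() combining the four float32_t4 values of a quad, srcImage.reduce(coord, slice) returning an already-reduced source sample, dstImage.set(coord, value, mip, slice), and sharedMem.get/set over a 16x16 intermediate tile. Below is a minimal, non-normative sketch of functors with those shapes; every name, binding and resource layout in it (MaxReducer, ScratchAccessor, srcTexture, dstMips, ...) is a hypothetical placeholder, not part of this PR.

// --- reviewer sketch, not part of the diff ---------------------------------------------------

// Reducer: combines the four values of a 2x2 quad, here a component-wise max.
struct MaxReducer
{
    float32_t4 reduce(float32_t4 v0, float32_t4 v1, float32_t4 v2, float32_t4 v3)
    {
        return max(max(v0, v1), max(v2, v3));
    }
};

// SharedMemoryAccessor: 16x16 intermediate tile kept in groupshared memory.
groupshared float32_t4 scratch[16][16];

struct ScratchAccessor
{
    float32_t4 get(int32_t2 coord) { return scratch[coord.y][coord.x]; }
    void set(int32_t2 coord, float32_t4 value) { scratch[coord.y][coord.x] = value; }
};

// Hypothetical image bindings; a real accessor would wrap whatever descriptors the dispatch declares.
[[vk::binding(0, 0)]] Texture2DArray<float32_t4> srcTexture;
[[vk::binding(1, 0)]] RWTexture2DArray<float32_t4> dstMips[7];

// SrcImageAccessor: reduce() returns the reduced 2x2 source quad whose top-left texel is `coord`.
struct SrcAccessor
{
    float32_t4 reduce(int32_t2 coord, uint32_t slice)
    {
        MaxReducer r;
        return r.reduce(
            srcTexture[uint32_t3(coord + int32_t2(0, 0), slice)],
            srcTexture[uint32_t3(coord + int32_t2(1, 0), slice)],
            srcTexture[uint32_t3(coord + int32_t2(0, 1), slice)],
            srcTexture[uint32_t3(coord + int32_t2(1, 1), slice)]);
    }
};

// DstImageAccessor: set() writes one texel of the requested output mip level.
struct DstAccessor
{
    void set(int32_t2 coord, float32_t4 value, uint32_t mip, uint32_t slice)
    {
        dstMips[mip][uint32_t3(coord, slice)] = value;
    }
};

With functors like these, a dispatch shader would run the impl:: stages in order with workgroup barriers between the LDS-dependent steps; SPD::__call() is still an empty stub in this PR, so that orchestration is presumably where it will land.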