diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp index 38cec50ac814e..b6f7b5c0e941d 100644 --- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp @@ -1180,7 +1180,7 @@ void SYCLLowerESIMDLegacyPass::collectGenXVolatileType(Module &M) { if (!PTy) continue; auto GTy = dyn_cast(PTy->getPointerElementType()); - if (!GTy || !GTy->getName().endswith("cl::sycl::intel::gpu::simd")) + if (!GTy || !GTy->getName().endswith("cl::sycl::INTEL::gpu::simd")) continue; assert(GTy->getNumContainedTypes() == 1); auto VTy = GTy->getContainedType(0); @@ -1238,7 +1238,7 @@ PreservedAnalyses SYCLLowerESIMDPass::run(Function &F, // process ESIMD builtins that go through special handling instead of // the translation procedure - if (Name.startswith("N2cl4sycl5intel3gpu8slm_init")) { + if (Name.startswith("N2cl4sycl5INTEL3gpu8slm_init")) { // tag the kernel with meta-data SLMSize, and remove this builtin translateSLMInit(*CI); ESIMDToErases.push_back(CI); diff --git a/llvm/test/SYCLLowerIR/esimd_global.ll b/llvm/test/SYCLLowerIR/esimd_global.ll index ab86858fe4da7..c6a30cfb055be 100644 --- a/llvm/test/SYCLLowerIR/esimd_global.ll +++ b/llvm/test/SYCLLowerIR/esimd_global.ll @@ -6,18 +6,18 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown-sycldevice" -%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" = type { <16 x i32> } +%"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" = type { <16 x i32> } $"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4Test" = comdat any ; CHECK: [[NEWGLOBAL:[@a-zA-Z0-9_]*]] = dso_local global <16 x i32> zeroinitializer, align 64 #0 -@0 = dso_local global %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" zeroinitializer, align 64 #0 +@0 = dso_local global %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" zeroinitializer, align 64 #0 ; Function Attrs: norecurse define weak_odr dso_local spir_kernel void @"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4Test"(i32 addrspace(1)* %_arg_) local_unnamed_addr #1 comdat !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 !sycl_explicit_simd !12 !intel_reqd_sub_group_size !8 { entry: - %vc.i = alloca %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", align 64 - %agg.tmp.i = alloca %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", align 64 + %vc.i = alloca %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", align 64 + %agg.tmp.i = alloca %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", align 64 %call.esimd.i.i.i.i.i = call <3 x i32> @llvm.genx.local.id.v3i32() #5 %local_id.y.i.i.i.i.i = extractelement <3 x i32> %call.esimd.i.i.i.i.i, i32 1 %local_id.y.cast.ty.i.i.i.i.i = zext i32 %local_id.y.i.i.i.i.i to i64 @@ -36,15 +36,15 @@ entry: %group.id.x.cast.ty.i.i.i.i.i = zext i32 %group.id.x.i.i.i.i.i to i64 %mul.i4.i.i.i.i = mul nuw i64 %group.id.x.cast.ty.i.i.i.i.i, %wgsize.x.cast.ty.i.i.i.i.i %add.i5.i.i.i.i = add i64 %mul.i4.i.i.i.i, %local_id.x.cast.ty.i.i.i.i.i - %0 = bitcast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %agg.tmp.i to i8* + %0 = bitcast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %agg.tmp.i to i8* call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %0) - %1 = bitcast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %vc.i to i8* + %1 = bitcast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %vc.i to i8* call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %1) #5 %conv.i = trunc i64 %add.i5.i.i.i.i to i32 - %2 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %vc.i to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* + %2 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %vc.i to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* %splat.splatinsert.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0 %splat.splat.i.i = shufflevector <16 x i32> %splat.splatinsert.i.i, <16 x i32> undef, <16 x i32> zeroinitializer - %M_data.i13.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %2, i64 0, i32 0 + %M_data.i13.i = getelementptr inbounds %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* %2, i64 0, i32 0 store <16 x i32> %splat.splat.i.i, <16 x i32> addrspace(4)* %M_data.i13.i, align 64, !tbaa !13 %conv3.i = trunc i64 %add.i.i.i.i.i to i32 %splat.splatinsert.i20.i = insertelement <8 x i32> undef, i32 %conv3.i, i32 0 @@ -56,17 +56,17 @@ entry: %..i = select i1 %cmp.i, i64 %add.i5.i.i.i.i, i64 %add.i.i.i.i.i %conv9.i = trunc i64 %..i to i32 ; CHECK: store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds ({{.+}}, {{.+}}* bitcast (<16 x i32>* [[NEWGLOBAL]] to {{.+}}*), i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 - store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* @0, i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 + store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* @0, i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 %mul.i = shl nsw i32 %conv9.i, 4 %idx.ext.i = sext i32 %mul.i to i64 %add.ptr.i16 = getelementptr inbounds i32, i32 addrspace(1)* %_arg_, i64 %idx.ext.i %add.ptr.i = addrspacecast i32 addrspace(1)* %add.ptr.i16 to i32 addrspace(4)* - %3 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %agg.tmp.i to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* + %3 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %agg.tmp.i to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* %call.esimd.i.i.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i13.i) #5 - %M_data.i2.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %3, i64 0, i32 0 + %M_data.i2.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* %3, i64 0, i32 0 call void @llvm.genx.vstore.v16i32.p4v16i32(<16 x i32> %call.esimd.i.i.i, <16 x i32> addrspace(4)* %M_data.i2.i.i) #5 - call spir_func void @_Z3fooPiN2cl4sycl5intel3gpu4simdIiLi16EEE(i32 addrspace(4)* %add.ptr.i, %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* nonnull %agg.tmp.i) #5 - store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* @0, i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 + call spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %add.ptr.i, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* nonnull %agg.tmp.i) #5 + store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* @0, i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %1) #5 call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %0) ret void @@ -79,17 +79,17 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg %0, i8* nocapture %1) #2 declare void @llvm.lifetime.end.p0i8(i64 immarg %0, i8* nocapture %1) #2 ; Function Attrs: noinline norecurse nounwind -define dso_local spir_func void @_Z3fooPiN2cl4sycl5intel3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %v) local_unnamed_addr #3 !sycl_explicit_simd !12 { +define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 !sycl_explicit_simd !12 { entry: - %agg.tmp = alloca %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", align 64 - %0 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %v to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* - %1 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %agg.tmp to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* - %M_data.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %0, i64 0, i32 0 + %agg.tmp = alloca %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", align 64 + %0 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* + %1 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %agg.tmp to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* + %M_data.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* %0, i64 0, i32 0 %call.esimd.i.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i.i), !noalias !17 ; CHECK: {{.+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* getelementptr ({{.+}}, {{.+}} addrspace(4)* addrspacecast ({{.+}}* bitcast (<16 x i32>* [[NEWGLOBAL]] to {{.+}}*) to {{.+}} addrspace(4)*), i64 0, i32 0)), !noalias !17 - %call.esimd.i8.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* getelementptr (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* addrspacecast (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* @0 to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)*), i64 0, i32 0)), !noalias !17 + %call.esimd.i8.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* getelementptr (%"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* addrspacecast (%"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* @0 to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)*), i64 0, i32 0)), !noalias !17 %add.i = add <16 x i32> %call.esimd.i8.i, %call.esimd.i.i - %M_data.i.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %1, i64 0, i32 0 + %M_data.i.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)* %1, i64 0, i32 0 call void @llvm.genx.vstore.v16i32.p4v16i32(<16 x i32> %add.i, <16 x i32> addrspace(4)* %M_data.i.i.i) %2 = ptrtoint i32 addrspace(4)* %C to i64 %call.esimd.i.i2 = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i.i.i) @@ -153,8 +153,8 @@ attributes #5 = { nounwind } !15 = !{!"Simple C++ TBAA"} !16 = !{i64 0, i64 64, !13} !17 = !{!18} -!18 = distinct !{!18, !19, !"_ZNK2cl4sycl5intel3gpu4simdIiLi16EEplERKS4_: %agg.result"} -!19 = distinct !{!19, !"_ZNK2cl4sycl5intel3gpu4simdIiLi16EEplERKS4_"} +!18 = distinct !{!18, !19, !"_ZNK2cl4sycl5INTEL3gpu4simdIiLi16EEplERKS4_: %agg.result"} +!19 = distinct !{!19, !"_ZNK2cl4sycl5INTEL3gpu4simdIiLi16EEplERKS4_"} !20 = !{i32 8275} !21 = !{i32 8268} !22 = !{i32 8269} diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll index 5ed90614a675b..1e5e5754c8976 100644 --- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll +++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll @@ -165,7 +165,7 @@ define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 { define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 { ; CHECK: define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 - call spir_func void @_ZN2cl4sycl5intel3gpu8slm_initEj(i32 1023) + call spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32 1023) ret void ; CHECK-NEXT: ret void } @@ -210,7 +210,7 @@ declare dso_local spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14 declare dso_local spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 %0, %opencl.image2d_wo_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5, <32 x i32> %6) declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIiLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0) declare dso_local spir_func void @_Z14__esimd_vstoreIfLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x float> addrspace(4)* %0, <16 x float> %1) -declare dso_local spir_func void @_ZN2cl4sycl5intel3gpu8slm_initEj(i32) +declare dso_local spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32) attributes #0 = { "genx_byte_offset"="192" "genx_volatile" } diff --git a/sycl/include/CL/sycl/intel/esimd.hpp b/sycl/include/CL/sycl/INTEL/esimd.hpp similarity index 80% rename from sycl/include/CL/sycl/intel/esimd.hpp rename to sycl/include/CL/sycl/INTEL/esimd.hpp index 7f4b7886d2d2c..5bef244eabd48 100644 --- a/sycl/include/CL/sycl/intel/esimd.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd.hpp @@ -10,10 +10,10 @@ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #ifdef __SYCL_DEVICE_ONLY__ #define SYCL_ESIMD_KERNEL __attribute__((sycl_explicit_simd)) diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_host_util.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_host_util.hpp similarity index 100% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_host_util.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_host_util.hpp diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp similarity index 85% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp index 23674ac3d3e91..34bbc905ceb44 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_intrin.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp @@ -11,9 +11,9 @@ #pragma once -#include -#include -#include +#include +#include +#include #include // \brief __esimd_rdregion: region access intrinsic. @@ -60,8 +60,8 @@ // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_rdregion(sycl::INTEL::gpu::vector_type_t Input, uint16_t Offset); // __esimd_wrregion returns the updated vector with the region updated. // @@ -112,14 +112,14 @@ __esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset); // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_wrregion(sycl::intel::gpu::vector_type_t OldVal, - sycl::intel::gpu::vector_type_t NewVal, uint16_t Offset, - sycl::intel::gpu::mask_type_t Mask = 1); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_wrregion(sycl::INTEL::gpu::vector_type_t OldVal, + sycl::INTEL::gpu::vector_type_t NewVal, uint16_t Offset, + sycl::INTEL::gpu::mask_type_t Mask = 1); __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { // TODO dependencies on the std SYCL concepts like images // should be refactored in a separate header @@ -208,7 +208,7 @@ readRegion(const vector_type_t &Base, std::pair Region) { } } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) @@ -218,37 +218,37 @@ readRegion(const vector_type_t &Base, std::pair Region) { // optimization on simd object // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_vload(const sycl::intel::gpu::vector_type_t *ptr); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_vload(const sycl::INTEL::gpu::vector_type_t *ptr); // vstore // // map to the backend vstore intrinsic, used by compiler to control // optimization on simd object template -SYCL_EXTERNAL void __esimd_vstore(sycl::intel::gpu::vector_type_t *ptr, - sycl::intel::gpu::vector_type_t vals); +SYCL_EXTERNAL void __esimd_vstore(sycl::INTEL::gpu::vector_type_t *ptr, + sycl::INTEL::gpu::vector_type_t vals); template -SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t src); +SYCL_EXTERNAL uint16_t __esimd_any(sycl::INTEL::gpu::vector_type_t src); template -SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t src); +SYCL_EXTERNAL uint16_t __esimd_all(sycl::INTEL::gpu::vector_type_t src); #ifndef __SYCL_DEVICE_ONLY__ // Implementations of ESIMD intrinsics for the SYCL host device template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_rdregion(sycl::INTEL::gpu::vector_type_t Input, uint16_t Offset) { uint16_t EltOffset = Offset / sizeof(T); assert(Offset % sizeof(T) == 0); int NumRows = M / Width; assert(M % Width == 0); - sycl::intel::gpu::vector_type_t Result; + sycl::INTEL::gpu::vector_type_t Result; int Index = 0; for (int i = 0; i < NumRows; ++i) { for (int j = 0; j < Width; ++j) { @@ -260,17 +260,17 @@ __esimd_rdregion(sycl::intel::gpu::vector_type_t Input, uint16_t Offset) { template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_wrregion(sycl::intel::gpu::vector_type_t OldVal, - sycl::intel::gpu::vector_type_t NewVal, uint16_t Offset, - sycl::intel::gpu::mask_type_t Mask) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_wrregion(sycl::INTEL::gpu::vector_type_t OldVal, + sycl::INTEL::gpu::vector_type_t NewVal, uint16_t Offset, + sycl::INTEL::gpu::mask_type_t Mask) { uint16_t EltOffset = Offset / sizeof(T); assert(Offset % sizeof(T) == 0); int NumRows = M / Width; assert(M % Width == 0); - sycl::intel::gpu::vector_type_t Result = OldVal; + sycl::INTEL::gpu::vector_type_t Result = OldVal; int Index = 0; for (int i = 0; i < NumRows; ++i) { for (int j = 0; j < Width; ++j) { diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_math_intrin.hpp similarity index 91% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_math_intrin.hpp index c3f5a9d141305..dbe6f457b8c09 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_math_intrin.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_math_intrin.hpp @@ -11,12 +11,12 @@ #pragma once -#include -#include -#include +#include +#include +#include #include -using sycl::intel::gpu::vector_type_t; +using sycl::INTEL::gpu::vector_type_t; // saturation intrinsics template @@ -210,39 +210,39 @@ SYCL_EXTERNAL vector_type_t __esimd_dp4a(vector_type_t src0, // Reduction functions template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_fmax(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_umax(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_smax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_smax(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_fmin(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_umin(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2); template -sycl::intel::gpu::vector_type_t SYCL_EXTERNAL -__esimd_reduced_smin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2); +sycl::INTEL::gpu::vector_type_t SYCL_EXTERNAL +__esimd_reduced_smin(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2); template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_dp4(sycl::intel::gpu::vector_type_t v1, - sycl::intel::gpu::vector_type_t v2); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_dp4(sycl::INTEL::gpu::vector_type_t v1, + sycl::INTEL::gpu::vector_type_t v2); #ifndef __SYCL_DEVICE_ONLY__ @@ -1096,10 +1096,10 @@ SYCL_EXTERNAL vector_type_t __esimd_dp4a(vector_type_t src0, }; template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_max(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { - sycl::intel::gpu::vector_type_t retv; +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_max(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { + sycl::INTEL::gpu::vector_type_t retv; for (int I = 0; I < N; I++) { if (src1[I] >= src2[I]) { retv[I] = src1[I]; @@ -1111,31 +1111,31 @@ __esimd_reduced_max(sycl::intel::gpu::vector_type_t src1, } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_fmax(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { return __esimd_reduced_max(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_umax(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { return __esimd_reduced_max(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_smax(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_smax(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { return __esimd_reduced_max(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_min(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { - sycl::intel::gpu::vector_type_t retv; +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_min(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { + sycl::INTEL::gpu::vector_type_t retv; for (int I = 0; I < N; I++) { if (src1[I] <= src2[I]) { retv[I] = src1[I]; @@ -1147,23 +1147,23 @@ __esimd_reduced_min(sycl::intel::gpu::vector_type_t src1, } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_fmin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_fmin(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { return __esimd_reduced_min(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_umin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_umin(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { return __esimd_reduced_min(src1, src2); } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_reduced_smin(sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t src2) { +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_reduced_smin(sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t src2) { return __esimd_reduced_min(src1, src2); } diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp similarity index 50% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp index d712fccf0d956..0c53470712ee6 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_memory_intrin.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp @@ -11,100 +11,100 @@ #pragma once -#include -#include -#include +#include +#include +#include #include // flat_read does flat-address gather template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t< - Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)> -__esimd_flat_read(sycl::intel::gpu::vector_type_t addrs, + sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None, + sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None> +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t< + Ty, N * sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk)> +__esimd_flat_read(sycl::INTEL::gpu::vector_type_t addrs, int ElemsPerAddr = NumBlk, - sycl::intel::gpu::vector_type_t pred = 1); + sycl::INTEL::gpu::vector_type_t pred = 1); // flat_write does flat-address scatter template + sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None, + sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None> SYCL_EXTERNAL void -__esimd_flat_write(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t< - Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)> +__esimd_flat_write(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t< + Ty, N * sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk)> vals, int ElemsPerAddr = NumBlk, - sycl::intel::gpu::vector_type_t pred = 1); + sycl::INTEL::gpu::vector_type_t pred = 1); // flat_block_read reads a block of data from one flat address template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t + sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None, + sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None> +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_flat_block_read_unaligned(uint64_t addr); // flat_block_write writes a block of data using one flat address template + sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None, + sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None> SYCL_EXTERNAL void __esimd_flat_block_write(uint64_t addr, - sycl::intel::gpu::vector_type_t vals); + sycl::INTEL::gpu::vector_type_t vals); // Reads a block of data from given surface at given offset. template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset); // Writes given block of data to a surface with given index at given offset. template SYCL_EXTERNAL void __esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, - sycl::intel::gpu::vector_type_t vals); + sycl::INTEL::gpu::vector_type_t vals); // flat_read4 does flat-address gather4 -template -sycl::intel::gpu::vector_type_t SYCL_EXTERNAL -__esimd_flat_read4(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred = 1); +template +sycl::INTEL::gpu::vector_type_t SYCL_EXTERNAL +__esimd_flat_read4(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred = 1); // flat_write does flat-address scatter -template +template SYCL_EXTERNAL void __esimd_flat_write4( - sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t vals, - sycl::intel::gpu::vector_type_t pred = 1); + sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t vals, + sycl::INTEL::gpu::vector_type_t pred = 1); // flat_atomic: flat-address atomic -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_atomic0(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred); - -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_atomic1(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t pred); - -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_atomic2(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t pred); +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_atomic0(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_atomic1(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_atomic2(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t pred); // esimd_barrier, generic group barrier SYCL_EXTERNAL void __esimd_barrier(); @@ -114,59 +114,59 @@ SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl); // slm_read does SLM gather template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_read(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred = 1); +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_read(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred = 1); // slm_write does SLM scatter template SYCL_EXTERNAL void -__esimd_slm_write(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t vals, - sycl::intel::gpu::vector_type_t pred = 1); +__esimd_slm_write(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t vals, + sycl::INTEL::gpu::vector_type_t pred = 1); // slm_block_read reads a block of data from SLM template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_slm_block_read(uint32_t addr); // slm_block_write writes a block of data to SLM template SYCL_EXTERNAL void __esimd_slm_block_write(uint32_t addr, - sycl::intel::gpu::vector_type_t vals); + sycl::INTEL::gpu::vector_type_t vals); // slm_read4 does SLM gather4 -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_read4(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred = 1); +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_read4(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred = 1); // slm_write4 does SLM scatter4 -template +template SYCL_EXTERNAL void __esimd_slm_write4( - sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t vals, - sycl::intel::gpu::vector_type_t pred = 1); + sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t vals, + sycl::INTEL::gpu::vector_type_t pred = 1); // slm_atomic: SLM atomic -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_atomic0(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred); - -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_atomic1(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t pred); - -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_atomic2(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t pred); +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_atomic0(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_atomic1(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t pred); + +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_atomic2(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t pred); // Media block load // @@ -193,7 +193,7 @@ __esimd_slm_atomic2(sycl::intel::gpu::vector_type_t addrs, // @return the linearized 2D block data read from surface. // template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y); @@ -225,22 +225,22 @@ template SYCL_EXTERNAL void __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y, - sycl::intel::gpu::vector_type_t vals); + sycl::INTEL::gpu::vector_type_t vals); #ifndef __SYCL_DEVICE_ONLY__ -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t< - Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)> -__esimd_flat_read(sycl::intel::gpu::vector_type_t addrs, +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t< + Ty, N * sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk)> +__esimd_flat_read(sycl::INTEL::gpu::vector_type_t addrs, int ElemsPerAddr, - sycl::intel::gpu::vector_type_t pred) { - auto NumBlkDecoded = sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk); - sycl::intel::gpu::vector_type_t< - Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)> + sycl::INTEL::gpu::vector_type_t pred) { + auto NumBlkDecoded = sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk); + sycl::INTEL::gpu::vector_type_t< + Ty, N * sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk)> V; - ElemsPerAddr = sycl::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr); + ElemsPerAddr = sycl::INTEL::gpu::ElemsPerAddrDecoding(ElemsPerAddr); for (int I = 0; I < N; I++) { if (pred[I]) { @@ -259,12 +259,12 @@ __esimd_flat_read(sycl::intel::gpu::vector_type_t addrs, return V; } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_read4(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t V; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_read4(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t V; unsigned int Next = 0; if constexpr (HasR(Mask)) { @@ -307,17 +307,17 @@ __esimd_flat_read4(sycl::intel::gpu::vector_type_t addrs, return V; } -template +template SYCL_EXTERNAL void -__esimd_flat_write(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t< - Ty, N * sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk)> +__esimd_flat_write(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t< + Ty, N * sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk)> vals, int ElemsPerAddr, - sycl::intel::gpu::vector_type_t pred) { - auto NumBlkDecoded = sycl::intel::gpu::ElemsPerAddrDecoding(NumBlk); - ElemsPerAddr = sycl::intel::gpu::ElemsPerAddrDecoding(ElemsPerAddr); + sycl::INTEL::gpu::vector_type_t pred) { + auto NumBlkDecoded = sycl::INTEL::gpu::ElemsPerAddrDecoding(NumBlk); + ElemsPerAddr = sycl::INTEL::gpu::ElemsPerAddrDecoding(ElemsPerAddr); for (int I = 0; I < N; I++) { if (pred[I]) { @@ -335,13 +335,13 @@ __esimd_flat_write(sycl::intel::gpu::vector_type_t addrs, } } -template +template SYCL_EXTERNAL void __esimd_flat_write4( - sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t vals, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t V; + sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t vals, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t V; unsigned int Next = 0; if constexpr (HasR(Mask)) { @@ -382,11 +382,11 @@ SYCL_EXTERNAL void __esimd_flat_write4( } } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_flat_block_read_unaligned(uint64_t addr) { - sycl::intel::gpu::vector_type_t V; + sycl::INTEL::gpu::vector_type_t V; for (int I = 0; I < N; I++) { Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); @@ -395,11 +395,11 @@ __esimd_flat_block_read_unaligned(uint64_t addr) { return V; } -template +template SYCL_EXTERNAL void __esimd_flat_block_write(uint64_t addr, - sycl::intel::gpu::vector_type_t vals) { + sycl::INTEL::gpu::vector_type_t vals) { for (int I = 0; I < N; I++) { Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); *Addr = vals[I]; @@ -407,14 +407,14 @@ __esimd_flat_block_write(uint64_t addr, } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y) { // On host the input surface is modeled as sycl image 2d object, // and the read/write access is done through accessor, // which is passed in as the handle argument. - auto range = sycl::intel::gpu::AccessorPrivateProxy::getImageRange(handle); - unsigned bpp = sycl::intel::gpu::AccessorPrivateProxy::getElemSize(handle); + auto range = sycl::INTEL::gpu::AccessorPrivateProxy::getImageRange(handle); + unsigned bpp = sycl::INTEL::gpu::AccessorPrivateProxy::getElemSize(handle); unsigned vpp = bpp / sizeof(Ty); unsigned int i = x / bpp; unsigned int j = y; @@ -423,7 +423,7 @@ __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, unsigned int xbound = range[0] - 1; unsigned int ybound = range[1] - 1; - sycl::intel::gpu::vector_type_t vals; + sycl::INTEL::gpu::vector_type_t vals; for (int row = 0; row < M; row++) { for (int col = 0; col < N; col += vpp) { unsigned int xoff = (i > xbound) ? xbound : i; @@ -431,14 +431,14 @@ __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, auto coords = cl::sycl::cl_int2(xoff, yoff); cl::sycl::cl_uint4 data = handle.read(coords); - sycl::intel::gpu::vector_type_t res; + sycl::INTEL::gpu::vector_type_t res; for (int idx = 0; idx < 4; idx++) { res[idx] = data[idx]; } constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - using refTy = sycl::intel::gpu::vector_type_t; + using refTy = sycl::INTEL::gpu::vector_type_t; auto ref = reinterpret_cast(res); unsigned int offset1 = col + row * N; @@ -461,10 +461,10 @@ template SYCL_EXTERNAL void __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y, - sycl::intel::gpu::vector_type_t vals) { - unsigned bpp = sycl::intel::gpu::AccessorPrivateProxy::getElemSize(handle); + sycl::INTEL::gpu::vector_type_t vals) { + unsigned bpp = sycl::INTEL::gpu::AccessorPrivateProxy::getElemSize(handle); unsigned vpp = bpp / sizeof(Ty); - auto range = sycl::intel::gpu::AccessorPrivateProxy::getImageRange(handle); + auto range = sycl::INTEL::gpu::AccessorPrivateProxy::getImageRange(handle); unsigned int i = x / bpp; unsigned int j = y; @@ -473,7 +473,7 @@ __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, for (int row = 0; row < M; row++) { for (int col = 0; col < N; col += vpp) { constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - sycl::intel::gpu::vector_type_t res = 0; + sycl::INTEL::gpu::vector_type_t res = 0; unsigned int offset1 = col + row * N; unsigned int offset2 = 0; @@ -484,7 +484,7 @@ __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, offset2 += stride; } - using refTy = sycl::intel::gpu::vector_type_t; + using refTy = sycl::INTEL::gpu::vector_type_t; auto ref = reinterpret_cast(res); cl::sycl::cl_uint4 data; @@ -504,7 +504,7 @@ __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, } template -SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t src) { +SYCL_EXTERNAL uint16_t __esimd_any(sycl::INTEL::gpu::vector_type_t src) { for (unsigned int i = 0; i != N; i++) { if (src[i] != 0) return 1; @@ -513,7 +513,7 @@ SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t src) { } template -SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t src) { +SYCL_EXTERNAL uint16_t __esimd_all(sycl::INTEL::gpu::vector_type_t src) { for (unsigned int i = 0; i != N; i++) { if (src[i] == 0) return 0; @@ -522,10 +522,10 @@ SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t src) { } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_dp4(sycl::intel::gpu::vector_type_t v1, - sycl::intel::gpu::vector_type_t v2) { - sycl::intel::gpu::vector_type_t retv; +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_dp4(sycl::INTEL::gpu::vector_type_t v1, + sycl::INTEL::gpu::vector_type_t v2) { + sycl::INTEL::gpu::vector_type_t retv; for (auto i = 0; i != N; i += 4) { Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) + (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]); @@ -543,25 +543,25 @@ SYCL_EXTERNAL void __esimd_barrier() {} SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl) {} template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_read(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_read(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } // slm_write does SLM scatter template SYCL_EXTERNAL void -__esimd_slm_write(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t vals, - sycl::intel::gpu::vector_type_t pred) {} +__esimd_slm_write(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t vals, + sycl::INTEL::gpu::vector_type_t pred) {} // slm_block_read reads a block of data from SLM template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_slm_block_read(uint32_t addr) { - sycl::intel::gpu::vector_type_t retv; + sycl::INTEL::gpu::vector_type_t retv; return retv; } @@ -569,93 +569,93 @@ __esimd_slm_block_read(uint32_t addr) { template SYCL_EXTERNAL void __esimd_slm_block_write(uint32_t addr, - sycl::intel::gpu::vector_type_t vals) {} + sycl::INTEL::gpu::vector_type_t vals) {} // slm_read4 does SLM gather4 -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_read4(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_read4(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } // slm_write4 does SLM scatter4 -template +template SYCL_EXTERNAL void __esimd_slm_write4( - sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t vals, - sycl::intel::gpu::vector_type_t pred) {} + sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t vals, + sycl::INTEL::gpu::vector_type_t pred) {} // slm_atomic: SLM atomic -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_atomic0(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_atomic0(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_atomic1(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_atomic1(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_slm_atomic2(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_slm_atomic2(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_atomic0(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_atomic0(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_atomic1(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_atomic1(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } -template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t -__esimd_flat_atomic2(sycl::intel::gpu::vector_type_t addrs, - sycl::intel::gpu::vector_type_t src0, - sycl::intel::gpu::vector_type_t src1, - sycl::intel::gpu::vector_type_t pred) { - sycl::intel::gpu::vector_type_t retv; +template +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t +__esimd_flat_atomic2(sycl::INTEL::gpu::vector_type_t addrs, + sycl::INTEL::gpu::vector_type_t src0, + sycl::INTEL::gpu::vector_type_t src1, + sycl::INTEL::gpu::vector_type_t pred) { + sycl::INTEL::gpu::vector_type_t retv; return retv; } template -SYCL_EXTERNAL sycl::intel::gpu::vector_type_t +SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t __esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) { throw cl::sycl::feature_not_supported(); - return sycl::intel::gpu::vector_type_t(); + return sycl::INTEL::gpu::vector_type_t(); } template SYCL_EXTERNAL void __esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, - sycl::intel::gpu::vector_type_t vals) { + sycl::INTEL::gpu::vector_type_t vals) { throw cl::sycl::feature_not_supported(); } diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_region.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_region.hpp index c1576415a882b..f995eb0b5e621 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_region.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_region.hpp @@ -17,7 +17,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { // The common base type of region types. @@ -114,6 +114,6 @@ template T getBaseRegion(std::pair Reg) { } } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_types.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_types.hpp index 7ff12e9113dda..f760d825d6a9d 100644 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_types.hpp @@ -10,17 +10,17 @@ #pragma once +#include +#include #include #include // to define C++14,17 extensions #include -#include -#include #include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { namespace csd = cl::sycl::detail; @@ -256,6 +256,6 @@ inline std::istream &operator>>(std::istream &I, half &rhs) { } } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_util.hpp similarity index 88% rename from sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp rename to sycl/include/CL/sycl/INTEL/esimd/detail/esimd_util.hpp index 4bd68905e069b..42ce828da229d 100755 --- a/sycl/include/CL/sycl/intel/esimd/detail/esimd_util.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_util.hpp @@ -69,7 +69,7 @@ static ESIMD_INLINE constexpr bool isPowerOf2(unsigned int n, __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) { @@ -84,11 +84,11 @@ template struct is_esimd_vector { static const bool value = false; }; template -struct is_esimd_vector> { +struct is_esimd_vector> { static const bool value = true; }; template -struct is_esimd_vector> { +struct is_esimd_vector> { static const bool value = true; }; @@ -106,12 +106,12 @@ struct is_dword_type }; template -struct is_dword_type> { +struct is_dword_type> { static const bool value = is_dword_type::value; }; template -struct is_dword_type> { +struct is_dword_type> { static const bool value = is_dword_type::value; }; @@ -124,11 +124,11 @@ struct is_word_type typename std::remove_const::type>::value> {}; template -struct is_word_type> { +struct is_word_type> { static const bool value = is_word_type::value; }; -template struct is_word_type> { +template struct is_word_type> { static const bool value = is_word_type::value; }; @@ -141,11 +141,11 @@ struct is_byte_type typename std::remove_const::type>::value> {}; template -struct is_byte_type> { +struct is_byte_type> { static const bool value = is_byte_type::value; }; -template struct is_byte_type> { +template struct is_byte_type> { static const bool value = is_byte_type::value; }; @@ -179,33 +179,33 @@ struct is_qword_type typename std::remove_const::type>::value> {}; template -struct is_qword_type> { +struct is_qword_type> { static const bool value = is_qword_type::value; }; template -struct is_qword_type> { +struct is_qword_type> { static const bool value = is_qword_type::value; }; // Extends to ESIMD vector types. template -struct is_fp_or_dword_type> { +struct is_fp_or_dword_type> { static const bool value = is_fp_or_dword_type::value; }; template -struct is_fp_or_dword_type> { +struct is_fp_or_dword_type> { static const bool value = is_fp_or_dword_type::value; }; /// Convert types into vector types template struct simd_type { - using type = sycl::intel::gpu::simd; + using type = sycl::INTEL::gpu::simd; }; template -struct simd_type> { - using type = sycl::intel::gpu::simd; +struct simd_type> { + using type = sycl::INTEL::gpu::simd; }; template struct simd_type { @@ -238,6 +238,6 @@ template <> struct word_type { using type = ushort; }; } // namespace details } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/esimd.hpp rename to sycl/include/CL/sycl/INTEL/esimd/esimd.hpp index 757055dfa00fe..25f8e339fefd6 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd.hpp @@ -10,12 +10,12 @@ #pragma once -#include -#include +#include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { // @@ -444,14 +444,14 @@ ESIMD_INLINE simd convert(simd val) { } } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) #ifndef __SYCL_DEVICE_ONLY__ template std::ostream &operator<<(std::ostream &OS, - const sycl::intel::gpu::simd &V) { + const sycl::INTEL::gpu::simd &V) { OS << "{"; for (int I = 0; I < N; I++) { OS << V[I]; diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_enum.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp rename to sycl/include/CL/sycl/INTEL/esimd/esimd_enum.hpp index 4b901ea079119..78ceb5c0c4b97 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_enum.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_enum.hpp @@ -15,7 +15,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { using uchar = unsigned char; @@ -106,6 +106,6 @@ enum class CacheHint : uint8_t { } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_math.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/esimd/esimd_math.hpp rename to sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp index cd5b962dd60a2..1f241c63745cb 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_math.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp @@ -10,16 +10,16 @@ #pragma once -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { template @@ -1945,6 +1945,6 @@ simd esimd_dp4(simd v1, simd v2) { } } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp similarity index 98% rename from sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp rename to sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp index 77035cb16c9c3..8925f858b1fea 100644 --- a/sycl/include/CL/sycl/intel/esimd/esimd_memory.hpp +++ b/sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp @@ -10,17 +10,17 @@ #pragma once +#include +#include +#include +#include +#include #include -#include -#include -#include -#include -#include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { template +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { namespace gpu { // @@ -380,6 +380,6 @@ template class simd_view { }; } // namespace gpu -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_device_selector.hpp b/sycl/include/CL/sycl/INTEL/fpga_device_selector.hpp similarity index 89% rename from sycl/include/CL/sycl/intel/fpga_device_selector.hpp rename to sycl/include/CL/sycl/INTEL/fpga_device_selector.hpp index d5f9cab31180c..3218cf9c9e427 100644 --- a/sycl/include/CL/sycl/intel/fpga_device_selector.hpp +++ b/sycl/include/CL/sycl/INTEL/fpga_device_selector.hpp @@ -12,7 +12,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { class platform_selector : public device_selector { private: @@ -20,7 +20,7 @@ class platform_selector : public device_selector { public: platform_selector(const std::string &platform_name) - : device_platform_name(platform_name){} + : device_platform_name(platform_name) {} int operator()(const device &device) const override { const platform &pf = device.get_platform(); @@ -39,14 +39,14 @@ static constexpr auto HARDWARE_PLATFORM_NAME = class fpga_selector : public platform_selector { public: - fpga_selector() : platform_selector(HARDWARE_PLATFORM_NAME){} + fpga_selector() : platform_selector(HARDWARE_PLATFORM_NAME) {} }; class fpga_emulator_selector : public platform_selector { public: - fpga_emulator_selector() : platform_selector(EMULATION_PLATFORM_NAME){} + fpga_emulator_selector() : platform_selector(EMULATION_PLATFORM_NAME) {} }; -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_extensions.hpp b/sycl/include/CL/sycl/INTEL/fpga_extensions.hpp similarity index 70% rename from sycl/include/CL/sycl/intel/fpga_extensions.hpp rename to sycl/include/CL/sycl/INTEL/fpga_extensions.hpp index 7140421fe5189..c2021fcfe7658 100644 --- a/sycl/include/CL/sycl/intel/fpga_extensions.hpp +++ b/sycl/include/CL/sycl/INTEL/fpga_extensions.hpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include -#include -#include +#include +#include +#include +#include diff --git a/sycl/include/CL/sycl/intel/fpga_lsu.hpp b/sycl/include/CL/sycl/INTEL/fpga_lsu.hpp similarity index 99% rename from sycl/include/CL/sycl/intel/fpga_lsu.hpp rename to sycl/include/CL/sycl/INTEL/fpga_lsu.hpp index 5f8d37f802e76..a52723c0c4a2e 100644 --- a/sycl/include/CL/sycl/intel/fpga_lsu.hpp +++ b/sycl/include/CL/sycl/INTEL/fpga_lsu.hpp @@ -13,7 +13,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { constexpr uint8_t BURST_COALESCE = 0x1; constexpr uint8_t CACHE = 0x2; constexpr uint8_t STATICALLY_COALESCE = 0x4; @@ -108,6 +108,6 @@ template class lsu final { "unable to implement a store LSU with a prefetcher."); } }; -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/fpga_reg.hpp b/sycl/include/CL/sycl/INTEL/fpga_reg.hpp similarity index 89% rename from sycl/include/CL/sycl/intel/fpga_reg.hpp rename to sycl/include/CL/sycl/INTEL/fpga_reg.hpp index 0078dd66c383c..3a6e59b9ed87c 100644 --- a/sycl/include/CL/sycl/intel/fpga_reg.hpp +++ b/sycl/include/CL/sycl/INTEL/fpga_reg.hpp @@ -12,7 +12,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { template T fpga_reg(const T &t) { #if __has_builtin(__builtin_intel_fpga_reg) @@ -22,7 +22,7 @@ template T fpga_reg(const T &t) { #endif } -} // namespace intel +} // namespace INTEL } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) @@ -30,6 +30,6 @@ template T fpga_reg(const T &t) { // Currently clang does not support nested namespace for attributes namespace intelfpga { template T fpga_reg(const T &t) { - return cl::sycl::intel::fpga_reg(t); -} + return cl::sycl::INTEL::fpga_reg(t); } +} // namespace intelfpga diff --git a/sycl/include/CL/sycl/intel/fpga_utils.hpp b/sycl/include/CL/sycl/INTEL/fpga_utils.hpp similarity index 96% rename from sycl/include/CL/sycl/intel/fpga_utils.hpp rename to sycl/include/CL/sycl/INTEL/fpga_utils.hpp index be9bf1a6fc5af..7fdc64e94a246 100644 --- a/sycl/include/CL/sycl/intel/fpga_utils.hpp +++ b/sycl/include/CL/sycl/INTEL/fpga_utils.hpp @@ -13,7 +13,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -namespace intel { +namespace INTEL { template