Skip to content

Commit 3756fd1

Browse files
authored
[ESIMD] Enable FADD/FSUB for slm_atomic_update (#13535)
Those test cases required a newer GPU driver and thus were previously disabled. The GPU driver on DG2 still does not correctly handle atomic_update for 'float' and 'half' types. The GPU driver on PVC still does not correctly handle slm_atomic_update for 'half' types. --------- Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 004efa3 commit 3756fd1

File tree

4 files changed

+66
-84
lines changed

4 files changed

+66
-84
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5823,7 +5823,7 @@ slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
58235823
constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
58245824
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
58255825
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5826-
if constexpr (std::is_same_v<T, double>) {
5826+
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
58275827
return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
58285828
AddressScale, ImmOffset, EDS, VS,
58295829
Transposed, N>(pred.data(), offsets.data(),
@@ -5867,7 +5867,7 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
58675867
constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
58685868
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
58695869
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
5870-
if constexpr (std::is_same_v<T, double>) {
5870+
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
58715871
return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
58725872
AddressScale, ImmOffset, EDS, VS,
58735873
Transposed, N>(pred.data(), offsets.data(),
@@ -6007,11 +6007,11 @@ template <atomic_op Op, typename T, int N>
60076007
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
60086008
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
60096009
simd_mask<N> mask = 1) {
6010-
// 2 byte, 8 byte types, non-power of two, and operations wider than
6011-
// 32 are supported only by LSC.
6012-
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
6013-
!__ESIMD_DNS::isPowerOf2(N, 32)) {
6014-
// half and short are supported in LSC.
6010+
// Non-LSC atomic_update supports only 4-byte int vector operations with
6011+
// 1,2,4,8,16,32 vector length. Non-LSC supports only 'store' for FP types.
6012+
if constexpr (Op == atomic_op::fmin || Op == atomic_op::fmax ||
6013+
Op == atomic_op::fadd || Op == atomic_op::fsub ||
6014+
sizeof(T) != 4 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
60156015
return slm_atomic_update_impl<Op, T, N,
60166016
detail::lsc_data_size::default_size>(
60176017
byte_offset, src0, mask);
@@ -6096,9 +6096,9 @@ template <atomic_op Op, typename T, int N>
60966096
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
60976097
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
60986098
simd<T, N> src1, simd_mask<N> mask = 1) {
6099-
// 2 byte, 8 byte types, non-power of two, and operations wider than
6100-
// 32 are supported only by LSC.
6101-
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
6099+
// Non-LSC atomic_update supports only 4-byte int vector operations with
6100+
// 1,2,4,8,16,32 vector length.
6101+
if constexpr (sizeof(T) != 4 || Op == atomic_op::fcmpxchg ||
61026102
!__ESIMD_DNS::isPowerOf2(N, 32)) {
61036103
// 2-argument lsc_atomic_update arguments order matches the standard one -
61046104
// expected value first, then new value. But atomic_update uses reverse

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -670,16 +670,13 @@ template <int N, template <class, int> class Op, bool UseMask,
670670
bool UseLSCFeatures, bool UseAcc>
671671
bool test_fp_types(queue q, const Config &cfg) {
672672
bool passed = true;
673-
if constexpr (UseLSCFeatures) {
674-
if constexpr (std::is_same_v<Op<sycl::half, N>, ImplFmin<sycl::half, N>> ||
675-
std::is_same_v<Op<sycl::half, N>, ImplFmax<sycl::half, N>> ||
676-
std::is_same_v<Op<sycl::half, N>,
677-
ImplFcmpwr<sycl::half, N>>) {
678-
auto dev = q.get_device();
679-
if (dev.has(sycl::aspect::fp16)) {
680-
passed &= run_test<UseAcc, sycl::half, N, Op, UseMask, UseLSCFeatures>(
681-
q, cfg);
682-
}
673+
// TODO: Enable FADD/FSUB on DG2/PVC when the error in GPU driver is resolved.
674+
if constexpr (UseLSCFeatures &&
675+
!std::is_same_v<Op<sycl::half, N>, ImplFadd<sycl::half, N>> &&
676+
!std::is_same_v<Op<sycl::half, N>, ImplFsub<sycl::half, N>>) {
677+
if (q.get_device().has(sycl::aspect::fp16)) {
678+
passed &=
679+
run_test<UseAcc, sycl::half, N, Op, UseMask, UseLSCFeatures>(q, cfg);
683680
}
684681
}
685682
passed &= run_test<UseAcc, float, N, Op, UseMask, UseLSCFeatures>(q, cfg);
@@ -688,7 +685,6 @@ bool test_fp_types(queue q, const Config &cfg) {
688685
q.get_device().has(sycl::aspect::fp64)) {
689686
passed &= run_test<UseAcc, double, N, Op, UseMask, UseLSCFeatures>(q, cfg);
690687
}
691-
692688
#endif // CMPXCHG_TEST
693689
return passed;
694690
}
@@ -703,7 +699,6 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
703699
test_int_types<2, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
704700
passed &=
705701
test_int_types<4, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
706-
707702
passed &=
708703
test_int_types<8, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
709704
passed &=
@@ -715,13 +710,10 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
715710
if constexpr (UseLSCFeatures) {
716711
passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
717712
q, cfg);
718-
// non power of two values are supported only in newer driver.
719-
// TODO: Enable this when the new driver reaches test infrastructure
720-
// (v27556).
721-
#if 0
722-
passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
723-
passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
724-
#endif
713+
passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
714+
q, cfg);
715+
passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
716+
q, cfg);
725717
}
726718

727719
return passed;
@@ -734,21 +726,14 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) {
734726
passed &= test_fp_types<1, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
735727
passed &= test_fp_types<2, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
736728
passed &= test_fp_types<4, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
737-
738729
passed &= test_fp_types<8, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
739-
// Supported by LSC atomic:
730+
passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
731+
passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
732+
740733
if constexpr (UseLSCFeatures) {
741-
passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
742-
passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
743734
passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
744-
745-
// non power of two values are supported only in newer driver.
746-
// TODO: Enable this when the new driver reaches test infrastructure
747-
// (v27556).
748-
#if 0
749735
passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
750736
passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
751-
#endif
752737
}
753738
return passed;
754739
}

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp

Lines changed: 27 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -593,33 +593,24 @@ template <int N, template <class, int> class Op, bool UseMask,
593593
TestFeatures Features, bool UseAcc>
594594
bool test_fp_types(queue q) {
595595
bool passed = true;
596-
if constexpr (Features == TestFeatures::DG2 ||
597-
Features == TestFeatures::PVC) {
598-
if constexpr (std::is_same_v<Op<sycl::half, N>,
599-
ImplLSCFmax<sycl::half, N>> ||
600-
std::is_same_v<Op<sycl::half, N>,
601-
ImplLSCFmin<sycl::half, N>> ||
602-
std::is_same_v<Op<sycl::half, N>,
603-
ImplLSCFcmpwr<sycl::half, N>>) {
604-
auto dev = q.get_device();
605-
if (dev.has(sycl::aspect::fp16)) {
606-
passed &= run_test<UseAcc, sycl::half, N, Op, UseMask>(q);
607-
}
596+
597+
// TODO: Enable 'half' FADD/FSUB on DG2 when the error in GPU driver is fixed.
598+
if constexpr (Features == TestFeatures::PVC ||
599+
(Features == TestFeatures::DG2 &&
600+
!std::is_same_v<Op<sycl::half, N>, ImplFadd<sycl::half, N>> &&
601+
!std::is_same_v<Op<sycl::half, N>, ImplFsub<sycl::half, N>>)) {
602+
if (q.get_device().has(sycl::aspect::fp16)) {
603+
passed &= run_test<UseAcc, sycl::half, N, Op, UseMask>(q);
608604
}
609605
}
610606

611607
passed &= run_test<UseAcc, float, N, Op, UseMask>(q);
612608

613609
if constexpr (Features == TestFeatures::DG2 ||
614610
Features == TestFeatures::PVC) {
615-
// TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver.
616-
if constexpr (!std::is_same_v<Op<double, N>, ImplLSCFmax<double, N>> &&
617-
!std::is_same_v<Op<double, N>, ImplLSCFmin<double, N>> &&
618-
!std::is_same_v<Op<double, N>, ImplLSCFcmpwr<double, N>>) {
619-
if (q.get_device().has(sycl::aspect::atomic64) &&
620-
q.get_device().has(sycl::aspect::fp64)) {
621-
passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
622-
}
611+
if (q.get_device().has(sycl::aspect::atomic64) &&
612+
q.get_device().has(sycl::aspect::fp64)) {
613+
passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
623614
}
624615
}
625616
return passed;
@@ -633,7 +624,6 @@ bool test_int_types_and_sizes(queue q) {
633624
passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q);
634625
passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q);
635626
passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q);
636-
// TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver.
637627
if (UseMask && Features == TestFeatures::Generic &&
638628
esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows,
639629
"26918", "101.4953", false)) {
@@ -645,13 +635,8 @@ bool test_int_types_and_sizes(queue q) {
645635
if constexpr (Features == TestFeatures::DG2 ||
646636
Features == TestFeatures::PVC) {
647637
passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q);
648-
// non power of two values are supported only in newer driver.
649-
// TODO: Enable this when the new driver reaches test infrastructure
650-
// (v27556).
651-
#if 0
652638
passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q);
653639
passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q);
654-
#endif
655640
}
656641

657642
return passed;
@@ -672,13 +657,8 @@ bool test_fp_types_and_sizes(queue q) {
672657
if constexpr (Features == TestFeatures::DG2 ||
673658
Features == TestFeatures::PVC) {
674659
passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q);
675-
// non power of two values are supported only in newer driver.
676-
// TODO: Enable this when the new driver reaches test infrastructure
677-
// (v27556).
678-
#if 0
679660
passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q);
680661
passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q);
681-
#endif
682662
}
683663
return passed;
684664
}
@@ -705,29 +685,33 @@ int test_with_mask(queue q) {
705685
test_int_types_and_sizes<ImplUMin, UseMask, Features, UseAcc, Unsigned>(
706686
q);
707687

688+
// Check load/store operations.
689+
passed &= test_int_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
690+
passed &= test_int_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
691+
// 'float' 'load' and 'store' do not require DG2/PVC.
692+
passed &= test_fp_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
693+
passed &= test_fp_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
694+
708695
if constexpr (Features == TestFeatures::DG2 ||
709696
Features == TestFeatures::PVC) {
710697
passed &=
711698
test_fp_types_and_sizes<ImplLSCFmax, UseMask, Features, UseAcc>(q);
712699
passed &=
713700
test_fp_types_and_sizes<ImplLSCFmin, UseMask, Features, UseAcc>(q);
714-
715-
// TODO: fadd/fsub are emulated in the newer driver, but do not pass
716-
// validation.
717-
#if 0
701+
}
702+
// TODO: GPU driver promised to support FADD/FSUB on DG2, but it doesn't.
703+
// Report the issue to driver, enable FADD/FSUB for DG2 when it is fixed.
704+
if constexpr (Features == TestFeatures::PVC) {
718705
passed &= test_fp_types_and_sizes<ImplFadd, UseMask, Features, UseAcc>(q);
719706
passed &= test_fp_types_and_sizes<ImplFsub, UseMask, Features, UseAcc>(q);
720-
#endif
721-
722-
// Check load/store operations.
723-
passed &= test_int_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
724-
passed &= test_int_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
725-
passed &= test_fp_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
726707
}
727708
#else
728709
passed &= test_int_types_and_sizes<ImplCmpxchg, UseMask, Features, UseAcc>(q);
729-
passed &=
730-
test_fp_types_and_sizes<ImplLSCFcmpwr, UseMask, Features, UseAcc>(q);
710+
if constexpr (Features == TestFeatures::DG2 ||
711+
Features == TestFeatures::PVC) {
712+
passed &=
713+
test_fp_types_and_sizes<ImplLSCFcmpwr, UseMask, Features, UseAcc>(q);
714+
}
731715
#endif
732716
return passed;
733717
}

sycl/test/esimd/memory_properties.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -910,14 +910,14 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
910910
auto res_slm_atomic_0 =
911911
slm_atomic_update<atomic_op::add, int16_t>(offsets, add);
912912
}
913-
// Expect DWORD for fmin.
913+
// Expect LSC for fmin.
914914
{
915915
constexpr int VL = 16;
916916
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(float);
917917
auto pred = simd_mask<VL>(1);
918918
simd<float, VL> min = simd<float, VL>(1) * sizeof(int);
919919

920-
// CHECK: call <16 x float> @llvm.genx.dword.atomic.fmin.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i32 {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef)
920+
// CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 21, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef, i32 0, <16 x float> undef)
921921
auto res_slm_atomic_0 =
922922
slm_atomic_update<atomic_op::fmin, float>(offsets, min, pred);
923923
}
@@ -1038,6 +1038,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
10381038
auto res_slm_atomic_0 = slm_atomic_update<atomic_op::cmpxchg, int64_t>(
10391039
offsets, swap, compare, pred);
10401040
}
1041+
1042+
// Expect LSC for FP types.
1043+
{
1044+
constexpr int VL = 16;
1045+
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int64_t);
1046+
auto compare = simd<float, VL>(VL, 1);
1047+
auto swap = compare * 2;
1048+
auto pred = simd_mask<VL>(1);
1049+
1050+
// CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}} i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> {{[^)]+}}, i32 0, <16 x float> undef)
1051+
auto res_slm_atomic_0 = slm_atomic_update<atomic_op::fcmpxchg, float>(
1052+
offsets, swap, compare, pred);
1053+
}
10411054
}
10421055

10431056
// Test with local accessor.

0 commit comments

Comments
 (0)