Skip to content

Commit e035d21

Browse files
authored
Merge pull request numpy#379 from howjmay/vcgt
feat: Add vcgt* and vcgtz* intrinsics
2 parents 1bf2eb5 + af137a3 commit e035d21

File tree

3 files changed

+468
-75
lines changed

3 files changed

+468
-75
lines changed

neon2rvv.h

Lines changed: 79 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2638,55 +2638,109 @@ FORCE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) {
26382638
return __riscv_vmerge_vvm_u32m1(vdupq_n_u32(0x0), vdupq_n_u32(UINT32_MAX), cmp_res, 4);
26392639
}
26402640

2641-
// FORCE_INLINE uint64x1_t vcgt_s64(int64x1_t a, int64x1_t b);
2641+
// Signed 64-bit compare, one lane: the result lane is all ones when a > b
// and zero otherwise.
FORCE_INLINE uint64x1_t vcgt_s64(int64x1_t a, int64x1_t b) {
  uint64x1_t lanes_clear = vdupq_n_u64(0x0);
  uint64x1_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > b (signed), evaluated over 1 element.
  vbool64_t gt_mask = __riscv_vmsgt_vv_i64m1_b64(a, b, 1);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, gt_mask, 1);
}
26422645

2643-
// FORCE_INLINE uint64x2_t vcgtq_s64(int64x2_t a, int64x2_t b);
2646+
// Signed 64-bit compare, two lanes: each result lane is all ones when
// a > b in that lane and zero otherwise.
FORCE_INLINE uint64x2_t vcgtq_s64(int64x2_t a, int64x2_t b) {
  uint64x2_t lanes_clear = vdupq_n_u64(0x0);
  uint64x2_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > b (signed), evaluated over 2 elements.
  vbool64_t gt_mask = __riscv_vmsgt_vv_i64m1_b64(a, b, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, gt_mask, 2);
}
26442650

2645-
// FORCE_INLINE uint64x1_t vcgt_u64(uint64x1_t a, uint64x1_t b);
2651+
// Unsigned 64-bit compare, one lane: the result lane is all ones when a > b
// and zero otherwise (equal inputs yield zero).
FORCE_INLINE uint64x1_t vcgt_u64(uint64x1_t a, uint64x1_t b) {
  // Strict unsigned greater-than (vmsgtu). The >= form (vmsgeu) would report
  // equal lanes as true, which is not what a "compare greater than" returns.
  vbool64_t cmp_res = __riscv_vmsgtu_vv_u64m1_b64(a, b, 1);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(vdupq_n_u64(0x0), vdupq_n_u64(UINT64_MAX), cmp_res, 1);
}
26462655

2647-
// FORCE_INLINE uint64x2_t vcgtq_u64(uint64x2_t a, uint64x2_t b);
2656+
// Unsigned 64-bit compare, two lanes: each result lane is all ones when
// a > b in that lane and zero otherwise (equal inputs yield zero).
FORCE_INLINE uint64x2_t vcgtq_u64(uint64x2_t a, uint64x2_t b) {
  // Strict unsigned greater-than (vmsgtu). The >= form (vmsgeu) would report
  // equal lanes as true, which is not what a "compare greater than" returns.
  vbool64_t cmp_res = __riscv_vmsgtu_vv_u64m1_b64(a, b, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(vdupq_n_u64(0x0), vdupq_n_u64(UINT64_MAX), cmp_res, 2);
}
26482660

2649-
// FORCE_INLINE uint64x1_t vcgt_f64(float64x1_t a, float64x1_t b);
2661+
// Double-precision compare, one lane: the result lane is all ones when
// a > b and zero otherwise.
FORCE_INLINE uint64x1_t vcgt_f64(float64x1_t a, float64x1_t b) {
  uint64x1_t lanes_clear = vdupq_n_u64(0x0);
  uint64x1_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > b (floating-point), evaluated over 1 element.
  vbool64_t gt_mask = __riscv_vmfgt_vv_f64m1_b64(a, b, 1);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, gt_mask, 1);
}
26502665

2651-
// FORCE_INLINE uint64x2_t vcgtq_f64(float64x2_t a, float64x2_t b);
2666+
// Double-precision compare, two lanes: each result lane is all ones when
// a > b in that lane and zero otherwise.
FORCE_INLINE uint64x2_t vcgtq_f64(float64x2_t a, float64x2_t b) {
  uint64x2_t lanes_clear = vdupq_n_u64(0x0);
  uint64x2_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > b (floating-point), evaluated over 2 elements.
  vbool64_t gt_mask = __riscv_vmfgt_vv_f64m1_b64(a, b, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, gt_mask, 2);
}
26522670

2653-
// FORCE_INLINE uint64_t vcgtd_s64(int64_t a, int64_t b);
2671+
// Scalar signed 64-bit compare: returns UINT64_MAX when a > b, otherwise 0.
FORCE_INLINE uint64_t vcgtd_s64(int64_t a, int64_t b) {
  if (a > b) {
    return UINT64_MAX;
  }
  return 0x0;
}
26542672

2655-
// FORCE_INLINE uint64_t vcgtd_u64(uint64_t a, uint64_t b);
2673+
// Scalar unsigned 64-bit compare: returns UINT64_MAX when a > b, otherwise 0.
FORCE_INLINE uint64_t vcgtd_u64(uint64_t a, uint64_t b) {
  if (a > b) {
    return UINT64_MAX;
  }
  return 0x0;
}
26562674

2657-
// FORCE_INLINE uint32_t vcgts_f32(float32_t a, float32_t b);
2675+
// Scalar single-precision compare: returns UINT32_MAX when a > b,
// otherwise 0.
FORCE_INLINE uint32_t vcgts_f32(float32_t a, float32_t b) {
  if (a > b) {
    return UINT32_MAX;
  }
  return 0x0;
}
26582676

2659-
// FORCE_INLINE uint64_t vcgtd_f64(float64_t a, float64_t b);
2677+
// Scalar double-precision compare: returns UINT64_MAX when a > b,
// otherwise 0.
FORCE_INLINE uint64_t vcgtd_f64(float64_t a, float64_t b) {
  if (a > b) {
    return UINT64_MAX;
  }
  return 0x0;
}
26602678

2661-
// FORCE_INLINE uint8x8_t vcgtz_s8(int8x8_t a);
2679+
// Signed 8-bit compare against zero, eight lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint8x8_t vcgtz_s8(int8x8_t a) {
  uint8x8_t lanes_clear = vdup_n_u8(0x0);
  uint8x8_t lanes_set = vdup_n_u8(UINT8_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 8 elements.
  vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(a, 0, 8);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u8m1(lanes_clear, lanes_set, positive_mask, 8);
}
26622683

2663-
// FORCE_INLINE uint8x16_t vcgtzq_s8(int8x16_t a);
2684+
// Signed 8-bit compare against zero, sixteen lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint8x16_t vcgtzq_s8(int8x16_t a) {
  uint8x16_t lanes_clear = vdupq_n_u8(0x0);
  uint8x16_t lanes_set = vdupq_n_u8(UINT8_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 16 elements.
  vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(a, 0, 16);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u8m1(lanes_clear, lanes_set, positive_mask, 16);
}
26642688

2665-
// FORCE_INLINE uint16x4_t vcgtz_s16(int16x4_t a);
2689+
// Signed 16-bit compare against zero, four lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint16x4_t vcgtz_s16(int16x4_t a) {
  uint16x4_t lanes_clear = vdup_n_u16(0x0);
  uint16x4_t lanes_set = vdup_n_u16(UINT16_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 4 elements.
  vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(a, 0, 4);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u16m1(lanes_clear, lanes_set, positive_mask, 4);
}
26662693

2667-
// FORCE_INLINE uint16x8_t vcgtzq_s16(int16x8_t a);
2694+
// Signed 16-bit compare against zero, eight lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint16x8_t vcgtzq_s16(int16x8_t a) {
  uint16x8_t lanes_clear = vdupq_n_u16(0x0);
  uint16x8_t lanes_set = vdupq_n_u16(UINT16_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 8 elements.
  vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(a, 0, 8);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u16m1(lanes_clear, lanes_set, positive_mask, 8);
}
26682698

2669-
// FORCE_INLINE uint32x2_t vcgtz_s32(int32x2_t a);
2699+
// Signed 32-bit compare against zero, two lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint32x2_t vcgtz_s32(int32x2_t a) {
  uint32x2_t lanes_clear = vdup_n_u32(0x0);
  uint32x2_t lanes_set = vdup_n_u32(UINT32_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 2 elements.
  vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(a, 0, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u32m1(lanes_clear, lanes_set, positive_mask, 2);
}
26702703

2671-
// FORCE_INLINE uint32x4_t vcgtzq_s32(int32x4_t a);
2704+
// Signed 32-bit compare against zero, four lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint32x4_t vcgtzq_s32(int32x4_t a) {
  uint32x4_t lanes_clear = vdupq_n_u32(0x0);
  uint32x4_t lanes_set = vdupq_n_u32(UINT32_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 4 elements.
  vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(a, 0, 4);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u32m1(lanes_clear, lanes_set, positive_mask, 4);
}
26722708

2673-
// FORCE_INLINE uint64x1_t vcgtz_s64(int64x1_t a);
2709+
// Signed 64-bit compare against zero, one lane: the result lane is all ones
// when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint64x1_t vcgtz_s64(int64x1_t a) {
  uint64x1_t lanes_clear = vdupq_n_u64(0x0);
  uint64x1_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 1 element.
  vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(a, 0, 1);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, positive_mask, 1);
}
26742713

2675-
// FORCE_INLINE uint64x2_t vcgtzq_s64(int64x2_t a);
2714+
// Signed 64-bit compare against zero, two lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint64x2_t vcgtzq_s64(int64x2_t a) {
  uint64x2_t lanes_clear = vdupq_n_u64(0x0);
  uint64x2_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > 0 (signed), evaluated over 2 elements.
  vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(a, 0, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, positive_mask, 2);
}
26762718

2677-
// FORCE_INLINE uint32x2_t vcgtz_f32(float32x2_t a);
2719+
// Single-precision compare against zero, two lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint32x2_t vcgtz_f32(float32x2_t a) {
  uint32x2_t lanes_clear = vdup_n_u32(0x0);
  uint32x2_t lanes_set = vdup_n_u32(UINT32_MAX);
  // Mask bit is set where a > 0 (floating-point), evaluated over 2 elements.
  vbool32_t positive_mask = __riscv_vmfgt_vf_f32m1_b32(a, 0, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u32m1(lanes_clear, lanes_set, positive_mask, 2);
}
26782723

2679-
// FORCE_INLINE uint32x4_t vcgtzq_f32(float32x4_t a);
2724+
// Single-precision compare against zero, four lanes: each result lane is
// all ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint32x4_t vcgtzq_f32(float32x4_t a) {
  uint32x4_t lanes_clear = vdupq_n_u32(0x0);
  uint32x4_t lanes_set = vdupq_n_u32(UINT32_MAX);
  // Mask bit is set where a > 0 (floating-point), evaluated over 4 elements.
  vbool32_t positive_mask = __riscv_vmfgt_vf_f32m1_b32(a, 0, 4);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u32m1(lanes_clear, lanes_set, positive_mask, 4);
}
26802728

2681-
// FORCE_INLINE uint64x1_t vcgtz_f64(float64x1_t a);
2729+
// Double-precision compare against zero, one lane: the result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint64x1_t vcgtz_f64(float64x1_t a) {
  uint64x1_t lanes_clear = vdupq_n_u64(0x0);
  uint64x1_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > 0 (floating-point), evaluated over 1 element.
  vbool64_t positive_mask = __riscv_vmfgt_vf_f64m1_b64(a, 0, 1);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, positive_mask, 1);
}
26822733

2683-
// FORCE_INLINE uint64x2_t vcgtzq_f64(float64x2_t a);
2734+
// Double-precision compare against zero, two lanes: each result lane is all
// ones when the input lane is > 0 and zero otherwise.
FORCE_INLINE uint64x2_t vcgtzq_f64(float64x2_t a) {
  uint64x2_t lanes_clear = vdupq_n_u64(0x0);
  uint64x2_t lanes_set = vdupq_n_u64(UINT64_MAX);
  // Mask bit is set where a > 0 (floating-point), evaluated over 2 elements.
  vbool64_t positive_mask = __riscv_vmfgt_vf_f64m1_b64(a, 0, 2);
  // Pick the all-ones value where the mask is set, zero elsewhere.
  return __riscv_vmerge_vvm_u64m1(lanes_clear, lanes_set, positive_mask, 2);
}
26842738

2685-
// FORCE_INLINE uint64_t vcgtzd_s64(int64_t a);
2739+
// Scalar signed 64-bit compare against zero: returns UINT64_MAX when a > 0,
// otherwise 0.
FORCE_INLINE uint64_t vcgtzd_s64(int64_t a) {
  if (a > 0) {
    return UINT64_MAX;
  }
  return 0x00;
}
26862740

2687-
// FORCE_INLINE uint32_t vcgtzs_f32(float32_t a);
2741+
// Scalar single-precision compare against zero: returns UINT32_MAX when
// a > 0, otherwise 0.
FORCE_INLINE uint32_t vcgtzs_f32(float32_t a) {
  if (a > 0) {
    return UINT32_MAX;
  }
  return 0x00;
}
26882742

2689-
// FORCE_INLINE uint64_t vcgtzd_f64(float64_t a);
2743+
// Scalar double-precision compare against zero: returns UINT64_MAX when
// a > 0, otherwise 0.
FORCE_INLINE uint64_t vcgtzd_f64(float64_t a) {
  if (a > 0) {
    return UINT64_MAX;
  }
  return 0x00;
}
26902744

26912745
FORCE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) {
26922746
vbool8_t cmp_res = __riscv_vmsgtu_vv_u8m1_b8(a, b, 16);

0 commit comments

Comments
 (0)