Skip to content

RISC-V generates worse code than AArch64 for simple memset style loop at -Os #67595

Open
@hiraditya

Description

@hiraditya

Derived from: #66652

#include<stdint.h>
#include<stddef.h>
/* Reproducer: store v into each of the first l elements of a (a[0] .. a[l-1]).
 * When l is 0 the loop body never runs and nothing is written. */
void fill_i16(int16_t* a, int16_t v, size_t l) {
  for (size_t i = 0; i < l; i++) a[i] = v;
}

Output of `riscv-clang -Os -march=rv64gcv_zba_zbb_zbs`:

# Compiler output for fill_i16(int16_t *a, int16_t v, size_t l) on RV64 + V.
# Assuming the standard RISC-V calling convention: a0 = a, a1 = v, a2 = l.
# Structure: early exit when l == 0, a scalar fallback loop (.LBB0_2), and a
# masked vector loop (.LBB0_3 sets up, .LBB0_4 iterates).
fill_i16:                               # @fill_i16
        beqz    a2, .LBB0_5             # l == 0: nothing to store
        not     a4, a2                  # a4 = ~l
        csrr    a7, vlenb               # a7 = vlenb (vector register length in bytes)
        bgeu    a4, a7, .LBB0_3         # ~l >= vlenb (unsigned), i.e. l + vlenb does not wrap: use the vector loop
# Scalar fallback: reached only when rounding l up by vlenb would overflow.
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        sh      a1, 0(a0)               # *a = v (16-bit store)
        addi    a2, a2, -1              # one fewer element remaining
        addi    a0, a0, 2               # advance pointer by one int16_t
        bnez    a2, .LBB0_2             # repeat until the remaining count is 0
        j       .LBB0_5
# Vector path setup.
.LBB0_3:
        li      a4, 0                   # a4 = element index of the current chunk, starts at 0
        srli    a6, a7, 3               # a6 = vlenb / 8
        neg     a5, a7                  # a5 = -vlenb (acts as an alignment mask if vlenb is a power of two)
        add     a3, a7, a2              # a3 = l + vlenb
        addi    a3, a3, -1              # a3 = l + vlenb - 1
        and     a5, a5, a3              # a5 = l rounded up to a multiple of vlenb = loop end index
        vsetvli a3, zero, e16, m2, ta, ma   # vl = VLMAX for SEW=16, LMUL=2 (VLEN/8 = vlenb elements); a3 result is not read again
        vmv.v.x v8, a1                  # broadcast v into every 16-bit element of v8
        slli    a1, a6, 4               # a1 = (vlenb / 8) * 16 = bytes stored per iteration; v is no longer needed in a1
        vsetvli zero, zero, e64, m8, ta, ma # keep vl, switch to SEW=64, LMUL=8 (same SEW/LMUL ratio) for index math
        vid.v   v16                     # v16 = element indices 0, 1, 2, ... as 64-bit values
# Vector loop: every iteration builds a mask of in-range elements and does a
# masked store, so the tail is handled by the mask rather than a separate loop.
.LBB0_4:                                # =>This Inner Loop Header: Depth=1
        vsaddu.vx       v24, v16, a4    # v24 = lane index + a4 (unsigned saturating add)
        vmsltu.vx       v0, v24, a2     # v0 mask bit set where absolute index < l (unsigned)
        vse16.v v8, (a0), v0.t          # store v as 16-bit elements only where the mask is set
        add     a4, a4, a7              # index += vlenb elements
        add     a0, a0, a1              # pointer += bytes per iteration
        bne     a5, a4, .LBB0_4         # loop until index reaches the rounded-up length
.LBB0_5:
        ret

Output of `arm-clang -Os -march=armv8-a+sve`:

// Compiler output for fill_i16(int16_t *a, int16_t v, size_t l) on AArch64 + SVE.
// Assuming AAPCS64 argument passing: x0 = a, w1 = v, x2 = l.
// A single predicated loop covers both full vectors and the tail.
fill_i16:                               // @fill_i16
        cbz     x2, .LBB0_3             // l == 0: nothing to store
        cnth    x8                      // x8 = number of 16-bit elements per SVE vector
        mov     z0.h, w1                // broadcast v into every halfword lane of z0
        mov     x10, xzr                // x10 = element index, starts at 0
        subs    x9, x2, x8              // x9 = l - cnth, setting flags
        csel    x9, xzr, x9, lo         // clamp to 0 if l < cnth (unsigned): x9 = saturating l - cnth
        whilelo p0.h, xzr, x2           // first predicate: lanes whose index is < l
.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        st1h    { z0.h }, p0, [x0, x10, lsl #1] // store active lanes at a + index * 2 bytes
        whilelo p0.h, x10, x9           // predicate for the next chunk: lanes with index + lane < l - cnth; sets flags
        add     x10, x10, x8            // index += cnth (add leaves the flags untouched)
        b.mi    .LBB0_2                 // continue while N is set, i.e. the first lane of the new predicate is active
.LBB0_3:
        ret

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions