Closed
Description
These two functions compute the same result for every input, so the compiler should probably be able to rewrite either one into the other, depending on which form is cheaper on the target hardware (Godbolt link):
/// Multiplies `x` by the lowest set bit of `y`, with wrapping arithmetic.
/// When `y` is zero no bit is set, so the result is zero.
export fn foo(x: u64, y: u64) u64 {
    // `~y +% 1` is the two's-complement negation of y; AND-ing it with y
    // keeps only the least significant set bit (or 0 when y == 0).
    const lowest_set_bit: u64 = y & (~y +% 1);
    return x *% lowest_set_bit;
}
/// Shifts `x` left by the number of trailing zero bits in `y`.
/// Returns zero when `y` is zero.
export fn bar(x: u64, y: u64) u64 {
    // Guard first: @ctz(0) is 64, which does not fit the u6 shift amount.
    if (y == 0) return 0;

    const shift: u6 = @intCast(@ctz(y));
    return x << shift;
}
x86-64 output when targeting znver4:
# foo(x, y) = x *% (y & -y), x86-64 compiler output (Intel syntax).
# In:  rdi = x, rsi = y (first two integer argument registers; presumably
#      the System V AMD64 convention - confirm against the build target)
# Out: rax = result
foo:
blsi rax, rsi    # rax = y & -y: isolate the lowest set bit of y (0 if y == 0)
imul rax, rdi    # rax = rax * x, keeping the low 64 bits (wrapping multiply)
ret
# bar(x, y) = (y == 0) ? 0 : x << ctz(y), x86-64 compiler output (Intel syntax).
# In:  rdi = x, rsi = y (first two integer argument registers; presumably
#      the System V AMD64 convention - confirm against the build target)
# Out: rax = result
# Branch-free: the y == 0 case is handled with a conditional move.
bar:
tzcnt rax, rsi        # rax = trailing zero count of y; sets CF when y == 0
shlx rax, rdi, rax    # rax = x << rax; SHLX leaves flags untouched, so CF survives
cmovb rax, rsi        # if CF (y == 0): rax = rsi, which is 0 on that path
ret
RISC-V output when targeting sifive_u74:
# foo(x, y) = x *% (y & -y), RISC-V compiler output.
# In:  a0 = x, a1 = y
# Out: a0 = result
foo:
neg a2, a1        # a2 = -y
and a1, a1, a2    # a1 = y & -y: lowest set bit of y (0 if y == 0)
mul a0, a1, a0    # a0 = (y & -y) * x, low 64 bits
ret
# Constant pool for bar: no count-trailing-zeros instruction appears in this
# listing; the count is instead computed with a multiply-and-table-lookup
# sequence (the shape of a de Bruijn-style lookup).
# .LCPI1_0: 64-bit multiplier applied to the isolated lowest set bit.
.LCPI1_0:
.quad 151050438420815295
# .LCPI1_1: 64-entry byte table, indexed by the top 6 bits of the product;
# each entry is the trailing-zero count for that index.
.LCPI1_1:
.ascii "\000\001\002\007\003\r\b\023\004\031\016\034\t\"\024(\005\021\032&\017.\0350\n\037#6\0252)9?\006\f\022\030\033!'\020%-/\036518>\013\027 $,47=\026+3<*;:"
# bar(x, y) = (y == 0) ? 0 : x << ctz(y), RISC-V compiler output.
# In:  a0 = x, a1 = y
# Out: a0 = result
bar:
beqz a1, .LBB1_2              # y == 0: take the early-out that returns 0
lui a2, %hi(.LCPI1_0)         # upper part of the multiplier's address
neg a3, a1                    # a3 = -y
and a1, a1, a3                # a1 = y & -y: lowest set bit of y
ld a2, %lo(.LCPI1_0)(a2)      # a2 = 64-bit multiplier constant
mul a1, a1, a2                # product's top 6 bits identify the bit position
lui a2, %hi(.LCPI1_1)         # upper part of the lookup table's address
srli a1, a1, 58               # a1 = top 6 bits of the product (index 0..63)
addi a2, a2, %lo(.LCPI1_1)    # a2 = address of the lookup table
add a1, a1, a2                # a1 = &table[index]
lbu a1, 0(a1)                 # a1 = table[index] = trailing zero count of y
sll a0, a0, a1                # a0 = x << ctz(y)
ret
.LBB1_2:                      # y == 0 path
li a0, 0                      # result is 0
ret