Skip to content

Commit 8669aba

Browse files
julian-seward1 authored and cfallin committed
CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.
This patch implements, for aarch64, the following wasm SIMD extensions. v128.load32_zero and v128.load64_zero instructions WebAssembly/simd#237 The changes are straightforward: * no new CLIF instructions. They are translated into an existing CLIF scalar load followed by a CLIF `scalar_to_vector`. * the comment/specification for CLIF `scalar_to_vector` has been changed to match the actual intended semantics, per consultation with Andrew Brown. * translation from `scalar_to_vector` to aarch64 `fmov` instruction. This has been generalised slightly so as to allow both 32- and 64-bit transfers. * special-case zero in `lower_constant_f128` in order to avoid a potentially slow call to `Inst::load_fp_constant128`. * Once "Allow loads to merge into other operations during instruction selection in MachInst backends" (#2340) lands, we can use that functionality to pattern match the two-CLIF pair and emit a single AArch64 instruction. * A simple filetest has been added. There is no comprehensive testcase in this commit, because that is a separate repo. The implementation has been tested, nevertheless.
1 parent ee7b035 commit 8669aba

File tree

9 files changed

+144
-35
lines changed

9 files changed

+144
-35
lines changed

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
37983798
Inst::new(
37993799
"scalar_to_vector",
38003800
r#"
3801-
Scalar To Vector -- move a value out of a scalar register and into a vector register; the
3802-
scalar will be moved to the lowest-order bits of the vector register. Note that this
3803-
instruction is intended as a low-level legalization instruction and frontends should prefer
3804-
insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
3805-
types (e.g. integers) but not for others (e.g. floats).
3806-
"#,
3801+
Copies a scalar value to a vector value. The scalar is copied into the
3802+
least significant lane of the vector, and all other lanes will be zero.
3803+
"#,
38073804
&formats.unary,
38083805
)
38093806
.operands_in(vec![s])

cranelift/codegen/src/isa/aarch64/inst/args.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,15 @@ impl ScalarSize {
579579
}
580580
}
581581

582+
/// Convert to an integer operand size.
583+
pub fn operand_size(&self) -> OperandSize {
584+
match self {
585+
ScalarSize::Size32 => OperandSize::Size32,
586+
ScalarSize::Size64 => OperandSize::Size64,
587+
_ => panic!("Unexpected operand_size request for: {:?}", self),
588+
}
589+
}
590+
582591
/// Convert from a type into the smallest size that fits.
583592
pub fn from_ty(ty: Type) -> ScalarSize {
584593
Self::from_bits(ty_bits(ty))

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,12 +1651,13 @@ impl MachInstEmit for Inst {
16511651
};
16521652
sink.put4(enc_fround(top22, rd, rn));
16531653
}
1654-
&Inst::MovToFpu { rd, rn } => {
1655-
sink.put4(
1656-
0b100_11110_01_1_00_111_000000_00000_00000
1657-
| (machreg_to_gpr(rn) << 5)
1658-
| machreg_to_vec(rd.to_reg()),
1659-
);
1654+
&Inst::MovToFpu { rd, rn, size } => {
1655+
let template = match size {
1656+
ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
1657+
ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
1658+
_ => unreachable!(),
1659+
};
1660+
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
16601661
}
16611662
&Inst::MovToVec { rd, rn, idx, size } => {
16621663
let (imm5, shift) = match size.lane_size() {

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1860,10 +1860,20 @@ fn test_aarch64_binemit() {
18601860
Inst::MovToFpu {
18611861
rd: writable_vreg(31),
18621862
rn: xreg(0),
1863+
size: ScalarSize::Size64,
18631864
},
18641865
"1F00679E",
18651866
"fmov d31, x0",
18661867
));
1868+
insns.push((
1869+
Inst::MovToFpu {
1870+
rd: writable_vreg(1),
1871+
rn: xreg(28),
1872+
size: ScalarSize::Size32,
1873+
},
1874+
"8103271E",
1875+
"fmov s1, w28",
1876+
));
18671877
insns.push((
18681878
Inst::MovToVec {
18691879
rd: writable_vreg(0),

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -877,10 +877,13 @@ pub enum Inst {
877877
rn: Reg,
878878
},
879879

880-
/// Move from a GPR to a scalar FP register.
880+
/// Move from a GPR to a vector register. The scalar value is parked in the lowest lane
881+
/// of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
882+
/// transactions are supported.
881883
MovToFpu {
882884
rd: Writable<Reg>,
883885
rn: Reg,
886+
size: ScalarSize,
884887
},
885888

886889
/// Move to a vector element from a GPR.
@@ -1319,13 +1322,15 @@ impl Inst {
13191322
size: VectorSize::Size8x8
13201323
}]
13211324
} else {
1322-
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
1325+
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
1326+
// bits.
13231327
let tmp = alloc_tmp(RegClass::I64, I32);
13241328
let mut insts = Inst::load_constant(tmp, value as u64);
13251329

13261330
insts.push(Inst::MovToFpu {
13271331
rd,
13281332
rn: tmp.to_reg(),
1333+
size: ScalarSize::Size64,
13291334
});
13301335

13311336
insts
@@ -1340,16 +1345,17 @@ impl Inst {
13401345
) -> SmallVec<[Inst; 4]> {
13411346
if let Ok(const_data) = u32::try_from(const_data) {
13421347
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
1343-
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
1344-
// Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
1345-
// might also be an option.
1348+
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
1349+
// bits. Also, treat it as half of a 128-bit vector and consider replicated
1350+
// patterns. Scalar MOVI might also be an option.
13461351
} else if const_data & (u32::MAX as u64) == 0 {
13471352
let tmp = alloc_tmp(RegClass::I64, I64);
13481353
let mut insts = Inst::load_constant(tmp, const_data);
13491354

13501355
insts.push(Inst::MovToFpu {
13511356
rd,
13521357
rn: tmp.to_reg(),
1358+
size: ScalarSize::Size64,
13531359
});
13541360

13551361
insts
@@ -1849,7 +1855,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
18491855
collector.add_def(rd);
18501856
collector.add_use(rn);
18511857
}
1852-
&Inst::MovToFpu { rd, rn } => {
1858+
&Inst::MovToFpu { rd, rn, .. } => {
18531859
collector.add_def(rd);
18541860
collector.add_use(rn);
18551861
}
@@ -2527,6 +2533,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
25272533
&mut Inst::MovToFpu {
25282534
ref mut rd,
25292535
ref mut rn,
2536+
..
25302537
} => {
25312538
map_def(mapper, rd);
25322539
map_use(mapper, rn);
@@ -3406,9 +3413,10 @@ impl Inst {
34063413
let rn = show_vreg_scalar(rn, mb_rru, size);
34073414
format!("{} {}, {}", inst, rd, rn)
34083415
}
3409-
&Inst::MovToFpu { rd, rn } => {
3410-
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
3411-
let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size64);
3416+
&Inst::MovToFpu { rd, rn, size } => {
3417+
let operand_size = size.operand_size();
3418+
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
3419+
let rn = show_ireg_sized(rn, mb_rru, operand_size);
34123420
format!("fmov {}, {}", rd, rn)
34133421
}
34143422
&Inst::MovToVec { rd, rn, idx, size } => {

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
837837
rd: Writable<Reg>,
838838
value: u128,
839839
) {
840-
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
841-
842-
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
843-
ctx.emit(inst);
840+
if value == 0 {
841+
// Fast-track a common case. The general case, viz, calling `Inst::load_fp_constant128`,
842+
// is potentially expensive.
843+
ctx.emit(Inst::VecDupImm {
844+
rd,
845+
imm: ASIMDMovModImm::zero(),
846+
invert: false,
847+
size: VectorSize::Size8x16,
848+
});
849+
} else {
850+
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
851+
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
852+
ctx.emit(inst);
853+
}
844854
}
845855
}
846856

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
179179
let vb = ctx.alloc_tmp(RegClass::V128, I128);
180180
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
181181
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
182-
ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
183-
ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
182+
ctx.emit(Inst::MovToFpu {
183+
rd: va,
184+
rn: ra,
185+
size: ScalarSize::Size64,
186+
});
187+
ctx.emit(Inst::MovToFpu {
188+
rd: vb,
189+
rn: rb,
190+
size: ScalarSize::Size64,
191+
});
184192
ctx.emit(Inst::FpuRRR {
185193
fpu_op,
186194
rd: va,
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
17031711
}
17041712
(false, true) => {
17051713
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
1706-
ctx.emit(Inst::MovToFpu { rd, rn });
1714+
ctx.emit(Inst::MovToFpu {
1715+
rd,
1716+
rn,
1717+
size: ScalarSize::Size64,
1718+
});
17071719
}
17081720
(true, false) => {
17091721
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
20562068
}
20572069
}
20582070

2071+
Opcode::ScalarToVector => {
2072+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2073+
let rd = get_output_reg(ctx, outputs[0]);
2074+
let input_ty = ctx.input_ty(insn, 0);
2075+
if (input_ty == I32 && ty.unwrap() == I32X4)
2076+
|| (input_ty == I64 && ty.unwrap() == I64X2)
2077+
{
2078+
ctx.emit(Inst::MovToFpu {
2079+
rd,
2080+
rn,
2081+
size: ScalarSize::from_ty(input_ty),
2082+
});
2083+
} else {
2084+
return Err(CodegenError::Unsupported(format!(
2085+
"ScalarToVector: unsupported types {:?} -> {:?}",
2086+
input_ty, ty
2087+
)));
2088+
}
2089+
}
2090+
20592091
Opcode::VanyTrue | Opcode::VallTrue => {
20602092
let rd = get_output_reg(ctx, outputs[0]);
20612093
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
23412373

23422374
Opcode::Vsplit
23432375
| Opcode::Vconcat
2344-
| Opcode::ScalarToVector
23452376
| Opcode::Uload8x8Complex
23462377
| Opcode::Sload8x8Complex
23472378
| Opcode::Uload16x4Complex
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
test compile
2+
target aarch64
3+
4+
function %f1() -> i64x2 {
5+
block0:
6+
v0 = iconst.i64 281474976710657
7+
v1 = scalar_to_vector.i64x2 v0
8+
return v1
9+
}
10+
11+
; check: stp fp, lr, [sp, #-16]!
12+
; nextln: mov fp, sp
13+
; nextln: movz x0, #1
14+
; nextln: movk x0, #1, LSL #48
15+
; nextln: fmov d0, x0
16+
; nextln: mov sp, fp
17+
; nextln: ldp fp, lr, [sp], #16
18+
; nextln: ret
19+
20+
function %f2() -> i32x4 {
21+
block0:
22+
v0 = iconst.i32 42679
23+
v1 = scalar_to_vector.i32x4 v0
24+
return v1
25+
}
26+
27+
; check: stp fp, lr, [sp, #-16]!
28+
; nextln: mov fp, sp
29+
; nextln: movz x0, #42679
30+
; nextln: fmov s0, w0
31+
; nextln: mov sp, fp
32+
; nextln: ldp fp, lr, [sp], #16
33+
; nextln: ret

cranelift/wasm/src/code_translator.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,18 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
14261426
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
14271427
state.push1(dfg.first_result(load))
14281428
}
1429+
Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
1430+
translate_load(
1431+
memarg,
1432+
ir::Opcode::Load,
1433+
type_of(op).lane_type(),
1434+
builder,
1435+
state,
1436+
environ,
1437+
)?;
1438+
let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
1439+
state.push1(as_vector)
1440+
}
14291441
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
14301442
let vector = pop1_with_bitcast(state, type_of(op), builder);
14311443
let extracted = builder.ins().extractlane(vector, lane.clone());
@@ -1790,10 +1802,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
17901802
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
17911803
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
17921804
}
1793-
1794-
Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
1795-
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
1796-
}
17971805
};
17981806
Ok(())
17991807
}
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
25162524
| Operator::I32x4MaxU
25172525
| Operator::F32x4ConvertI32x4S
25182526
| Operator::F32x4ConvertI32x4U
2519-
| Operator::I32x4Bitmask => I32X4,
2527+
| Operator::I32x4Bitmask
2528+
| Operator::V128Load32Zero { .. } => I32X4,
25202529

25212530
Operator::I64x2Splat
25222531
| Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@ fn type_of(operator: &Operator) -> Type {
25282537
| Operator::I64x2ShrU
25292538
| Operator::I64x2Add
25302539
| Operator::I64x2Sub
2531-
| Operator::I64x2Mul => I64X2,
2540+
| Operator::I64x2Mul
2541+
| Operator::V128Load64Zero { .. } => I64X2,
25322542

25332543
Operator::F32x4Splat
25342544
| Operator::F32x4ExtractLane { .. }

0 commit comments

Comments
 (0)