Skip to content

Commit 1efb50d

Browse files
CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.
This patch implements, for aarch64, the following wasm SIMD extensions: the v128.load32_zero and v128.load64_zero instructions (WebAssembly/simd#237). The changes are straightforward: * no new CLIF instructions. They are translated into an existing CLIF scalar load followed by a CLIF `scalar_to_vector`. * the comment/specification for CLIF `scalar_to_vector` has been changed to match the actual intended semantics, per consultation with Andrew Brown. * translation from `scalar_to_vector` to the obvious aarch64 insns. * special-case zero in `lower_constant_f128` in order to avoid a potentially slow call to `Inst::load_fp_constant128`. * Once "Allow loads to merge into other operations during instruction selection in MachInst backends" (bytecodealliance#2340) lands, we can use that functionality to pattern match the two-CLIF pair and emit a single AArch64 instruction. There is no testcase in this commit, because that is a separate repo. The implementation has been tested, nevertheless.
1 parent 5a5fb11 commit 1efb50d

File tree

4 files changed

+58
-17
lines changed

4 files changed

+58
-17
lines changed

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
37983798
Inst::new(
37993799
"scalar_to_vector",
38003800
r#"
3801-
Scalar To Vector -- move a value out of a scalar register and into a vector register; the
3802-
scalar will be moved to the lowest-order bits of the vector register. Note that this
3803-
instruction is intended as a low-level legalization instruction and frontends should prefer
3804-
insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
3805-
types (e.g. integers) but not for others (e.g. floats).
3806-
"#,
3801+
Copies a scalar value to a vector value. The scalar is copied into the
3802+
least significant lane of the vector, and all other lanes will be zero.
3803+
"#,
38073804
&formats.unary,
38083805
)
38093806
.operands_in(vec![s])

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
837837
rd: Writable<Reg>,
838838
value: u128,
839839
) {
840-
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
841-
842-
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
843-
ctx.emit(inst);
840+
if value == 0 {
841+
// Fast-track a common case. The general case, viz, calling `Inst::load_fp_constant128`,
842+
// is potentially expensive.
843+
ctx.emit(Inst::VecDupImm {
844+
rd,
845+
imm: ASIMDMovModImm::zero(),
846+
invert: false,
847+
size: VectorSize::Size8x16,
848+
});
849+
} else {
850+
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
851+
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
852+
ctx.emit(inst);
853+
}
844854
}
845855
}
846856

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,31 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
20562056
}
20572057
}
20582058

2059+
Opcode::ScalarToVector => {
2060+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2061+
let rd = get_output_reg(ctx, outputs[0]);
2062+
let input_ty = ctx.input_ty(insn, 0);
2063+
let size = VectorSize::from_ty(ty.unwrap());
2064+
if (input_ty == I32 && ty.unwrap() == I32X4)
2065+
|| (input_ty == I64 && ty.unwrap() == I64X2)
2066+
{
2067+
//ctx.emit(Inst::VecMovImmZero { rd });
2068+
lower_constant_f128(ctx, rd, 0);
2069+
ctx.emit(Inst::MovToVec {
2070+
rd,
2071+
rn,
2072+
idx: 0,
2073+
size,
2074+
});
2075+
ctx.emit(Inst::Brk);
2076+
} else {
2077+
return Err(CodegenError::Unsupported(format!(
2078+
"ScalarToVector: unsupported types {:?} -> {:?}",
2079+
input_ty, ty
2080+
)));
2081+
}
2082+
}
2083+
20592084
Opcode::VanyTrue | Opcode::VallTrue => {
20602085
let rd = get_output_reg(ctx, outputs[0]);
20612086
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2366,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
23412366

23422367
Opcode::Vsplit
23432368
| Opcode::Vconcat
2344-
| Opcode::ScalarToVector
23452369
| Opcode::Uload8x8Complex
23462370
| Opcode::Sload8x8Complex
23472371
| Opcode::Uload16x4Complex

cranelift/wasm/src/code_translator.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,18 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
14261426
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
14271427
state.push1(dfg.first_result(load))
14281428
}
1429+
Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
1430+
translate_load(
1431+
memarg,
1432+
ir::Opcode::Load,
1433+
type_of(op).lane_type(),
1434+
builder,
1435+
state,
1436+
environ,
1437+
)?;
1438+
let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
1439+
state.push1(as_vector)
1440+
}
14291441
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
14301442
let vector = pop1_with_bitcast(state, type_of(op), builder);
14311443
let extracted = builder.ins().extractlane(vector, lane.clone());
@@ -1790,10 +1802,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
17901802
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
17911803
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
17921804
}
1793-
1794-
Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
1795-
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
1796-
}
17971805
};
17981806
Ok(())
17991807
}
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
25162524
| Operator::I32x4MaxU
25172525
| Operator::F32x4ConvertI32x4S
25182526
| Operator::F32x4ConvertI32x4U
2519-
| Operator::I32x4Bitmask => I32X4,
2527+
| Operator::I32x4Bitmask
2528+
| Operator::V128Load32Zero { .. } => I32X4,
25202529

25212530
Operator::I64x2Splat
25222531
| Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@ fn type_of(operator: &Operator) -> Type {
25282537
| Operator::I64x2ShrU
25292538
| Operator::I64x2Add
25302539
| Operator::I64x2Sub
2531-
| Operator::I64x2Mul => I64X2,
2540+
| Operator::I64x2Mul
2541+
| Operator::V128Load64Zero { .. } => I64X2,
25322542

25332543
Operator::F32x4Splat
25342544
| Operator::F32x4ExtractLane { .. }

0 commit comments

Comments
 (0)