CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.

julian-seward1 · julian-seward1 · commit 1efb50d44a86 · 2020-11-03T17:34:07.000+01:00
This patch implements, for aarch64, the following wasm SIMD extensions. v128.load32_zero and v128.load64_zero instructions WebAssembly/simd#237 The changes are straightforward: * no new CLIF instructions. They are translated into an existing CLIF scalar load followed by a CLIF `scalar_to_vector`. * the comment/specification for CLIF `scalar_to_vector` has been changed to match the actual intended semantics, per consultation with Andrew Brown. * translation from `scalar_to_vector` to the obvious aarch64 insns. * special-case zero in `lower_constant_f128` in order to avoid a potentially slow call to `Inst::load_fp_constant128`. * Once "Allow loads to merge into other operations during instruction selection in MachInst backends" (bytecodealliance#2340) lands, we can use that functionality to pattern match the two-CLIF pair and emit a single AArch64 instruction. There is no testcase in this commit, because that is a separate repo. The implementation has been tested, nevertheless.
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
         Inst::new(
             "scalar_to_vector",
             r#"
-    Scalar To Vector -- move a value out of a scalar register and into a vector register; the
-    scalar will be moved to the lowest-order bits of the vector register. Note that this
-    instruction is intended as a low-level legalization instruction and frontends should prefer
-    insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
-    types (e.g. integers) but not for others (e.g. floats).
-    "#,
+            Copies a scalar value to a vector value.  The scalar is copied into the
+            least significant lane of the vector, and all other lanes will be zero.
+            "#,
             &formats.unary,
         )
         .operands_in(vec![s])
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
     rd: Writable<Reg>,
     value: u128,
 ) {
-    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
-
-    for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
-        ctx.emit(inst);
+    if value == 0 {
+        // Fast-track a common case.  The general case, viz, calling `Inst::load_fp_constant128`,
+        // is potentially expensive.
+        ctx.emit(Inst::VecDupImm {
+            rd,
+            imm: ASIMDMovModImm::zero(),
+            invert: false,
+            size: VectorSize::Size8x16,
+        });
+    } else {
+        let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+        for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
+            ctx.emit(inst);
+        }
     }
 }
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2056,6 +2056,31 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::ScalarToVector => {
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rd = get_output_reg(ctx, outputs[0]);
+            let input_ty = ctx.input_ty(insn, 0);
+            let size = VectorSize::from_ty(ty.unwrap());
+            if (input_ty == I32 && ty.unwrap() == I32X4)
+                || (input_ty == I64 && ty.unwrap() == I64X2)
+            {
+                // Zero the whole destination first so that every lane other than lane 0
+                // is zero, then insert the scalar into lane 0.
+                lower_constant_f128(ctx, rd, 0);
+                ctx.emit(Inst::MovToVec {
+                    rd,
+                    rn,
+                    idx: 0,
+                    size,
+                });
+            } else {
+                return Err(CodegenError::Unsupported(format!(
+                    "ScalarToVector: unsupported types {:?} -> {:?}",
+                    input_ty, ty
+                )));
+            }
+        }
+
         Opcode::VanyTrue | Opcode::VallTrue => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2366,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Vsplit
         | Opcode::Vconcat
-        | Opcode::ScalarToVector
         | Opcode::Uload8x8Complex
         | Opcode::Sload8x8Complex
         | Opcode::Uload16x4Complex
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
@@ -1426,6 +1426,18 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
             state.push1(dfg.first_result(load))
         }
+        Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
+            translate_load(
+                memarg,
+                ir::Opcode::Load,
+                type_of(op).lane_type(),
+                builder,
+                state,
+                environ,
+            )?;
+            let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
+            state.push1(as_vector)
+        }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
             let extracted = builder.ins().extractlane(vector, lane.clone());
@@ -1790,10 +1802,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
             return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
         }
-
-        Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
-            return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
-        }
     };
     Ok(())
 }
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I32x4MaxU
         | Operator::F32x4ConvertI32x4S
         | Operator::F32x4ConvertI32x4U
-        | Operator::I32x4Bitmask => I32X4,
+        | Operator::I32x4Bitmask
+        | Operator::V128Load32Zero { .. } => I32X4,
 
         Operator::I64x2Splat
         | Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I64x2ShrU
         | Operator::I64x2Add
         | Operator::I64x2Sub
-        | Operator::I64x2Mul => I64X2,
+        | Operator::I64x2Mul
+        | Operator::V128Load64Zero { .. } => I64X2,
 
         Operator::F32x4Splat
         | Operator::F32x4ExtractLane { .. }