Skip to content

Commit 8669aba

Browse files
julian-seward1 authored and cfallin committed
CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions.
This patch implements, for aarch64, the following wasm SIMD extensions. v128.load32_zero and v128.load64_zero instructions WebAssembly/simd#237 The changes are straightforward: * no new CLIF instructions. They are translated into an existing CLIF scalar load followed by a CLIF `scalar_to_vector`. * the comment/specification for CLIF `scalar_to_vector` has been changed to match the actual intended semantics, per consultation with Andrew Brown. * translation from `scalar_to_vector` to aarch64 `fmov` instruction. This has been generalised slightly so as to allow both 32- and 64-bit transfers. * special-case zero in `lower_constant_f128` in order to avoid a potentially slow call to `Inst::load_fp_constant128`. * Once "Allow loads to merge into other operations during instruction selection in MachInst backends" (#2340) lands, we can use that functionality to pattern match the two-CLIF pair and emit a single AArch64 instruction. * A simple filetest has been added. There is no comprehensive testcase in this commit, because that is a separate repo. The implementation has been tested, nevertheless.
1 parent ee7b035 commit 8669aba

File tree

9 files changed

+144
-35
lines changed

9 files changed

+144
-35
lines changed

cranelift/codegen/meta/src/shared/instructions.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
37983798
Inst::new(
37993799
"scalar_to_vector",
38003800
r#"
3801-
Scalar To Vector -- move a value out of a scalar register and into a vector register; the
3802-
scalar will be moved to the lowest-order bits of the vector register. Note that this
3803-
instruction is intended as a low-level legalization instruction and frontends should prefer
3804-
insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
3805-
types (e.g. integers) but not for others (e.g. floats).
3806-
"#,
3801+
Copies a scalar value to a vector value. The scalar is copied into the
3802+
least significant lane of the vector, and all other lanes will be zero.
3803+
"#,
38073804
&formats.unary,
38083805
)
38093806
.operands_in(vec![s])

cranelift/codegen/src/isa/aarch64/inst/args.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,15 @@ impl ScalarSize {
579579
}
580580
}
581581

582+
/// Convert to an integer operand size.
583+
pub fn operand_size(&self) -> OperandSize {
584+
match self {
585+
ScalarSize::Size32 => OperandSize::Size32,
586+
ScalarSize::Size64 => OperandSize::Size64,
587+
_ => panic!("Unexpected operand_size request for: {:?}", self),
588+
}
589+
}
590+
582591
/// Convert from a type into the smallest size that fits.
583592
pub fn from_ty(ty: Type) -> ScalarSize {
584593
Self::from_bits(ty_bits(ty))

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,12 +1651,13 @@ impl MachInstEmit for Inst {
16511651
};
16521652
sink.put4(enc_fround(top22, rd, rn));
16531653
}
1654-
&Inst::MovToFpu { rd, rn } => {
1655-
sink.put4(
1656-
0b100_11110_01_1_00_111_000000_00000_00000
1657-
| (machreg_to_gpr(rn) << 5)
1658-
| machreg_to_vec(rd.to_reg()),
1659-
);
1654+
&Inst::MovToFpu { rd, rn, size } => {
1655+
let template = match size {
1656+
ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
1657+
ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
1658+
_ => unreachable!(),
1659+
};
1660+
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
16601661
}
16611662
&Inst::MovToVec { rd, rn, idx, size } => {
16621663
let (imm5, shift) = match size.lane_size() {

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1860,10 +1860,20 @@ fn test_aarch64_binemit() {
18601860
Inst::MovToFpu {
18611861
rd: writable_vreg(31),
18621862
rn: xreg(0),
1863+
size: ScalarSize::Size64,
18631864
},
18641865
"1F00679E",
18651866
"fmov d31, x0",
18661867
));
1868+
insns.push((
1869+
Inst::MovToFpu {
1870+
rd: writable_vreg(1),
1871+
rn: xreg(28),
1872+
size: ScalarSize::Size32,
1873+
},
1874+
"8103271E",
1875+
"fmov s1, w28",
1876+
));
18671877
insns.push((
18681878
Inst::MovToVec {
18691879
rd: writable_vreg(0),

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -877,10 +877,13 @@ pub enum Inst {
877877
rn: Reg,
878878
},
879879

880-
/// Move from a GPR to a scalar FP register.
880+
/// Move from a GPR to a vector register. The scalar value is parked in the lowest lane
881+
/// of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
882+
/// transactions are supported.
881883
MovToFpu {
882884
rd: Writable<Reg>,
883885
rn: Reg,
886+
size: ScalarSize,
884887
},
885888

886889
/// Move to a vector element from a GPR.
@@ -1319,13 +1322,15 @@ impl Inst {
13191322
size: VectorSize::Size8x8
13201323
}]
13211324
} else {
1322-
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
1325+
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
1326+
// bits.
13231327
let tmp = alloc_tmp(RegClass::I64, I32);
13241328
let mut insts = Inst::load_constant(tmp, value as u64);
13251329

13261330
insts.push(Inst::MovToFpu {
13271331
rd,
13281332
rn: tmp.to_reg(),
1333+
size: ScalarSize::Size64,
13291334
});
13301335

13311336
insts
@@ -1340,16 +1345,17 @@ impl Inst {
13401345
) -> SmallVec<[Inst; 4]> {
13411346
if let Ok(const_data) = u32::try_from(const_data) {
13421347
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
1343-
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
1344-
// Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
1345-
// might also be an option.
1348+
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
1349+
// bits. Also, treat it as half of a 128-bit vector and consider replicated
1350+
// patterns. Scalar MOVI might also be an option.
13461351
} else if const_data & (u32::MAX as u64) == 0 {
13471352
let tmp = alloc_tmp(RegClass::I64, I64);
13481353
let mut insts = Inst::load_constant(tmp, const_data);
13491354

13501355
insts.push(Inst::MovToFpu {
13511356
rd,
13521357
rn: tmp.to_reg(),
1358+
size: ScalarSize::Size64,
13531359
});
13541360

13551361
insts
@@ -1849,7 +1855,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
18491855
collector.add_def(rd);
18501856
collector.add_use(rn);
18511857
}
1852-
&Inst::MovToFpu { rd, rn } => {
1858+
&Inst::MovToFpu { rd, rn, .. } => {
18531859
collector.add_def(rd);
18541860
collector.add_use(rn);
18551861
}
@@ -2527,6 +2533,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
25272533
&mut Inst::MovToFpu {
25282534
ref mut rd,
25292535
ref mut rn,
2536+
..
25302537
} => {
25312538
map_def(mapper, rd);
25322539
map_use(mapper, rn);
@@ -3406,9 +3413,10 @@ impl Inst {
34063413
let rn = show_vreg_scalar(rn, mb_rru, size);
34073414
format!("{} {}, {}", inst, rd, rn)
34083415
}
3409-
&Inst::MovToFpu { rd, rn } => {
3410-
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
3411-
let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size64);
3416+
&Inst::MovToFpu { rd, rn, size } => {
3417+
let operand_size = size.operand_size();
3418+
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
3419+
let rn = show_ireg_sized(rn, mb_rru, operand_size);
34123420
format!("fmov {}, {}", rd, rn)
34133421
}
34143422
&Inst::MovToVec { rd, rn, idx, size } => {

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
837837
rd: Writable<Reg>,
838838
value: u128,
839839
) {
840-
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
841-
842-
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
843-
ctx.emit(inst);
840+
if value == 0 {
841+
// Fast-track a common case. The general case, viz, calling `Inst::load_fp_constant128`,
842+
// is potentially expensive.
843+
ctx.emit(Inst::VecDupImm {
844+
rd,
845+
imm: ASIMDMovModImm::zero(),
846+
invert: false,
847+
size: VectorSize::Size8x16,
848+
});
849+
} else {
850+
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
851+
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
852+
ctx.emit(inst);
853+
}
844854
}
845855
}
846856

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
179179
let vb = ctx.alloc_tmp(RegClass::V128, I128);
180180
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
181181
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
182-
ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
183-
ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
182+
ctx.emit(Inst::MovToFpu {
183+
rd: va,
184+
rn: ra,
185+
size: ScalarSize::Size64,
186+
});
187+
ctx.emit(Inst::MovToFpu {
188+
rd: vb,
189+
rn: rb,
190+
size: ScalarSize::Size64,
191+
});
184192
ctx.emit(Inst::FpuRRR {
185193
fpu_op,
186194
rd: va,
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
17031711
}
17041712
(false, true) => {
17051713
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
1706-
ctx.emit(Inst::MovToFpu { rd, rn });
1714+
ctx.emit(Inst::MovToFpu {
1715+
rd,
1716+
rn,
1717+
size: ScalarSize::Size64,
1718+
});
17071719
}
17081720
(true, false) => {
17091721
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
20562068
}
20572069
}
20582070

2071+
Opcode::ScalarToVector => {
2072+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
2073+
let rd = get_output_reg(ctx, outputs[0]);
2074+
let input_ty = ctx.input_ty(insn, 0);
2075+
if (input_ty == I32 && ty.unwrap() == I32X4)
2076+
|| (input_ty == I64 && ty.unwrap() == I64X2)
2077+
{
2078+
ctx.emit(Inst::MovToFpu {
2079+
rd,
2080+
rn,
2081+
size: ScalarSize::from_ty(input_ty),
2082+
});
2083+
} else {
2084+
return Err(CodegenError::Unsupported(format!(
2085+
"ScalarToVector: unsupported types {:?} -> {:?}",
2086+
input_ty, ty
2087+
)));
2088+
}
2089+
}
2090+
20592091
Opcode::VanyTrue | Opcode::VallTrue => {
20602092
let rd = get_output_reg(ctx, outputs[0]);
20612093
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
23412373

23422374
Opcode::Vsplit
23432375
| Opcode::Vconcat
2344-
| Opcode::ScalarToVector
23452376
| Opcode::Uload8x8Complex
23462377
| Opcode::Sload8x8Complex
23472378
| Opcode::Uload16x4Complex
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
test compile
2+
target aarch64
3+
4+
function %f1() -> i64x2 {
5+
block0:
6+
v0 = iconst.i64 281474976710657
7+
v1 = scalar_to_vector.i64x2 v0
8+
return v1
9+
}
10+
11+
; check: stp fp, lr, [sp, #-16]!
12+
; nextln: mov fp, sp
13+
; nextln: movz x0, #1
14+
; nextln: movk x0, #1, LSL #48
15+
; nextln: fmov d0, x0
16+
; nextln: mov sp, fp
17+
; nextln: ldp fp, lr, [sp], #16
18+
; nextln: ret
19+
20+
function %f2() -> i32x4 {
21+
block0:
22+
v0 = iconst.i32 42679
23+
v1 = scalar_to_vector.i32x4 v0
24+
return v1
25+
}
26+
27+
; check: stp fp, lr, [sp, #-16]!
28+
; nextln: mov fp, sp
29+
; nextln: movz x0, #42679
30+
; nextln: fmov s0, w0
31+
; nextln: mov sp, fp
32+
; nextln: ldp fp, lr, [sp], #16
33+
; nextln: ret

cranelift/wasm/src/code_translator.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,18 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
14261426
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
14271427
state.push1(dfg.first_result(load))
14281428
}
1429+
Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
1430+
translate_load(
1431+
memarg,
1432+
ir::Opcode::Load,
1433+
type_of(op).lane_type(),
1434+
builder,
1435+
state,
1436+
environ,
1437+
)?;
1438+
let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
1439+
state.push1(as_vector)
1440+
}
14291441
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
14301442
let vector = pop1_with_bitcast(state, type_of(op), builder);
14311443
let extracted = builder.ins().extractlane(vector, lane.clone());
@@ -1790,10 +1802,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
17901802
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
17911803
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
17921804
}
1793-
1794-
Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
1795-
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
1796-
}
17971805
};
17981806
Ok(())
17991807
}
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
25162524
| Operator::I32x4MaxU
25172525
| Operator::F32x4ConvertI32x4S
25182526
| Operator::F32x4ConvertI32x4U
2519-
| Operator::I32x4Bitmask => I32X4,
2527+
| Operator::I32x4Bitmask
2528+
| Operator::V128Load32Zero { .. } => I32X4,
25202529

25212530
Operator::I64x2Splat
25222531
| Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@ fn type_of(operator: &Operator) -> Type {
25282537
| Operator::I64x2ShrU
25292538
| Operator::I64x2Add
25302539
| Operator::I64x2Sub
2531-
| Operator::I64x2Mul => I64X2,
2540+
| Operator::I64x2Mul
2541+
| Operator::V128Load64Zero { .. } => I64X2,
25322542

25332543
Operator::F32x4Splat
25342544
| Operator::F32x4ExtractLane { .. }

0 commit comments

Comments
 (0)