; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s

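; Zero-extended i8 inputs multiplied and partially reduced into an nxv4i32
; accumulator (a 4:1 element ratio) should lower to a single UDOT.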
define <vscale x 4 x i32> @dotp(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: dotp:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

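; The same pattern with i16 inputs and an nxv2i64 accumulator should lower to
; the wide UDOT form (z.d, z.h, z.h).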
define <vscale x 2 x i64> @dotp_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: dotp_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

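; Sign-extended inputs should select SDOT instead of UDOT.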
define <vscale x 4 x i32> @dotp_sext(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: dotp_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

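; Sign-extended i16 inputs with an nxv2i64 accumulator should select the wide
; SDOT form.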
define <vscale x 2 x i64> @dotp_wide_sext(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: dotp_wide_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

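; Negative test: nxv8i8 inputs only give a 2:1 element ratio against the
; nxv4i32 accumulator, so no dot product can be formed and the extends,
; multiply and accumulate are lowered as unpacks and MLAs instead.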
define <vscale x 4 x i32> @not_dotp(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_dotp:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.h, z1.h, #0xff
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
  %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

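; Negative test: likewise, nxv4i16 inputs against an nxv2i64 accumulator are
; only a 2:1 ratio, so the lowering falls back to unpacks and MLAs.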
define <vscale x 2 x i64> @not_dotp_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-LABEL: not_dotp_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.s, z1.s, #0xffff
; CHECK-NEXT:    and z2.s, z2.s, #0xffff
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z3.d, z1.s
; CHECK-NEXT:    uunpklo z4.d, z2.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
  %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
  %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}