Skip to content
This repository was archived by the owner on Feb 25, 2025. It is now read-only.

Commit 8d78da9

Browse files
Mike Klein and Skia Commit-Bot
authored and committed
more arm64 ops
- easy: ceil, floor, sqrt
- index is our first arm64 instruction to need a temporary, but other than that is pretty simple, just N - iota as usual.

With Op::index now supported, `viewer --slide GM_runtime_shader` frame time drops from ~1ms to ~0.24ms.

I accidentally swapped in a float-subtract for an int-subtract and everything worked fine. o_O

Change-Id: I44c51506a6a9014b398d6943bb0e3712e4e52445
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/338661
Commit-Queue: Mike Klein <[email protected]>
Reviewed-by: Mike Reed <[email protected]>
1 parent 420a9ba commit 8d78da9

File tree

3 files changed

+44
-11
lines changed

3 files changed

+44
-11
lines changed

src/core/SkVM.cpp

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2208,7 +2208,9 @@ namespace skvm {
22082208
// fdiv.4s on vector registers d, n, m; the arm64 JIT emits it for Op::div_f32 as (dst, x, y).
void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
22092209
// fmin.4s: presumably d = min(n, m) per 32-bit float lane, going by the mnemonic -- TODO confirm.
void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
22102210
// fmax.4s: presumably d = max(n, m) per 32-bit float lane, going by the mnemonic -- TODO confirm.
void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
2211-
void Assembler::fneg4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n, d); }
2211+
2212+
// fneg.4s: d = -n.
void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
2213+
// fsqrt.4s: d = sqrtf(n); the arm64 JIT emits it for Op::sqrt_f32.
void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }
22122214

22132215
// fcmeq.4s: presumably a per-lane float (n == m) compare, going by the mnemonic -- TODO confirm.
void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
22142216
// fcmgt.4s: presumably a per-lane float (n > m) compare, going by the mnemonic -- TODO confirm.
void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
@@ -2238,6 +2240,8 @@ namespace skvm {
22382240
// scvtf.4s: int -> float; the arm64 JIT emits it for Op::to_f32.
void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
22392241
// fcvtzs.4s: truncate float -> int; the arm64 JIT emits it for Op::trunc.
void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
22402242
// fcvtns.4s: round float -> int (nearest even); the arm64 JIT emits it for Op::round.
void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
2243+
// frintp.4s: round toward plus infinity, result stays a float (ceil); emitted for Op::ceil.
void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
2244+
// frintm.4s: round toward minus infinity, result stays a float (floor); emitted for Op::floor.
void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }
22412245

22422246
// xtn: narrow u32 lanes to u16; first step of the Op::store8 sequence in the arm64 JIT.
void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
22432247
// xtn: narrow u16 lanes to u8; second step of the Op::store8 sequence in the arm64 JIT.
void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
@@ -2346,6 +2350,10 @@ namespace skvm {
23462350
this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
23472351
}
23482352

2353+
// dup.4s: set each 32-bit lane of vector dst to the value held in X register src.
// The arm64 JIT uses it in Op::index to broadcast N before subtracting iota.
void Assembler::dup4s(V dst, X src) {
2354+
this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
2355+
}
2356+
23492357
// ld1r.4s: each 32-bit lane of dst = *src, i.e. src is used as an address, not as a value.
void Assembler::ld1r4s(V dst, X src) {
23502358
this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
23512359
}
@@ -3260,12 +3268,10 @@ namespace skvm {
32603268
return r;
32613269
};
32623270

3263-
#if defined(__x86_64__) || defined(_M_X64) // Nothing special... just unused on ARM.
32643271
// Release a register that was claimed as a temporary: r must currently be
// marked TMP in regs (asserted in debug builds), and is reset to NA.
auto free_tmp = [&](Reg r) {
32653272
SkASSERT(regs[r] == TMP);
32663273
regs[r] = NA;
32673274
};
3268-
#endif
32693275

32703276
// Which register holds dst,x,y,z for this instruction? NA if none does yet.
32713277
int rd = NA,
@@ -3710,11 +3716,13 @@ namespace skvm {
37103716
break;
37113717

37123718
#elif defined(__aarch64__)
3713-
default: // TODO
3714-
if (false) {
3715-
SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
3716-
}
3717-
return false;
3719+
case Op::store64:
3720+
case Op::store128:
3721+
case Op::load64:
3722+
case Op::load128:
3723+
case Op::to_half:
3724+
case Op::from_half:
3725+
return false; // TODO
37183726

37193727
case Op::assert_true: {
37203728
a->uminv4s(dst(), r(x)); // uminv acts like an all() across the vector.
@@ -3725,6 +3733,14 @@ namespace skvm {
37253733
a->label(&all_true);
37263734
} break;
37273735

3736+
case Op::index: {
3737+
A::V tmp = alloc_tmp();
3738+
a->ldrq (tmp, &iota);
3739+
a->dup4s(dst(), N);
3740+
a->sub4s(dst(), dst(), tmp);
3741+
free_tmp(tmp);
3742+
} break;
3743+
37283744
case Op::store8: a->xtns2h(dst(), r(x));
37293745
a->xtnh2b(dst(), dst());
37303746
if (scalar) { a->strb (dst(), arg[immy]); }
@@ -3801,6 +3817,8 @@ namespace skvm {
38013817
case Op::mul_f32: a->fmul4s(dst(), r(x), r(y)); break;
38023818
case Op::div_f32: a->fdiv4s(dst(), r(x), r(y)); break;
38033819

3820+
case Op::sqrt_f32: a->fsqrt4s(dst(), r(x)); break;
3821+
38043822
case Op::fma_f32: // fmla.4s is z += x*y
38053823
if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
38063824
else { a->orr16b(dst(), r(z), r(z));
@@ -3864,8 +3882,8 @@ namespace skvm {
38643882
case Op::to_f32: a->scvtf4s (dst(), r(x)); break;
38653883
case Op::trunc: a->fcvtzs4s(dst(), r(x)); break;
38663884
case Op::round: a->fcvtns4s(dst(), r(x)); break;
3867-
// TODO: fcvtns.4s rounds to nearest even.
3868-
// I think we actually want frintx -> fcvtzs to round to current mode.
3885+
case Op::ceil: a->frintp4s(dst(), r(x)); break;
3886+
case Op::floor: a->frintm4s(dst(), r(x)); break;
38693887
#endif
38703888
}
38713889

src/core/SkVM.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,9 +307,12 @@ namespace skvm {
307307
using DOpN = void(V d, V n);
308308
DOpN not16b, // d = ~n
309309
fneg4s, // d = -n
310+
fsqrt4s, // d = sqrtf(n)
310311
scvtf4s, // int -> float
311312
fcvtzs4s, // truncate float -> int
312313
fcvtns4s, // round float -> int (nearest even)
314+
frintp4s, // round float -> int as float, toward plus infinity (ceil)
315+
frintm4s, // round float -> int as float, toward minus infinity (floor)
313316
xtns2h, // u32 -> u16
314317
xtnh2b, // u16 -> u8
315318
uxtlb2h, // u8 -> u16 (TODO: this is a special case of ushll.8h)
@@ -364,6 +367,8 @@ namespace skvm {
364367
void movs(X dst, V src, int lane); // dst = 32-bit src[lane]
365368
void inss(V dst, X src, int lane); // dst[lane] = 32-bit src
366369

370+
void dup4s (V dst, X src); // Each 32-bit lane = src
371+
367372
void ld1r4s (V dst, X src); // Each 32-bit lane = *src
368373
void ld1r8h (V dst, X src); // Each 16-bit lane = *src
369374
void ld1r16b(V dst, X src); // Each 8-bit lane = *src

tests/SkVMTest.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1670,7 +1670,9 @@ DEF_TEST(SkVM_Assembler, r) {
16701670
a.fdiv4s(A::v4, A::v3, A::v1);
16711671
a.fmin4s(A::v4, A::v3, A::v1);
16721672
a.fmax4s(A::v4, A::v3, A::v1);
1673-
a.fneg4s(A::v4, A::v3);
1673+
1674+
a.fneg4s (A::v4, A::v3);
1675+
a.fsqrt4s(A::v4, A::v3);
16741676

16751677
a.fmla4s(A::v4, A::v3, A::v1);
16761678
a.fmls4s(A::v4, A::v3, A::v1);
@@ -1702,7 +1704,9 @@ DEF_TEST(SkVM_Assembler, r) {
17021704
0x64,0xfc,0x21,0x6e,
17031705
0x64,0xf4,0xa1,0x4e,
17041706
0x64,0xf4,0x21,0x4e,
1707+
17051708
0x64,0xf8,0xa0,0x6e,
1709+
0x64,0xf8,0xa1,0x6e,
17061710

17071711
0x64,0xcc,0x21,0x4e,
17081712
0x64,0xcc,0xa1,0x4e,
@@ -1768,10 +1772,14 @@ DEF_TEST(SkVM_Assembler, r) {
17681772
a.scvtf4s (A::v4, A::v3);
17691773
a.fcvtzs4s(A::v4, A::v3);
17701774
a.fcvtns4s(A::v4, A::v3);
1775+
a.frintp4s(A::v4, A::v3);
1776+
a.frintm4s(A::v4, A::v3);
17711777
},{
17721778
0x64,0xd8,0x21,0x4e,
17731779
0x64,0xb8,0xa1,0x4e,
17741780
0x64,0xa8,0x21,0x4e,
1781+
0x64,0x88,0xa1,0x4e,
1782+
0x64,0x98,0x21,0x4e,
17751783
});
17761784

17771785
test_asm(r, [&](A& a) {
@@ -1928,10 +1936,12 @@ DEF_TEST(SkVM_Assembler, r) {
19281936
});
19291937

19301938
test_asm(r, [&](A& a) {
1939+
a.dup4s (A::v0, A::x8);
19311940
a.ld1r4s (A::v0, A::x8); // echo 'ld1r.4s {v0}, [x8]' | llvm-mc --show-encoding
19321941
a.ld1r8h (A::v0, A::x8);
19331942
a.ld1r16b(A::v0, A::x8);
19341943
},{
1944+
0x00,0x0d,0x04,0x4e,
19351945
0x00,0xc9,0x40,0x4d,
19361946
0x00,0xc5,0x40,0x4d,
19371947
0x00,0xc1,0x40,0x4d,

0 commit comments

Comments
 (0)