Skip to content

Commit abcc1ec

Browse files
Mike Klein authored and Skia Commit-Bot committed
plumb register aliasing hints through on arm64
There's no reason not to do this, though there are so many registers on arm64 that I doubt we'll see any speed difference here at all.

I let dst() take a second hint, which makes most of these super easy; double hints don't really come up on x86 because we've got all that any() register-or-memory-address complexity to deal with instead there.

The most subtle bit is that it's safe to alias the index and destination registers of the gather ops... we pull an index out of a lane, load the value, and shove it back into that same lane, all totally safe.

Change-Id: I0f28ead95922e99e712ccb2cf824bf2610f556a6
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/340721
Commit-Queue: Herb Derby <[email protected]>
Auto-Submit: Mike Klein <[email protected]>
Reviewed-by: Herb Derby <[email protected]>
1 parent 95fb578 commit abcc1ec

File tree

1 file changed

+35
-36
lines changed

1 file changed

+35
-36
lines changed

src/core/SkVM.cpp

Lines changed: 35 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3356,10 +3356,9 @@ namespace skvm {
33563356

33573357
// Generally r(id),
33583358
// but with a hint, try to alias dst() to r(v) if dies_here(v).
3359-
auto dst = [&](Val hint = NA) -> Reg {
3360-
if (hint != NA) {
3361-
(void)try_alias(hint);
3362-
}
3359+
auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3360+
if (hint1 != NA && try_alias(hint1)) { return r(id); }
3361+
if (hint2 != NA && try_alias(hint2)) { return r(id); }
33633362
return r(id);
33643363
};
33653364

@@ -3746,13 +3745,13 @@ namespace skvm {
37463745
free_tmp(tmp);
37473746
} break;
37483747

3749-
case Op::store8: a->xtns2h(dst(), r(x));
3748+
case Op::store8: a->xtns2h(dst(x), r(x));
37503749
a->xtnh2b(dst(), dst());
37513750
if (scalar) { a->strb (dst(), arg[immy]); }
37523751
else { a->strs (dst(), arg[immy]); }
37533752
break;
37543753

3755-
case Op::store16: a->xtns2h(dst(), r(x));
3754+
case Op::store16: a->xtns2h(dst(x), r(x));
37563755
if (scalar) { a->strh (dst(), arg[immy]); }
37573756
else { a->strd (dst(), arg[immy]); }
37583757
break;
@@ -3840,7 +3839,7 @@ namespace skvm {
38403839
a->movs(GP1, r(x), i); // Extract index lane i into GP1.
38413840
a->add (GP1, GP0, GP1); // Add the gather base pointer.
38423841
a->ldrb(GP1, GP1); // Load that byte.
3843-
a->inss(dst(), GP1, i); // Insert it into dst() lane i.
3842+
a->inss(dst(x), GP1, i); // Insert it into dst() lane i.
38443843
}
38453844
} break;
38463845

@@ -3852,7 +3851,7 @@ namespace skvm {
38523851
a->movs(GP1, r(x), i);
38533852
a->add (GP1, GP0, GP1, A::LSL, 1); // Scale index 2x into a byte offset.
38543853
a->ldrh(GP1, GP1); // 2-byte load.
3855-
a->inss(dst(), GP1, i);
3854+
a->inss(dst(x), GP1, i);
38563855
}
38573856
} break;
38583857

@@ -3864,16 +3863,16 @@ namespace skvm {
38643863
a->movs(GP1, r(x), i);
38653864
a->add (GP1, GP0, GP1, A::LSL, 2); // Scale index 4x into a byte offset.
38663865
a->ldrs(GP1, GP1); // 4-byte load.
3867-
a->inss(dst(), GP1, i);
3866+
a->inss(dst(x), GP1, i);
38683867
}
38693868
} break;
38703869

3871-
case Op::add_f32: a->fadd4s(dst(), r(x), r(y)); break;
3872-
case Op::sub_f32: a->fsub4s(dst(), r(x), r(y)); break;
3873-
case Op::mul_f32: a->fmul4s(dst(), r(x), r(y)); break;
3874-
case Op::div_f32: a->fdiv4s(dst(), r(x), r(y)); break;
3870+
case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
3871+
case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
3872+
case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
3873+
case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;
38753874

3876-
case Op::sqrt_f32: a->fsqrt4s(dst(), r(x)); break;
3875+
case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;
38773876

38783877
case Op::fma_f32: // fmla.4s is z += x*y
38793878
if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
@@ -3894,21 +3893,21 @@ namespace skvm {
38943893
a->fneg4s(dst(), dst());
38953894
break;
38963895

3897-
case Op:: gt_f32: a->fcmgt4s (dst(), r(x), r(y)); break;
3898-
case Op::gte_f32: a->fcmge4s (dst(), r(x), r(y)); break;
3899-
case Op:: eq_f32: a->fcmeq4s (dst(), r(x), r(y)); break;
3900-
case Op::neq_f32: a->fcmeq4s (dst(), r(x), r(y));
3901-
a->not16b (dst(), dst()); break;
3896+
case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
3897+
case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
3898+
case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
3899+
case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
3900+
a->not16b (dst(), dst()); break;
39023901

39033902

3904-
case Op::add_i32: a->add4s(dst(), r(x), r(y)); break;
3905-
case Op::sub_i32: a->sub4s(dst(), r(x), r(y)); break;
3906-
case Op::mul_i32: a->mul4s(dst(), r(x), r(y)); break;
3903+
case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
3904+
case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
3905+
case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;
39073906

3908-
case Op::bit_and : a->and16b(dst(), r(x), r(y)); break;
3909-
case Op::bit_or : a->orr16b(dst(), r(x), r(y)); break;
3910-
case Op::bit_xor : a->eor16b(dst(), r(x), r(y)); break;
3911-
case Op::bit_clear: a->bic16b(dst(), r(x), r(y)); break;
3907+
case Op::bit_and : a->and16b(dst(x,y), r(x), r(y)); break;
3908+
case Op::bit_or : a->orr16b(dst(x,y), r(x), r(y)); break;
3909+
case Op::bit_xor : a->eor16b(dst(x,y), r(x), r(y)); break;
3910+
case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;
39123911

39133912
case Op::select: // bsl16b is x = x ? y : z
39143913
if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
@@ -3928,18 +3927,18 @@ namespace skvm {
39283927
a->bsl16b (dst(), r(y), r(x));
39293928
break;
39303929

3931-
case Op::shl_i32: a-> shl4s(dst(), r(x), immy); break;
3932-
case Op::shr_i32: a->ushr4s(dst(), r(x), immy); break;
3933-
case Op::sra_i32: a->sshr4s(dst(), r(x), immy); break;
3930+
case Op::shl_i32: a-> shl4s(dst(x), r(x), immy); break;
3931+
case Op::shr_i32: a->ushr4s(dst(x), r(x), immy); break;
3932+
case Op::sra_i32: a->sshr4s(dst(x), r(x), immy); break;
39343933

3935-
case Op::eq_i32: a->cmeq4s(dst(), r(x), r(y)); break;
3936-
case Op::gt_i32: a->cmgt4s(dst(), r(x), r(y)); break;
3934+
case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
3935+
case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;
39373936

3938-
case Op::to_f32: a->scvtf4s (dst(), r(x)); break;
3939-
case Op::trunc: a->fcvtzs4s(dst(), r(x)); break;
3940-
case Op::round: a->fcvtns4s(dst(), r(x)); break;
3941-
case Op::ceil: a->frintp4s(dst(), r(x)); break;
3942-
case Op::floor: a->frintm4s(dst(), r(x)); break;
3937+
case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
3938+
case Op::trunc: a->fcvtzs4s(dst(x), r(x)); break;
3939+
case Op::round: a->fcvtns4s(dst(x), r(x)); break;
3940+
case Op::ceil: a->frintp4s(dst(x), r(x)); break;
3941+
case Op::floor: a->frintm4s(dst(x), r(x)); break;
39433942

39443943
case Op::to_fp16:
39453944
a->fcvtn (dst(x), r(x)); // 4x f32 -> 4x f16 in bottom four lanes

0 commit comments

Comments
 (0)