Skip to content

Commit dce30a1

Browse files
prattmic and gopherbot
authored and committed
cmd/compile: intrinsify swissmap match calls with SIMD on amd64
Use similar SIMD operations to the ones used in Abseil. We are still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 
1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Auto-Submit: Michael Pratt <[email protected]> Reviewed-by: Keith Randall <[email protected]>
1 parent b8ba5b4 commit dce30a1

File tree

8 files changed

+1858
-1287
lines changed

8 files changed

+1858
-1287
lines changed

src/cmd/compile/internal/amd64/ssa.go

+32-2
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,39 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
256256
ssa.OpAMD64POR, ssa.OpAMD64PXOR,
257257
ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
258258
ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
259-
ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
259+
ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
260+
ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
261+
ssa.OpAMD64PUNPCKLBW:
260262
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
261263

264+
case ssa.OpAMD64PSHUFLW:
265+
p := s.Prog(v.Op.Asm())
266+
imm := v.AuxInt
267+
if imm < 0 || imm > 255 {
268+
v.Fatalf("Invalid source selection immediate")
269+
}
270+
p.From.Offset = imm
271+
p.From.Type = obj.TYPE_CONST
272+
p.AddRestSourceReg(v.Args[0].Reg())
273+
p.To.Type = obj.TYPE_REG
274+
p.To.Reg = v.Reg()
275+
276+
case ssa.OpAMD64PSHUFBbroadcast:
277+
// PSHUFB with a control mask of zero copies byte 0 to all
278+
// bytes in the register.
279+
//
280+
// X15 is always zero with ABIInternal.
281+
if s.ABI != obj.ABIInternal {
282+
// zero X15 manually
283+
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
284+
}
285+
286+
p := s.Prog(v.Op.Asm())
287+
p.From.Type = obj.TYPE_REG
288+
p.To.Type = obj.TYPE_REG
289+
p.To.Reg = v.Reg()
290+
p.From.Reg = x86.REG_X15
291+
262292
case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
263293
p := s.Prog(v.Op.Asm())
264294
lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
@@ -915,7 +945,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
915945
ssagen.AddAux2(&p.To, v, sc.Off64())
916946
case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
917947
ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
918-
ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
948+
ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
919949
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
920950
case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
921951
r := v.Reg()

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

+54
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,60 @@ func init() {
11341134
{name: "SHRXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
11351135
{name: "SHRXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
11361136
{name: "SHRXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
1137+
1138+
// Unpack bytes, low 64-bits.
1139+
//
1140+
// Input/output registers treated as [8]uint8.
1141+
//
1142+
// output = {in1[0], in2[0], in1[1], in2[1], in1[2], in2[2], in1[3], in2[3]}
1143+
{name: "PUNPCKLBW", argLength: 2, reg: fp21, resultInArg0: true, asm: "PUNPCKLBW"},
1144+
1145+
// Shuffle 16-bit words, low 64-bits.
1146+
//
1147+
// Input/output registers treated as [4]uint16.
1148+
// aux=source word index for each destination word, 2 bits per index.
1149+
//
1150+
// output[i] = input[(aux>>(2*i))&3].
1151+
{name: "PSHUFLW", argLength: 1, reg: fp11, aux: "Int8", asm: "PSHUFLW"},
1152+
1153+
// Broadcast input byte.
1154+
//
1155+
// Input treated as uint8, output treated as [16]uint8.
1156+
//
1157+
// output[i] = input.
1158+
{name: "PSHUFBbroadcast", argLength: 1, reg: fp11, resultInArg0: true, asm: "PSHUFB"}, // PSHUFB with mask zero, (GOAMD64=v1)
1159+
{name: "VPBROADCASTB", argLength: 1, reg: gpfp, asm: "VPBROADCASTB"}, // Broadcast input byte from gp (GOAMD64=v3)
1160+
1161+
// Byte negate/zero/preserve (GOAMD64=v2).
1162+
//
1163+
// Input/output registers treated as [16]uint8.
1164+
//
1165+
// if in2[i] > 0 {
1166+
// output[i] = in1[i]
1167+
// } else if in2[i] == 0 {
1168+
// output[i] = 0
1169+
// } else {
1170+
// output[i] = -1 * in1[i]
1171+
// }
1172+
{name: "PSIGNB", argLength: 2, reg: fp21, resultInArg0: true, asm: "PSIGNB"},
1173+
1174+
// Byte compare.
1175+
//
1176+
// Input/output registers treated as [16]uint8.
1177+
//
1178+
// if in1[i] == in2[i] {
1179+
// output[i] = 0xff
1180+
// } else {
1181+
// output[i] = 0
1182+
// }
1183+
{name: "PCMPEQB", argLength: 2, reg: fp21, resultInArg0: true, asm: "PCMPEQB"},
1184+
1185+
// Byte sign mask. Output is a bitmap of sign bits from each input byte.
1186+
//
1187+
// Input treated as [16]uint8. Output is [16]bit (uint16 bitmap).
1188+
//
1189+
// output[i] = (input[i] >> 7) & 1
1190+
{name: "PMOVMSKB", argLength: 1, reg: fpgp, asm: "PMOVMSKB"},
11371191
}
11381192

11391193
var AMD64blocks = []blockData{

src/cmd/compile/internal/ssa/opGen.go

+106
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)