Skip to content

Commit b3885db

Browse files
mundaym authored and bradfitz committed
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.

Also, add a micro benchmark so we can more easily measure the
performance of these two functions:

name            old time/op  new time/op  delta
And8-8          5.33ns ± 7%  2.55ns ± 8%  -52.12%  (p=0.000 n=20+20)
And8Parallel-8  7.39ns ± 5%  3.74ns ± 4%  -49.34%  (p=0.000 n=20+20)
Or8-8           4.84ns ±15%  2.64ns ±11%  -45.50%  (p=0.000 n=20+20)
Or8Parallel-8   7.27ns ± 3%  3.84ns ± 4%  -47.10%  (p=0.000 n=19+20)

By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.

Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent 75c839a commit b3885db

File tree

9 files changed

+305
-29
lines changed

9 files changed

+305
-29
lines changed

src/cmd/compile/internal/gc/ssa.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3490,13 +3490,13 @@ func init() {
34903490
s.vars[&memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
34913491
return nil
34923492
},
3493-
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64)
3493+
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
34943494
addF("runtime/internal/atomic", "Or8",
34953495
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
34963496
s.vars[&memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
34973497
return nil
34983498
},
3499-
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64)
3499+
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
35003500

35013501
alias("runtime/internal/atomic", "Loadint64", "runtime/internal/atomic", "Load64", all...)
35023502
alias("runtime/internal/atomic", "Xaddint64", "runtime/internal/atomic", "Xadd64", all...)

src/cmd/compile/internal/s390x/ssa.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
173173
if r != r1 {
174174
p.Reg = r1
175175
}
176+
case ssa.OpS390XRXSBG:
177+
r1 := v.Reg()
178+
if r1 != v.Args[0].Reg() {
179+
v.Fatalf("input[0] and output not in same register %s", v.LongString())
180+
}
181+
r2 := v.Args[1].Reg()
182+
i := v.Aux.(s390x.RotateParams)
183+
p := s.Prog(v.Op.Asm())
184+
p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(i.Start)}
185+
p.RestArgs = []obj.Addr{
186+
{Type: obj.TYPE_CONST, Offset: int64(i.End)},
187+
{Type: obj.TYPE_CONST, Offset: int64(i.Amount)},
188+
{Type: obj.TYPE_REG, Reg: r2},
189+
}
190+
p.To = obj.Addr{Type: obj.TYPE_REG, Reg: r1}
176191
case ssa.OpS390XADD, ssa.OpS390XADDW,
177192
ssa.OpS390XSUB, ssa.OpS390XSUBW,
178193
ssa.OpS390XAND, ssa.OpS390XANDW,
@@ -736,6 +751,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
736751
p.To.Type = obj.TYPE_MEM
737752
p.To.Reg = v.Args[0].Reg()
738753
gc.AddAux(&p.To, v)
754+
case ssa.OpS390XLANfloor, ssa.OpS390XLAOfloor:
755+
r := v.Args[0].Reg() // clobbered, assumed R1 in comments
756+
757+
// Round ptr down to nearest multiple of 4.
758+
// ANDW $~3, R1
759+
ptr := s.Prog(s390x.AANDW)
760+
ptr.From.Type = obj.TYPE_CONST
761+
ptr.From.Offset = 0xfffffffc
762+
ptr.To.Type = obj.TYPE_REG
763+
ptr.To.Reg = r
764+
765+
// Redirect output of LA(N|O) into R1 since it is clobbered anyway.
766+
// LA(N|O) Rx, R1, 0(R1)
767+
op := s.Prog(v.Op.Asm())
768+
op.From.Type = obj.TYPE_REG
769+
op.From.Reg = v.Args[1].Reg()
770+
op.Reg = r
771+
op.To.Type = obj.TYPE_MEM
772+
op.To.Reg = r
739773
case ssa.OpS390XLAA, ssa.OpS390XLAAG:
740774
p := s.Prog(v.Op.Asm())
741775
p.Reg = v.Reg0()

src/cmd/compile/internal/ssa/gen/S390X.rules

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,36 @@
167167
(AtomicCompareAndSwap32 ptr old new_ mem) -> (LoweredAtomicCas32 ptr old new_ mem)
168168
(AtomicCompareAndSwap64 ptr old new_ mem) -> (LoweredAtomicCas64 ptr old new_ mem)
169169

170+
// Atomic and: *(*uint8)(ptr) &= val
171+
//
172+
// Round pointer down to nearest word boundary and pad value with ones before
173+
// applying atomic AND operation to target word.
174+
//
175+
// *(*uint32)(ptr &^ 3) &= rotateleft(uint32(val) | 0xffffff00, ((3 << 3) ^ ((ptr & 3) << 3)))
176+
//
177+
(AtomicAnd8 ptr val mem)
178+
-> (LANfloor
179+
ptr
180+
(RLL <typ.UInt32>
181+
(ORWconst <typ.UInt32> val [-1<<8])
182+
(RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
183+
mem)
184+
185+
// Atomic or: *(*uint8)(ptr) |= val
186+
//
187+
// Round pointer down to nearest word boundary and pad value with zeros before
188+
// applying atomic OR operation to target word.
189+
//
190+
// *(*uint32)(ptr &^ 3) |= uint32(val) << ((3 << 3) ^ ((ptr & 3) << 3))
191+
//
192+
(AtomicOr8 ptr val mem)
193+
-> (LAOfloor
194+
ptr
195+
(SLW <typ.UInt32>
196+
(MOVBZreg <typ.UInt32> val)
197+
(RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
198+
mem)
199+
170200
// Lowering extension
171201
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
172202
(SignExt8to(16|32|64) x) -> (MOVBreg x)

src/cmd/compile/internal/ssa/gen/S390XOps.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ func init() {
170170
gpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, gpsp, 0}}
171171
gpstorebr = regInfo{inputs: []regMask{ptrsp, gpsp, 0}}
172172
gpstorelaa = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}, outputs: gponly}
173+
gpstorelab = regInfo{inputs: []regMask{r1, gpsp, 0}, clobbers: r1}
173174

174175
gpmvc = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}}
175176

@@ -347,6 +348,27 @@ func init() {
347348
{name: "RLLGconst", argLength: 1, reg: gp11, asm: "RLLG", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-63
348349
{name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-31
349350

351+
// Rotate then (and|or|xor|insert) selected bits instructions.
352+
//
353+
// Aux is an s390x.RotateParams struct containing Start, End and rotation
354+
// Amount fields.
355+
//
356+
// arg1 is rotated left by the rotation amount then the bits from the start
357+
// bit to the end bit (inclusive) are combined with arg0 using the logical
358+
// operation specified. Bit indices are specified from left to right - the
359+
// MSB is 0 and the LSB is 63.
360+
//
361+
// Examples:
362+
// | aux |
363+
// | instruction | start | end | amount | arg0 | arg1 | result |
364+
// +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
365+
// | RXSBG (XOR) | 0 | 1 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0x3fff_ffff_ffff_ffff |
366+
// | RXSBG (XOR) | 62 | 63 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_fffc |
367+
// | RXSBG (XOR) | 0 | 47 | 16 | 0xffff_ffff_ffff_ffff | 0x0000_0000_0000_ffff | 0xffff_ffff_0000_ffff |
368+
// +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
369+
//
370+
{name: "RXSBG", argLength: 2, reg: gp21, asm: "RXSBG", resultInArg0: true, aux: "ArchSpecific", clobberFlags: true}, // rotate then xor selected bits
371+
350372
// unary ops
351373
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG", clobberFlags: true}, // -arg0
352374
{name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW", clobberFlags: true}, // -arg0
@@ -509,6 +531,12 @@ func init() {
509531
{name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
510532
{name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
511533

534+
// Atomic bitwise operations.
535+
// Note: 'floor' operations round the pointer down to the nearest word boundary
536+
// which reflects how they are used in the runtime.
537+
{name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem.
538+
{name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem.
539+
512540
// Compare and swap.
513541
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
514542
// if *(arg0+auxint+aux) == arg1 {

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 48 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/rewriteS390X.go

Lines changed: 59 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/internal/obj/s390x/rotate.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Copyright 2019 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package s390x
6+
// RotateParams represents the immediates required for a "rotate
// then ... selected bits" instruction.
//
// The Start and End values are the indexes that represent
// the masked region. They are inclusive and are in big-
// endian order (bit 0 is the MSB, bit 63 is the LSB). They
// may wrap around.
//
// Some examples:
//
//	Masked region             | Start | End
//	--------------------------+-------+----
//	0x00_00_00_00_00_00_00_0f | 60    | 63
//	0xf0_00_00_00_00_00_00_00 | 0     | 3
//	0xf0_00_00_00_00_00_00_0f | 60    | 3
//
// The Amount value represents the amount to rotate the
// input left by. Note that this rotation is performed
// before the masked region is used.
type RotateParams struct {
	Start  uint8 // big-endian start bit index [0..63]
	End    uint8 // big-endian end bit index [0..63]
	Amount uint8 // amount to rotate left
}

// NewRotateParams returns a RotateParams holding the given start and
// end bit indexes and left-rotation amount.
//
// Each argument must be in the range [0, 63]. NewRotateParams panics
// if any argument is outside that range; the arguments are checked in
// the order start, end, amount, so the panic names the first offender.
func NewRotateParams(start, end, amount int64) RotateParams {
	// x&^63 clears the low six bits, so the result is non-zero exactly
	// when x is negative or greater than 63.
	if start&^63 != 0 {
		panic("start out of bounds")
	}
	if end&^63 != 0 {
		panic("end out of bounds")
	}
	if amount&^63 != 0 {
		panic("amount out of bounds")
	}
	return RotateParams{
		Start:  uint8(start),
		End:    uint8(end),
		Amount: uint8(amount),
	}
}

0 commit comments

Comments
 (0)