Commit c18ff29

cmd/compile: make sync/atomic AND/OR operations intrinsic on amd64
Update #61395

Change-Id: I59a950f48efc587dfdffce00e2f4f3ab99d8df00
Reviewed-on: https://go-review.googlesource.com/c/go/+/594738
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
Reviewed-by: Nicolas Hillegeer <[email protected]>
1 parent dbfa3ca commit c18ff29
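For context: this CL wires the compiler up to the And/Or operations added to sync/atomic under proposal #61395 (new in Go 1.23). A minimal sketch of the operations being made intrinsic, assuming the Go 1.23 API; variable names and bit patterns are illustrative:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var flags atomic.Uint32
	flags.Store(0b1010)

	// Or atomically sets bits and returns the old value.
	old := flags.Or(0b0100)
	fmt.Printf("old=%04b new=%04b\n", old, flags.Load())

	// And atomically clears bits; here the old value is discarded.
	flags.And(0b0011)
	fmt.Printf("new=%04b\n", flags.Load())
}

With this CL, amd64 compiles such calls to inline LOCK-prefixed instructions (or a CMPXCHG loop when the old value is needed) rather than out-of-line calls.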

File tree

8 files changed, +478 −14 lines changed


src/cmd/compile/internal/amd64/ssa.go

+56-1
@@ -1286,14 +1286,69 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p = s.Prog(x86.ASETEQ)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg0()
-	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock:
+	case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
+		// Atomic memory operations that don't need to return the old value.
 		s.Prog(x86.ALOCK)
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = v.Args[1].Reg()
 		p.To.Type = obj.TYPE_MEM
 		p.To.Reg = v.Args[0].Reg()
 		ssagen.AddAux(&p.To, v)
+	case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
+		// Atomic memory operations that need to return the old value.
+		// We need to do these with compare-and-exchange to get access to the old value.
+		// loop:
+		// MOVQ mask, tmp
+		// MOVQ (addr), AX
+		// ANDQ AX, tmp
+		// LOCK CMPXCHGQ tmp, (addr) : note that AX is implicit old value to compare against
+		// JNE loop
+		// : result in AX
+		mov := x86.AMOVQ
+		op := x86.AANDQ
+		cmpxchg := x86.ACMPXCHGQ
+		switch v.Op {
+		case ssa.OpAMD64LoweredAtomicOr64:
+			op = x86.AORQ
+		case ssa.OpAMD64LoweredAtomicAnd32:
+			mov = x86.AMOVL
+			op = x86.AANDL
+			cmpxchg = x86.ACMPXCHGL
+		case ssa.OpAMD64LoweredAtomicOr32:
+			mov = x86.AMOVL
+			op = x86.AORL
+			cmpxchg = x86.ACMPXCHGL
+		}
+		addr := v.Args[0].Reg()
+		mask := v.Args[1].Reg()
+		tmp := v.RegTmp()
+		p1 := s.Prog(mov)
+		p1.From.Type = obj.TYPE_REG
+		p1.From.Reg = mask
+		p1.To.Type = obj.TYPE_REG
+		p1.To.Reg = tmp
+		p2 := s.Prog(mov)
+		p2.From.Type = obj.TYPE_MEM
+		p2.From.Reg = addr
+		ssagen.AddAux(&p2.From, v)
+		p2.To.Type = obj.TYPE_REG
+		p2.To.Reg = x86.REG_AX
+		p3 := s.Prog(op)
+		p3.From.Type = obj.TYPE_REG
+		p3.From.Reg = x86.REG_AX
+		p3.To.Type = obj.TYPE_REG
+		p3.To.Reg = tmp
+		s.Prog(x86.ALOCK)
+		p5 := s.Prog(cmpxchg)
+		p5.From.Type = obj.TYPE_REG
+		p5.From.Reg = tmp
+		p5.To.Type = obj.TYPE_MEM
+		p5.To.Reg = addr
+		ssagen.AddAux(&p5.To, v)
+		p6 := s.Prog(x86.AJNE)
+		p6.To.Type = obj.TYPE_BRANCH
+		p6.To.SetTarget(p1)
 	case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
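In Go terms, the CMPXCHG loop emitted above has the same semantics as the following portable sketch (a hypothetical helper for illustration only; the intrinsic emits the instructions directly, with the result landing in AX):

package atomicsketch

import "sync/atomic"

// andUint64 mirrors the emitted loop: load the current value, compute
// old&mask into a temp, and try to publish it with compare-and-swap;
// retry if another writer raced in between, then return the old value.
func andUint64(addr *uint64, mask uint64) (old uint64) {
	for {
		old = atomic.LoadUint64(addr) // MOVQ (addr), AX
		if atomic.CompareAndSwapUint64(addr, old, old&mask) { // LOCK CMPXCHGQ tmp, (addr)
			return old // result in AX
		}
		// CAS failed (JNE loop): reload and try again.
	}
}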

src/cmd/compile/internal/ssa/_gen/AMD64.rules

+7-1
@@ -578,12 +578,15 @@
 (AtomicCompareAndSwap32 ptr old new_ mem) => (CMPXCHGLlock ptr old new_ mem)
 (AtomicCompareAndSwap64 ptr old new_ mem) => (CMPXCHGQlock ptr old new_ mem)
 
-// Atomic memory updates.
+// Atomic memory logical operations (old style).
 (AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem)
 (AtomicAnd32 ptr val mem) => (ANDLlock ptr val mem)
 (AtomicOr8 ptr val mem) => (ORBlock ptr val mem)
 (AtomicOr32 ptr val mem) => (ORLlock ptr val mem)
 
+// Atomic memory logical operations (new style).
+(Atomic(And64|And32|Or64|Or32)value ptr val mem) => (LoweredAtomic(And64|And32|Or64|Or32) ptr val mem)
+
 // Write barrier.
 (WB ...) => (LoweredWB ...)

@@ -1697,3 +1700,6 @@
 ((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVQconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem))
 ((SHL|SHR|SAR)XQload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Qconst [int8(c&63)] (MOVQload [off] {sym} ptr mem))
 ((SHL|SHR|SAR)XLload [off] {sym} ptr (MOVLconst [c]) mem) => ((SHL|SHR|SAR)Lconst [int8(c&31)] (MOVLload [off] {sym} ptr mem))
+
+// Convert atomic logical operations to easier ones if we don't use the result.
+(Select1 a:(LoweredAtomic(And64|And32|Or64|Or32) ptr val mem)) && a.Uses == 1 && clobber(a) => ((ANDQ|ANDL|ORQ|ORL)lock ptr val mem)
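The effect of the final Select1 rule is that the cheap single-instruction form is chosen purely by whether the caller consumes the old value. A hedged illustration of the expected codegen, inferred from the rules above (package and function names are made up):

package flagdemo

import "sync/atomic"

var v atomic.Uint64

// Old value discarded: only the memory result of the tuple is used
// (a.Uses == 1), so the rule rewrites the op to a single LOCK ORQ.
func set() { v.Or(1 << 3) }

// Old value consumed: the tuple has a second use (its value result),
// the rule does not fire, and the op stays a CMPXCHG loop.
func testAndSet() bool { return v.Or(1<<3)&(1<<3) != 0 }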

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

+13-1
@@ -152,6 +152,7 @@ func init() {
 		gpstoreconstidx = regInfo{inputs: []regMask{gpspsbg, gpsp, 0}}
 		gpstorexchg     = regInfo{inputs: []regMask{gp, gpspsbg, 0}, outputs: []regMask{gp}}
 		cmpxchg         = regInfo{inputs: []regMask{gp, ax, gp, 0}, outputs: []regMask{gp, 0}, clobbers: ax}
+		atomicLogic     = regInfo{inputs: []regMask{gp &^ ax, gp &^ ax, 0}, outputs: []regMask{ax, 0}}
 
 		fp01 = regInfo{inputs: nil, outputs: fponly}
 		fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}

@@ -1040,11 +1041,22 @@
 		{name: "CMPXCHGLlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
 		{name: "CMPXCHGQlock", argLength: 4, reg: cmpxchg, asm: "CMPXCHGQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"},
 
-		// Atomic memory updates.
+		// Atomic memory updates using logical operations.
+		// Old style that just returns the memory state.
 		{name: "ANDBlock", argLength: 3, reg: gpstore, asm: "ANDB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
 		{name: "ANDLlock", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
+		{name: "ANDQlock", argLength: 3, reg: gpstore, asm: "ANDQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
 		{name: "ORBlock", argLength: 3, reg: gpstore, asm: "ORB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
 		{name: "ORLlock", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+		{name: "ORQlock", argLength: 3, reg: gpstore, asm: "ORQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+
+		// Atomic memory updates using logical operations.
+		// *(arg0+auxint+aux) op= arg1. arg2=mem.
+		// New style that returns a tuple of <old contents of *(arg0+auxint+aux), memory>.
+		{name: "LoweredAtomicAnd64", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ANDQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true},
+		{name: "LoweredAtomicAnd32", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true},
+		{name: "LoweredAtomicOr64", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ORQ", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true},
+		{name: "LoweredAtomicOr32", argLength: 3, reg: atomicLogic, resultNotInArgs: true, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr", unsafePoint: true, needIntTemp: true},
 
 		// Prefetch instructions
 		// Do prefetch arg0 address. arg0=addr, arg1=memory. Instruction variant selects locality hint

src/cmd/compile/internal/ssa/opGen.go

+130
Some generated files are not rendered by default.

src/cmd/compile/internal/ssa/regalloc.go

+10-1
@@ -1612,8 +1612,14 @@ func (s *regAllocState) regalloc(f *Func) {
 			// allocate it after all the input registers, but before
 			// the input registers are freed via advanceUses below.
 			// (Not all instructions need that distinct part, but it is conservative.)
+			// We also ensure it is not any of the single-choice output registers.
 			if opcodeTable[v.Op].needIntTemp {
 				m := s.allocatable & s.f.Config.gpRegMask
+				for _, out := range regspec.outputs {
+					if countRegs(out.regs) == 1 {
+						m &^= out.regs
+					}
+				}
 				if m&^desired.avoid&^s.nospill != 0 {
 					m &^= desired.avoid
 				}

@@ -1651,9 +1657,12 @@
 				used |= regMask(1) << tmpReg
 			}
 			for _, out := range regspec.outputs {
+				if out.regs == 0 {
+					continue
+				}
 				mask := out.regs & s.allocatable &^ used
 				if mask == 0 {
-					continue
+					s.f.Fatalf("can't find any output register %s", v.LongString())
 				}
 				if opcodeTable[v.Op].resultInArg0 && out.idx == 0 {
 					if !opcodeTable[v.Op].commutative {
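Reading these two hunks against the new ops above: the LoweredAtomic ops pin their only register output to AX (via atomicLogic) while also setting needIntTemp, so the first hunk steers the temp register away from any single-choice output to avoid a collision; the second hunk skips the tuple's register-less memory output explicitly, and turns a genuinely unsatisfiable output mask into a fatal error instead of silently continuing.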
