Skip to content

Commit b211fe0

Browse files
randall77benshi001
authored andcommitted
cmd/compile: remove bit operations that modify memory directly
These operations (BT{S,R,C}{Q,L}modify) are quite a bit slower than other ways of doing the same thing. Without the BTxmodify operations, there are two fallback ways the compiler performs these operations: AND/OR/XOR operations directly on memory, or load-BTx-write sequences. The compiler kinda chooses one arbitrarily depending on rewrite rule application order. Currently, it uses load-BTx-write for the Const benchmarks and AND/OR/XOR directly to memory for the non-Const benchmarks. TBD, someone might investigate which of the two fallback strategies is really better. For now, they are both better than BTx ops. name old time/op new time/op delta BitSet-8 1.09µs ± 2% 0.64µs ± 5% -41.60% (p=0.000 n=9+10) BitClear-8 1.15µs ± 3% 0.68µs ± 6% -41.00% (p=0.000 n=10+10) BitToggle-8 1.18µs ± 4% 0.73µs ± 2% -38.36% (p=0.000 n=10+8) BitSetConst-8 37.0ns ± 7% 25.8ns ± 2% -30.24% (p=0.000 n=10+10) BitClearConst-8 30.7ns ± 2% 25.0ns ±12% -18.46% (p=0.000 n=10+10) BitToggleConst-8 36.9ns ± 1% 23.8ns ± 3% -35.46% (p=0.000 n=9+10) Fixes #45790 Update #45242 Change-Id: Ie33a72dc139f261af82db15d446cd0855afb4e59 Reviewed-on: https://go-review.googlesource.com/c/go/+/318149 Trust: Keith Randall <[email protected]> Run-TryBot: Keith Randall <[email protected]> TryBot-Result: Go Bot <[email protected]> Reviewed-by: Ben Shi <[email protected]>
1 parent f24eac4 commit b211fe0

File tree

7 files changed

+127
-1431
lines changed

7 files changed

+127
-1431
lines changed

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,6 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
756756
p.To.Type = obj.TYPE_REG
757757
p.To.Reg = v.Reg()
758758
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
759-
ssa.OpAMD64BTCQmodify, ssa.OpAMD64BTCLmodify, ssa.OpAMD64BTRQmodify, ssa.OpAMD64BTRLmodify, ssa.OpAMD64BTSQmodify, ssa.OpAMD64BTSLmodify,
760759
ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
761760
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
762761
p := s.Prog(v.Op.Asm())
@@ -804,8 +803,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
804803
}
805804
fallthrough
806805
case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
807-
ssa.OpAMD64BTCQconstmodify, ssa.OpAMD64BTCLconstmodify, ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTSLconstmodify,
808-
ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTRLconstmodify, ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify:
806+
ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify:
809807
sc := v.AuxValAndOff()
810808
off := sc.Off64()
811809
val := sc.Val64()

src/cmd/compile/internal/ssa/gen/AMD64.rules

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -624,14 +624,6 @@
624624
// Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b)
625625
(OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y)
626626
(XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y)
627-
(ORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) =>
628-
(BTSLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
629-
(ORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) =>
630-
(BTSQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
631-
(XORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) =>
632-
(BTCLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
633-
(XORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) =>
634-
(BTCQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
635627

636628
// Convert ORconst into BTS, if the code gets smaller, with boundary being
637629
// (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes).
@@ -654,10 +646,6 @@
654646
=> (BTRQconst [int8(log64(^c))] x)
655647
(ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
656648
=> (BTRLconst [int8(log32(^c))] x)
657-
(ANDLmodify [off] {sym} ptr (NOTL s:(SHLL (MOVLconst [1]) <t> x)) mem) =>
658-
(BTRLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
659-
(ANDQmodify [off] {sym} ptr (NOTQ s:(SHLQ (MOVQconst [1]) <t> x)) mem) =>
660-
(BTRQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
661649

662650
// Special-case bit patterns on first/last bit.
663651
// generic.rules changes ANDs of high-part/low-part masks into a couple of shifts,
@@ -1126,14 +1114,14 @@
11261114
((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
11271115
((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
11281116
((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
1129-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
1130-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
1131-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
1132-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
1133-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
1134-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1+off2] {sym} base val mem)
1135-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
1136-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1+off2] {sym} base val mem)
1117+
((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
1118+
((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
1119+
((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) =>
1120+
((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem)
1121+
((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
1122+
((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {sym} base val mem)
1123+
((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
1124+
((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
11371125

11381126
// Fold constants into stores.
11391127
(MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validVal(c) =>
@@ -1181,18 +1169,18 @@
11811169
((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
11821170
&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
11831171
((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
1184-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
1172+
((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
11851173
&& ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
1186-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
1187-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
1174+
((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
1175+
((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
11881176
&& ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) =>
1189-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
1190-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
1177+
((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem)
1178+
((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
11911179
&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
1192-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
1193-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
1180+
((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
1181+
((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem)
11941182
&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
1195-
((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
1183+
((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
11961184

11971185
// fold LEAQs together
11981186
(LEAQ [off1] {sym1} (LEAQ [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
@@ -2078,13 +2066,9 @@
20782066
(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
20792067
(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
20802068
((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
2081-
(MOVLstore {sym} [off] ptr y:((BTC|BTR|BTS)L l:(MOVLload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
2082-
((BTC|BTR|BTS)Lmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem)
20832069
(MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
20842070
(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
20852071
((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
2086-
(MOVQstore {sym} [off] ptr y:((BTC|BTR|BTS)Q l:(MOVQload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
2087-
((BTC|BTR|BTS)Qmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem)
20882072

20892073
// Merge ADDQconst and LEAQ into atomic loads.
20902074
(MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
@@ -2138,12 +2122,12 @@
21382122
(MOVWQZX (MOVBQZX x)) => (MOVBQZX x)
21392123
(MOVBQZX (MOVBQZX x)) => (MOVBQZX x)
21402124

2141-
(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem)
2125+
(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem)
21422126
&& isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) =>
2143-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)
2144-
(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem)
2127+
((ADD|AND|OR|XOR)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)
2128+
(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem)
21452129
&& isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) =>
2146-
((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)
2130+
((ADD|AND|OR|XOR)Lconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)
21472131

21482132
// float <-> int register moves, with no conversion.
21492133
// These come up when compiling math.{Float{32,64}bits,Float{32,64}frombits}.

src/cmd/compile/internal/ssa/gen/AMD64Ops.go

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -362,25 +362,6 @@ func init() {
362362
{name: "BTSLconst", argLength: 1, reg: gp11, asm: "BTSL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 32
363363
{name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64
364364

365-
// direct bit operation on memory operand
366-
//
367-
// Note that these operations do not mask the bit offset (arg1), and will write beyond their expected
368-
// bounds if that argument is larger than 64/32 (for BT*Q and BT*L, respectively). If the compiler
369-
// cannot prove that arg1 is in range, it must be explicitly masked (see e.g. the patterns that produce
370-
// BT*modify from (MOVstore (BT* (MOVLload ptr mem) x) mem)).
371-
{name: "BTCQmodify", argLength: 3, reg: gpstore, asm: "BTCQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
372-
{name: "BTCLmodify", argLength: 3, reg: gpstore, asm: "BTCL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
373-
{name: "BTSQmodify", argLength: 3, reg: gpstore, asm: "BTSQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
374-
{name: "BTSLmodify", argLength: 3, reg: gpstore, asm: "BTSL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
375-
{name: "BTRQmodify", argLength: 3, reg: gpstore, asm: "BTRQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit arg1 in 64-bit arg0+auxint+aux, arg2=mem
376-
{name: "BTRLmodify", argLength: 3, reg: gpstore, asm: "BTRL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit arg1 in 32-bit arg0+auxint+aux, arg2=mem
377-
{name: "BTCQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
378-
{name: "BTCLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
379-
{name: "BTSQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
380-
{name: "BTSLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
381-
{name: "BTRQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
382-
{name: "BTRLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
383-
384365
{name: "TESTQ", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTQ", typ: "Flags"}, // (arg0 & arg1) compare to 0
385366
{name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0
386367
{name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 0 additions & 186 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)