Skip to content

Commit 2c1b513

Browse files
committed
cmd/compile: add math/bits.{Add,Sub}64 intrinsics on s390x
This CL adds intrinsics for the 64-bit addition and subtraction functions in math/bits. These intrinsics use the condition code to propagate the carry or borrow bit.

To make the carry chains more efficient I've removed the 'clobberFlags' property from most of the load and store operations. Originally these ops did clobber flags when using offsets that didn't fit in a signed 20-bit integer; however, that is no longer true.

As with other platforms, the intrinsics are faster when executed in a chain rather than a loop because currently we need to spill and restore the carry bit between each loop iteration. We may be able to reduce the need to do this on s390x (e.g. by using compare-and-branch instructions that do not clobber flags) in the future.

name           old time/op  new time/op  delta
Add64          1.21ns ± 2%  2.03ns ± 2%  +67.18%  (p=0.000 n=7+10)
Add64multiple  2.98ns ± 3%  1.03ns ± 0%  -65.39%  (p=0.000 n=10+9)
Sub64          1.23ns ± 4%  2.03ns ± 1%  +64.85%  (p=0.000 n=10+10)
Sub64multiple  3.73ns ± 4%  1.04ns ± 1%  -72.28%  (p=0.000 n=10+8)

Change-Id: I913bbd5e19e6b95bef52f5bc4f14d6fe40119083
Reviewed-on: https://go-review.googlesource.com/c/go/+/174303
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
1 parent 004fb5c commit 2c1b513

File tree

9 files changed

+880
-195
lines changed

9 files changed

+880
-195
lines changed

src/cmd/asm/internal/asm/testdata/s390x.s

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
6666
ADD $32768, R1, R2 // b9040021c22800008000
6767
ADDC R1, R2 // b9ea1022
6868
ADDC $1, R1, R2 // ec21000100db
69+
ADDC $-1, R1, R2 // ec21ffff00db
6970
ADDC R1, R2, R3 // b9ea1032
7071
ADDW R1, R2 // 1a21
7172
ADDW R1, R2, R3 // b9f81032

src/cmd/compile/internal/gc/ssa.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3575,14 +3575,14 @@ func init() {
35753575
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
35763576
return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])
35773577
},
3578-
sys.AMD64, sys.ARM64, sys.PPC64)
3579-
alias("math/bits", "Add", "math/bits", "Add64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64)
3578+
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X)
3579+
alias("math/bits", "Add", "math/bits", "Add64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X)
35803580
addF("math/bits", "Sub64",
35813581
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
35823582
return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])
35833583
},
3584-
sys.AMD64, sys.ARM64)
3585-
alias("math/bits", "Sub", "math/bits", "Sub64", sys.ArchAMD64, sys.ArchARM64)
3584+
sys.AMD64, sys.ARM64, sys.S390X)
3585+
alias("math/bits", "Sub", "math/bits", "Sub64", sys.ArchAMD64, sys.ArchARM64, sys.ArchS390X)
35863586
addF("math/bits", "Div64",
35873587
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
35883588
// check for divide-by-zero/overflow and panic with appropriate message

src/cmd/compile/internal/s390x/ssa.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,37 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
184184
if r != r1 {
185185
p.Reg = r1
186186
}
187+
case ssa.OpS390XADDC:
188+
r1 := v.Reg0()
189+
r2 := v.Args[0].Reg()
190+
r3 := v.Args[1].Reg()
191+
if r1 == r2 {
192+
r2, r3 = r3, r2
193+
}
194+
p := opregreg(s, v.Op.Asm(), r1, r2)
195+
if r3 != r1 {
196+
p.Reg = r3
197+
}
198+
case ssa.OpS390XSUBC:
199+
r1 := v.Reg0()
200+
r2 := v.Args[0].Reg()
201+
r3 := v.Args[1].Reg()
202+
p := opregreg(s, v.Op.Asm(), r1, r3)
203+
if r1 != r2 {
204+
p.Reg = r2
205+
}
206+
case ssa.OpS390XADDE, ssa.OpS390XSUBE:
207+
r1 := v.Reg0()
208+
if r1 != v.Args[0].Reg() {
209+
v.Fatalf("input[0] and output not in same register %s", v.LongString())
210+
}
211+
r2 := v.Args[1].Reg()
212+
opregreg(s, v.Op.Asm(), r1, r2)
213+
case ssa.OpS390XADDCconst:
214+
r1 := v.Reg0()
215+
r3 := v.Args[0].Reg()
216+
i2 := int64(int16(v.AuxInt))
217+
opregregimm(s, v.Op.Asm(), r1, r3, i2)
187218
// 2-address opcode arithmetic
188219
case ssa.OpS390XMULLD, ssa.OpS390XMULLW,
189220
ssa.OpS390XMULHD, ssa.OpS390XMULHDU,
@@ -553,7 +584,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
553584
p.To.Reg = v.Reg()
554585
case ssa.OpS390XInvertFlags:
555586
v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
556-
case ssa.OpS390XFlagEQ, ssa.OpS390XFlagLT, ssa.OpS390XFlagGT:
587+
case ssa.OpS390XFlagEQ, ssa.OpS390XFlagLT, ssa.OpS390XFlagGT, ssa.OpS390XFlagOV:
557588
v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
558589
case ssa.OpS390XAddTupleFirst32, ssa.OpS390XAddTupleFirst64:
559590
v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())

src/cmd/compile/internal/ssa/gen/S390X.rules

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,18 @@
119119
(Bswap64 x) -> (MOVDBR x)
120120
(Bswap32 x) -> (MOVWBR x)
121121

122+
// add with carry
123+
(Select0 (Add64carry x y c))
124+
-> (Select0 <typ.UInt64> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))
125+
(Select1 (Add64carry x y c))
126+
-> (Select0 <typ.UInt64> (ADDE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (ADDE x y (Select1 <types.TypeFlags> (ADDCconst c [-1]))))))
127+
128+
// subtract with borrow
129+
(Select0 (Sub64borrow x y c))
130+
-> (Select0 <typ.UInt64> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c))))
131+
(Select1 (Sub64borrow x y c))
132+
-> (NEG (Select0 <typ.UInt64> (SUBE (MOVDconst [0]) (MOVDconst [0]) (Select1 <types.TypeFlags> (SUBE x y (Select1 <types.TypeFlags> (SUBC (MOVDconst [0]) c)))))))
133+
122134
// math package intrinsics
123135
(Sqrt x) -> (FSQRT x)
124136
(Floor x) -> (FIDBR [7] x)
@@ -1121,6 +1133,43 @@
11211133
(MOVBreg (ANDWconst [m] x)) && int8(m) >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64( uint8(m))] x))
11221134
(MOVHreg (ANDWconst [m] x)) && int16(m) >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(uint16(m))] x))
11231135

1136+
// carry flag generation
1137+
// (only constant fold carry of zero)
1138+
(Select1 (ADDCconst (MOVDconst [c]) [d]))
1139+
&& uint64(c+d) >= uint64(c) && c+d == 0
1140+
-> (FlagEQ)
1141+
(Select1 (ADDCconst (MOVDconst [c]) [d]))
1142+
&& uint64(c+d) >= uint64(c) && c+d != 0
1143+
-> (FlagLT)
1144+
1145+
// borrow flag generation
1146+
// (only constant fold borrow of zero)
1147+
(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
1148+
&& uint64(d) <= uint64(c) && c-d == 0
1149+
-> (FlagGT)
1150+
(Select1 (SUBC (MOVDconst [c]) (MOVDconst [d])))
1151+
&& uint64(d) <= uint64(c) && c-d != 0
1152+
-> (FlagOV)
1153+
1154+
// add with carry
1155+
(ADDE x y (FlagEQ)) -> (ADDC x y)
1156+
(ADDE x y (FlagLT)) -> (ADDC x y)
1157+
(ADDC x (MOVDconst [c])) && is16Bit(c) -> (ADDCconst x [c])
1158+
(Select0 (ADDCconst (MOVDconst [c]) [d])) -> (MOVDconst [c+d])
1159+
1160+
// subtract with borrow
1161+
(SUBE x y (FlagGT)) -> (SUBC x y)
1162+
(SUBE x y (FlagOV)) -> (SUBC x y)
1163+
(Select0 (SUBC (MOVDconst [c]) (MOVDconst [d]))) -> (MOVDconst [c-d])
1164+
1165+
// collapse carry chain
1166+
(ADDE x y (Select1 (ADDCconst [-1] (Select0 (ADDE (MOVDconst [0]) (MOVDconst [0]) c)))))
1167+
-> (ADDE x y c)
1168+
1169+
// collapse borrow chain
1170+
(SUBE x y (Select1 (SUBC (MOVDconst [0]) (NEG (Select0 (SUBE (MOVDconst [0]) (MOVDconst [0]) c))))))
1171+
-> (SUBE x y c)
1172+
11241173
// fused multiply-add
11251174
(FADD (FMUL y z) x) -> (FMADD x y z)
11261175
(FADDS (FMULS y z) x) -> (FMADDS x y z)

0 commit comments

Comments
 (0)