Skip to content

Commit 98aa978

Browse files
Ruixin Bao, mundaym
Ruixin Bao
authored and committed
cmd/compile: add math/bits.Mul64 intrinsic on s390x
This change adds an intrinsic for Mul64 on s390x. To achieve that, a new assembly instruction, MLGR, is introduced in s390x/asmz.go. This assembly instruction maps directly onto an existing instruction on Z: it multiplies two 64-bit unsigned integers and stores the 128-bit result in two separate registers. In this case, we require the multiplicand to be stored in register R3 and the output result (the high and low 64 bits of the product) to be stored in R2 and R3 respectively.

A test case is also added.

Benchmark:

name      old time/op  new time/op  delta
Mul-18    11.1ns ± 0%   1.4ns ± 0%  -87.39%  (p=0.002 n=8+10)
Mul32-18  2.07ns ± 0%  2.07ns ± 0%     ~     (all equal)
Mul64-18  11.1ns ± 1%   1.4ns ± 0%  -87.42%  (p=0.000 n=10+10)

Change-Id: Ieca6ad1f61fff9a48a31d50bbd3f3c6d9e6675c1
Reviewed-on: https://go-review.googlesource.com/c/go/+/194572
Reviewed-by: Michael Munday <[email protected]>
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 03f6365 commit 98aa978

File tree

11 files changed

+69
-2
lines changed

11 files changed

+69
-2
lines changed

src/cmd/asm/internal/asm/testdata/s390x.s

+1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
109109
MULHD R7, R2, R1 // b90400b2b98600a7ebb7003f000ab98000b2b90900abebb2003f000ab98000b7b9e9b01a
110110
MULHDU R3, R4 // b90400b4b98600a3b904004a
111111
MULHDU R5, R6, R7 // b90400b6b98600a5b904007a
112+
MLGR R1, R2 // b9860021
112113
DIVD R1, R2 // b90400b2b90d00a1b904002b
113114
DIVD R1, R2, R3 // b90400b2b90d00a1b904003b
114115
DIVW R4, R5 // b90400b5b91d00a4b904005b

src/cmd/compile/internal/gc/ssa.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -3600,8 +3600,8 @@ func init() {
36003600
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
36013601
return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1])
36023602
},
3603-
sys.AMD64, sys.ARM64, sys.PPC64)
3604-
alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64)
3603+
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X)
3604+
alias("math/bits", "Mul", "math/bits", "Mul64", sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64, sys.ArchS390X)
36053605
addF("math/bits", "Add64",
36063606
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
36073607
return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])

src/cmd/compile/internal/s390x/ssa.go

+13
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
225225
v.Fatalf("input[0] and output not in same register %s", v.LongString())
226226
}
227227
opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
228+
case ssa.OpS390XMLGR:
229+
// MLGR Rx R3 -> R2:R3
230+
r0 := v.Args[0].Reg()
231+
r1 := v.Args[1].Reg()
232+
if r1 != s390x.REG_R3 {
233+
v.Fatalf("We require the multiplcand to be stored in R3 for MLGR %s", v.LongString())
234+
}
235+
p := s.Prog(s390x.AMLGR)
236+
p.From.Type = obj.TYPE_REG
237+
p.From.Reg = r0
238+
p.To.Reg = s390x.REG_R2
239+
p.To.Type = obj.TYPE_REG
240+
228241
case ssa.OpS390XFMADD, ssa.OpS390XFMADDS,
229242
ssa.OpS390XFMSUB, ssa.OpS390XFMSUBS:
230243
r := v.Reg()

src/cmd/compile/internal/ssa/gen/S390X.rules

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
(Mul(32|16|8) x y) -> (MULLW x y)
1818
(Mul32F x y) -> (FMULS x y)
1919
(Mul64F x y) -> (FMUL x y)
20+
(Mul64uhilo x y) -> (MLGR x y)
2021

2122
(Div32F x y) -> (FDIVS x y)
2223
(Div64F x y) -> (FDIV x y)

src/cmd/compile/internal/ssa/gen/S390XOps.go

+13
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,19 @@ func init() {
568568
clobberFlags: true,
569569
},
570570

571+
// unsigned multiplication (64x64 → 128)
572+
//
573+
// Multiply the two 64-bit input operands together and place the 128-bit result into
574+
// an even-odd register pair. The second register in the target pair also contains
575+
// one of the input operands. Since we don't currently have a way to specify an
576+
// even-odd register pair we hardcode this register pair as R2:R3.
577+
{
578+
name: "MLGR",
579+
argLength: 2,
580+
reg: regInfo{inputs: []regMask{gp, r3}, outputs: []regMask{r2, r3}},
581+
asm: "MLGR",
582+
},
583+
571584
// pseudo operations to sum the output of the POPCNT instruction
572585
{name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
573586
{name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow

src/cmd/compile/internal/ssa/opGen.go

+16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/rewriteS390X.go

+15
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/internal/obj/s390x/a.out.go

+1
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ const (
240240
AMULLD
241241
AMULHD
242242
AMULHDU
243+
AMLGR
243244
ASUB
244245
ASUBC
245246
ASUBV

src/cmd/internal/obj/s390x/anames.go

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/internal/obj/s390x/asmz.go

+4
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ var optab = []Optab{
174174
{i: 12, as: ASUB, a1: C_LAUTO, a6: C_REG},
175175
{i: 4, as: AMULHD, a1: C_REG, a6: C_REG},
176176
{i: 4, as: AMULHD, a1: C_REG, a2: C_REG, a6: C_REG},
177+
{i: 62, as: AMLGR, a1: C_REG, a6: C_REG},
177178
{i: 2, as: ADIVW, a1: C_REG, a2: C_REG, a6: C_REG},
178179
{i: 2, as: ADIVW, a1: C_REG, a6: C_REG},
179180
{i: 10, as: ASUB, a1: C_REG, a2: C_REG, a6: C_REG},
@@ -3407,6 +3408,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
34073408
d2 := c.regoff(&p.To)
34083409
zRXE(opcode, uint32(p.From.Reg), 0, 0, uint32(d2), 0, asm)
34093410

3411+
case 62: // equivalent of Mul64 in math/bits
3412+
zRRE(op_MLGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
3413+
34103414
case 66:
34113415
zRR(op_BCR, 0, 0, asm)
34123416

test/codegen/mathbits.go

+2
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,7 @@ func Mul(x, y uint) (hi, lo uint) {
557557
// arm64:"UMULH","MUL"
558558
// ppc64:"MULHDU","MULLD"
559559
// ppc64le:"MULHDU","MULLD"
560+
// s390x:"MLGR"
560561
return bits.Mul(x, y)
561562
}
562563

@@ -565,6 +566,7 @@ func Mul64(x, y uint64) (hi, lo uint64) {
565566
// arm64:"UMULH","MUL"
566567
// ppc64:"MULHDU","MULLD"
567568
// ppc64le:"MULHDU","MULLD"
569+
// s390x:"MLGR"
568570
return bits.Mul64(x, y)
569571
}
570572

0 commit comments

Comments
 (0)