Skip to content

Commit deb4177

Browse files
committed
cmd/compile: use masks instead of branches for slicing
When we do

    var x []byte = ...
    y := x[i:]

We can't just use y.ptr = x.ptr + i, as the new pointer may point to the next object in memory after the backing array. We used to fix this by doing:

    y.cap = x.cap - i
    delta := i
    if y.cap == 0 {
        delta = 0
    }
    y.ptr = x.ptr + delta

That generates a branch in what is otherwise straight-line code.

Better to do:

    y.cap = x.cap - i
    mask := (y.cap - 1) >> 63 // -1 if y.cap==0, 0 otherwise
    y.ptr = x.ptr + i &^ mask

It's about the same number of instructions (~4, depending on what parts are constant, and the target architecture), but it is all inline. It plays nicely with CSE, and the mask can be computed in parallel with the index (in cases where a multiply is required).

It is a minor win in both speed and space.

Change-Id: Ied60465a0b8abb683c02208402e5bb7ac0e8370f
Reviewed-on: https://go-review.googlesource.com/32022
Run-TryBot: Keith Randall <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
1 parent 50f66fb commit deb4177

21 files changed

+325
-56
lines changed

src/cmd/compile/internal/gc/ssa.go

Lines changed: 20 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,6 @@ var (
295295
typVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "typ"}}
296296
idataVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "idata"}}
297297
okVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "ok"}}
298-
deltaVar = Node{Op: ONAME, Class: Pxxx, Sym: &Sym{Name: "delta"}}
299298
)
300299

301300
// startBlock sets the current block we're generating code in to b.
@@ -3516,19 +3515,17 @@ func (s *state) slice(t *Type, v, i, j, k *ssa.Value) (p, l, c *ssa.Value) {
35163515
}
35173516

35183517
// Generate the following code assuming that indexes are in bounds.
3519-
// The conditional is to make sure that we don't generate a slice
3518+
// The masking is to make sure that we don't generate a slice
35203519
// that points to the next object in memory.
3521-
// rlen = j-i
3522-
// rcap = k-i
3523-
// delta = i*elemsize
3524-
// if rcap == 0 {
3525-
// delta = 0
3526-
// }
3527-
// rptr = p+delta
3520+
// rlen = j - i
3521+
// rcap = k - i
3522+
// delta = i * elemsize
3523+
// rptr = p + delta&mask(rcap)
35283524
// result = (SliceMake rptr rlen rcap)
3525+
// where mask(x) is 0 if x==0 and -1 if x>0.
35293526
subOp := s.ssaOp(OSUB, Types[TINT])
3530-
eqOp := s.ssaOp(OEQ, Types[TINT])
35313527
mulOp := s.ssaOp(OMUL, Types[TINT])
3528+
andOp := s.ssaOp(OAND, Types[TINT])
35323529
rlen := s.newValue2(subOp, Types[TINT], j, i)
35333530
var rcap *ssa.Value
35343531
switch {
@@ -3543,38 +3540,21 @@ func (s *state) slice(t *Type, v, i, j, k *ssa.Value) (p, l, c *ssa.Value) {
35433540
rcap = s.newValue2(subOp, Types[TINT], k, i)
35443541
}
35453542

3546-
// delta = # of elements to offset pointer by.
3547-
s.vars[&deltaVar] = i
3548-
3549-
// Generate code to set delta=0 if the resulting capacity is zero.
3550-
if !((i.Op == ssa.OpConst64 && i.AuxInt == 0) ||
3551-
(i.Op == ssa.OpConst32 && int32(i.AuxInt) == 0)) {
3552-
cmp := s.newValue2(eqOp, Types[TBOOL], rcap, zero)
3553-
3554-
b := s.endBlock()
3555-
b.Kind = ssa.BlockIf
3556-
b.Likely = ssa.BranchUnlikely
3557-
b.SetControl(cmp)
3558-
3559-
// Generate block which zeros the delta variable.
3560-
nz := s.f.NewBlock(ssa.BlockPlain)
3561-
b.AddEdgeTo(nz)
3562-
s.startBlock(nz)
3563-
s.vars[&deltaVar] = zero
3564-
s.endBlock()
3565-
3566-
// All done.
3567-
merge := s.f.NewBlock(ssa.BlockPlain)
3568-
b.AddEdgeTo(merge)
3569-
nz.AddEdgeTo(merge)
3570-
s.startBlock(merge)
3571-
3572-
// TODO: use conditional moves somehow?
3543+
var rptr *ssa.Value
3544+
if (i.Op == ssa.OpConst64 || i.Op == ssa.OpConst32) && i.AuxInt == 0 {
3545+
// No pointer arithmetic necessary.
3546+
rptr = ptr
3547+
} else {
3548+
// delta = # of bytes to offset pointer by.
3549+
delta := s.newValue2(mulOp, Types[TINT], i, s.constInt(Types[TINT], elemtype.Width))
3550+
// If we're slicing to the point where the capacity is zero,
3551+
// zero out the delta.
3552+
mask := s.newValue1(ssa.OpSlicemask, Types[TINT], rcap)
3553+
delta = s.newValue2(andOp, Types[TINT], delta, mask)
3554+
// Compute rptr = ptr + delta
3555+
rptr = s.newValue2(ssa.OpAddPtr, ptrtype, ptr, delta)
35733556
}
35743557

3575-
// Compute rptr = ptr + delta * elemsize
3576-
rptr := s.newValue2(ssa.OpAddPtr, ptrtype, ptr, s.newValue2(mulOp, Types[TINT], s.variable(&deltaVar, Types[TINT]), s.constInt(Types[TINT], elemtype.Width)))
3577-
delete(s.vars, &deltaVar)
35783558
return rptr, rlen, rcap
35793559
}
35803560

src/cmd/compile/internal/ssa/gen/386.rules

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,8 @@
101101
(ZeroExt16to32 x) -> (MOVWLZX x)
102102

103103
(Signmask x) -> (SARLconst x [31])
104-
(Zeromask <t> x) -> (XORLconst [-1] (SBBLcarrymask <t> (CMPL x (MOVLconst [1]))))
104+
(Zeromask <t> x) -> (XORLconst [-1] (SBBLcarrymask <t> (CMPLconst x [1])))
105+
(Slicemask <t> x) -> (XORLconst [-1] (SARLconst <t> (SUBLconst <t> x [1]) [31]))
105106

106107
// Lowering truncation
107108
// Because we ignore high parts of registers, truncates are just copies.

src/cmd/compile/internal/ssa/gen/AMD64.rules

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@
125125
(ZeroExt16to64 x) -> (MOVWQZX x)
126126
(ZeroExt32to64 x) -> (MOVLQZX x)
127127

128+
(Slicemask <t> x) -> (XORQconst [-1] (SARQconst <t> (SUBQconst <t> x [1]) [63]))
129+
128130
// Lowering truncation
129131
// Because we ignore high parts of registers, truncates are just copies.
130132
(Trunc16to8 x) -> x

src/cmd/compile/internal/ssa/gen/ARM.rules

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@
207207

208208
(Signmask x) -> (SRAconst x [31])
209209
(Zeromask x) -> (SRAconst (RSBshiftRL <config.fe.TypeInt32()> x x [1]) [31]) // sign bit of uint32(x)>>1 - x
210+
(Slicemask <t> x) -> (MVN (SRAconst <t> (SUBconst <t> x [1]) [31]))
210211

211212
// float <-> int conversion
212213
(Cvt32to32F x) -> (MOVWF x)

src/cmd/compile/internal/ssa/gen/ARM64.rules

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,8 @@
207207
(ConstNil) -> (MOVDconst [0])
208208
(ConstBool [b]) -> (MOVDconst [b])
209209

210+
(Slicemask <t> x) -> (MVN (SRAconst <t> (SUBconst <t> x [1]) [63]))
211+
210212
// truncations
211213
// Because we ignore high parts of registers, truncates are just copies.
212214
(Trunc16to8 x) -> x

src/cmd/compile/internal/ssa/gen/MIPS64.rules

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@
152152
(OrB x y) -> (OR x y)
153153
(EqB x y) -> (XOR (MOVVconst [1]) (XOR <config.fe.TypeBool()> x y))
154154
(NeqB x y) -> (XOR x y)
155-
(Not x) -> (XOR (MOVVconst [1]) x)
155+
(Not x) -> (XORconst [1] x)
156156

157157
// constants
158158
(Const64 [val]) -> (MOVVconst [val])
@@ -164,6 +164,8 @@
164164
(ConstNil) -> (MOVVconst [0])
165165
(ConstBool [b]) -> (MOVVconst [b])
166166

167+
(Slicemask <t> x) -> (NORconst [0] (SRAVconst <t> (SUBVconst <t> x [1]) [63]))
168+
167169
// truncations
168170
// Because we ignore high parts of registers, truncates are just copies.
169171
(Trunc16to8 x) -> x

src/cmd/compile/internal/ssa/gen/PPC64.rules

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,8 @@
790790
(Trunc64to16 x) -> (MOVHreg x)
791791
(Trunc64to32 x) -> (MOVWreg x)
792792

793+
(Slicemask <t> x) -> (XORconst [-1] (SRADconst <t> (ADDconst <t> x [-1]) [63]))
794+
793795
// Note that MOV??reg returns a 64-bit int, x is not necessarily that wide
794796
// This may interact with other patterns in the future. (Compare with arm64)
795797
(MOVBZreg x:(MOVBZload _ _)) -> x

src/cmd/compile/internal/ssa/gen/S390X.rules

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@
152152
(ZeroExt16to64 x) -> (MOVHZreg x)
153153
(ZeroExt32to64 x) -> (MOVWZreg x)
154154

155+
(Slicemask <t> x) -> (XOR (MOVDconst [-1]) (SRADconst <t> (SUBconst <t> x [1]) [63]))
156+
155157
// Lowering truncation
156158
// Because we ignore high parts of registers, truncates are just copies.
157159
(Trunc16to8 x) -> x

src/cmd/compile/internal/ssa/gen/generic.rules

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,11 @@
602602
(Trunc32to16 (And32 (Const32 [y]) x)) && y&0xFFFF == 0xFFFF -> (Trunc32to16 x)
603603
(Trunc16to8 (And16 (Const16 [y]) x)) && y&0xFF == 0xFF -> (Trunc16to8 x)
604604

605+
(Slicemask (Const32 [x])) && x > 0 -> (Const32 [-1])
606+
(Slicemask (Const32 [0])) -> (Const32 [0])
607+
(Slicemask (Const64 [x])) && x > 0 -> (Const64 [-1])
608+
(Slicemask (Const64 [0])) -> (Const64 [0])
609+
605610
// Rewrite AND of consts as shifts if possible, slightly faster for 64 bit operands
606611
// leading zeros can be shifted left, then right
607612
(And64 <t> (Const64 [y]) x) && nlz(y) + nto(y) == 64 && nto(y) >= 32

src/cmd/compile/internal/ssa/gen/genericOps.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ var genericOps = []opData{
437437

438438
{name: "Signmask", argLength: 1, typ: "Int32"}, // 0 if arg0 >= 0, -1 if arg0 < 0
439439
{name: "Zeromask", argLength: 1, typ: "UInt32"}, // 0 if arg0 == 0, 0xffffffff if arg0 != 0
440+
{name: "Slicemask", argLength: 1}, // 0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0. Type is native int size.
440441

441442
{name: "Cvt32Uto32F", argLength: 1}, // uint32 -> float32, only used on 32-bit arch
442443
{name: "Cvt32Uto64F", argLength: 1}, // uint32 -> float64, only used on 32-bit arch

0 commit comments

Comments
 (0)