Skip to content

Commit 3bdc2f3

Browse files
committed
cmd/compile/internal/gc: speed up small array comparison
Currently we inline array comparisons only for arrays with at most 4 elements. Compare small arrays that have more than 4 elements (e.g. [16]byte) using larger compares. This produces very slightly smaller binaries and results in faster code.

ArrayEqual-6 7.41ns ± 0% 3.17ns ± 0% -57.15% (p=0.000 n=10+10)

For go tool: global text (code) = -559 bytes (-0.014566%)

This also helps mapaccess1_faststr, and maps in general:

MapDelete/Str/1-6 195ns ± 1% 186ns ± 2% -4.47% (p=0.000 n=10+10)
MapDelete/Str/2-6 211ns ± 1% 177ns ± 1% -16.01% (p=0.000 n=10+10)
MapDelete/Str/4-6 225ns ± 1% 183ns ± 1% -18.49% (p=0.000 n=8+10)
MapStringKeysEight_16-6 31.3ns ± 0% 28.6ns ± 0% -8.63% (p=0.000 n=6+9)
MapStringKeysEight_32-6 29.2ns ± 0% 27.6ns ± 0% -5.45% (p=0.000 n=10+10)
MapStringKeysEight_64-6 29.1ns ± 1% 27.5ns ± 0% -5.46% (p=0.000 n=10+10)
MapStringKeysEight_1M-6 29.1ns ± 1% 27.6ns ± 0% -5.49% (p=0.000 n=10+10)

Change-Id: I9ec98e41b233031e0e96c4e13d86a324f628ed4a
Reviewed-on: https://go-review.googlesource.com/40771
Run-TryBot: Ilya Tocar <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
1 parent 2f73efa commit 3bdc2f3

File tree

3 files changed

+93
-9
lines changed

3 files changed

+93
-9
lines changed

src/cmd/compile/internal/gc/asm_test.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ var allAsmTests = []*asmTests{
166166
{
167167
arch: "amd64",
168168
os: "linux",
169-
imports: []string{"encoding/binary", "math/bits"},
169+
imports: []string{"encoding/binary", "math/bits", "unsafe"},
170170
tests: linuxAMD64Tests,
171171
},
172172
{
@@ -869,6 +869,35 @@ var linuxAMD64Tests = []*asmTest{
869869
}`,
870870
[]string{"\tRORB\t"},
871871
},
872+
// Check that array compare uses 2/4/8 byte compares
873+
{
874+
`
875+
func f68(a,b [2]byte) bool {
876+
return a == b
877+
}`,
878+
[]string{"\tCMPW\t[A-Z]"},
879+
},
880+
{
881+
`
882+
func f69(a,b [3]uint16) bool {
883+
return a == b
884+
}`,
885+
[]string{"\tCMPL\t[A-Z]"},
886+
},
887+
{
888+
`
889+
func f70(a,b [15]byte) bool {
890+
return a == b
891+
}`,
892+
[]string{"\tCMPQ\t[A-Z]"},
893+
},
894+
{
895+
`
896+
func f71(a,b unsafe.Pointer) bool { // This was a TODO in mapaccess1_faststr
897+
return *((*[4]byte)(a)) != *((*[4]byte)(b))
898+
}`,
899+
[]string{"\tCMPL\t[A-Z]"},
900+
},
872901
}
873902

874903
var linux386Tests = []*asmTest{

src/cmd/compile/internal/gc/walk.go

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3243,11 +3243,25 @@ func walkcompare(n *Node, init *Nodes) *Node {
32433243
// inline or call an eq alg.
32443244
t := n.Left.Type
32453245
var inline bool
3246+
3247+
maxcmpsize := int64(4)
3248+
unalignedLoad := false
3249+
switch thearch.LinkArch.Family {
3250+
case sys.AMD64, sys.ARM64, sys.S390X:
3251+
// Keep this low enough, to generate less code than function call.
3252+
maxcmpsize = 16
3253+
unalignedLoad = true
3254+
case sys.I386:
3255+
maxcmpsize = 8
3256+
unalignedLoad = true
3257+
}
3258+
32463259
switch t.Etype {
32473260
default:
32483261
return n
32493262
case TARRAY:
3250-
inline = t.NumElem() <= 1 || (t.NumElem() <= 4 && issimple[t.Elem().Etype])
3263+
// We can compare several elements at once with 2/4/8 byte integer compares
3264+
inline = t.NumElem() <= 1 || (issimple[t.Elem().Etype] && (t.NumElem() <= 4 || t.Elem().Width*t.NumElem() <= maxcmpsize))
32513265
case TSTRUCT:
32523266
inline = t.NumFields() <= 4
32533267
}
@@ -3333,11 +3347,54 @@ func walkcompare(n *Node, init *Nodes) *Node {
33333347
)
33343348
}
33353349
} else {
3336-
for i := 0; int64(i) < t.NumElem(); i++ {
3337-
compare(
3338-
nod(OINDEX, cmpl, nodintconst(int64(i))),
3339-
nod(OINDEX, cmpr, nodintconst(int64(i))),
3340-
)
3350+
step := int64(1)
3351+
remains := t.NumElem() * t.Elem().Width
3352+
combine64bit := unalignedLoad && Widthreg == 8 && t.Elem().Width <= 4 && t.Elem().IsInteger()
3353+
combine32bit := unalignedLoad && t.Elem().Width <= 2 && t.Elem().IsInteger()
3354+
combine16bit := unalignedLoad && t.Elem().Width == 1 && t.Elem().IsInteger()
3355+
for i := int64(0); remains > 0; {
3356+
var convType *types.Type
3357+
switch {
3358+
case remains >= 8 && combine64bit:
3359+
convType = types.Types[TINT64]
3360+
step = 8 / t.Elem().Width
3361+
case remains >= 4 && combine32bit:
3362+
convType = types.Types[TUINT32]
3363+
step = 4 / t.Elem().Width
3364+
case remains >= 2 && combine16bit:
3365+
convType = types.Types[TUINT16]
3366+
step = 2 / t.Elem().Width
3367+
default:
3368+
step = 1
3369+
}
3370+
if step == 1 {
3371+
compare(
3372+
nod(OINDEX, cmpl, nodintconst(int64(i))),
3373+
nod(OINDEX, cmpr, nodintconst(int64(i))),
3374+
)
3375+
i++
3376+
remains -= t.Elem().Width
3377+
} else {
3378+
cmplw := nod(OINDEX, cmpl, nodintconst(int64(i)))
3379+
cmplw = conv(cmplw, convType)
3380+
cmprw := nod(OINDEX, cmpr, nodintconst(int64(i)))
3381+
cmprw = conv(cmprw, convType)
3382+
// For code like this: uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 ...
3383+
// ssa will generate a single large load.
3384+
for offset := int64(1); offset < step; offset++ {
3385+
lb := nod(OINDEX, cmpl, nodintconst(int64(i+offset)))
3386+
lb = conv(lb, convType)
3387+
lb = nod(OLSH, lb, nodintconst(int64(8*t.Elem().Width*offset)))
3388+
cmplw = nod(OOR, cmplw, lb)
3389+
rb := nod(OINDEX, cmpr, nodintconst(int64(i+offset)))
3390+
rb = conv(rb, convType)
3391+
rb = nod(OLSH, rb, nodintconst(int64(8*t.Elem().Width*offset)))
3392+
cmprw = nod(OOR, cmprw, rb)
3393+
}
3394+
compare(cmplw, cmprw)
3395+
i += step
3396+
remains -= step * t.Elem().Width
3397+
}
33413398
}
33423399
}
33433400
if expr == nil {

src/runtime/hashmap_fast.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,6 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
252252
return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
253253
}
254254
// check first 4 bytes
255-
// TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
256-
// four 1-byte comparisons.
257255
if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
258256
continue
259257
}

0 commit comments

Comments
 (0)