Skip to content

Commit ecd9e8a

Browse files
mrosier-qdt and cherrymui
authored and committed
cmd/compile/internal/ssa: combine zero stores into larger stores on arm64
This reduces the go tool binary on arm64 by 12k.

go1 results on Amberwing:

name                   old time/op    new time/op    delta
RegexpMatchEasy0_32    249ns ± 0%     249ns ± 0%     ~        (p=0.087 n=10+10)
RegexpMatchEasy0_1K    584ns ± 0%     584ns ± 0%     ~        (all equal)
RegexpMatchEasy1_32    246ns ± 0%     246ns ± 0%     ~        (p=1.000 n=10+10)
RegexpMatchEasy1_1K    806ns ± 0%     806ns ± 0%     ~        (p=0.706 n=10+9)
RegexpMatchMedium_32   314ns ± 0%     314ns ± 0%     ~        (all equal)
RegexpMatchMedium_1K   52.1µs ± 0%    52.1µs ± 0%    ~        (p=0.245 n=10+8)
RegexpMatchHard_32     2.75µs ± 1%    2.75µs ± 1%    ~        (p=0.690 n=10+10)
RegexpMatchHard_1K     78.9µs ± 0%    78.9µs ± 1%    ~        (p=0.295 n=9+9)
FmtFprintfEmpty        58.5ns ± 0%    58.5ns ± 0%    ~        (all equal)
FmtFprintfString       112ns ± 0%     112ns ± 0%     ~        (all equal)
FmtFprintfInt          117ns ± 0%     116ns ± 0%     -0.85%   (p=0.000 n=10+10)
FmtFprintfIntInt       181ns ± 0%     181ns ± 0%     ~        (all equal)
FmtFprintfPrefixedInt  222ns ± 0%     224ns ± 0%     +0.90%   (p=0.000 n=9+10)
FmtFprintfFloat        318ns ± 1%     322ns ± 0%     ~        (p=0.059 n=10+8)
FmtManyArgs            736ns ± 1%     735ns ± 0%     ~        (p=0.206 n=9+9)
Gzip                   437ms ± 0%     436ms ± 0%     -0.25%   (p=0.000 n=10+10)
HTTPClientServer       89.8µs ± 1%    90.2µs ± 2%    ~        (p=0.393 n=10+10)
JSONEncode             20.1ms ± 1%    20.2ms ± 1%    ~        (p=0.065 n=9+10)
JSONDecode             94.2ms ± 1%    93.9ms ± 1%    -0.42%   (p=0.043 n=10+10)
GobDecode              12.7ms ± 1%    12.8ms ± 2%    +0.94%   (p=0.019 n=10+10)
GobEncode              12.1ms ± 0%    12.1ms ± 0%    ~        (p=0.052 n=10+10)
Mandelbrot200          5.06ms ± 0%    5.05ms ± 0%    -0.04%   (p=0.000 n=9+10)
TimeParse              450ns ± 3%     446ns ± 0%     ~        (p=0.238 n=10+9)
TimeFormat             485ns ± 1%     483ns ± 1%     ~        (p=0.073 n=10+10)
Template               90.4ms ± 0%    90.7ms ± 0%    +0.29%   (p=0.000 n=8+10)
GoParse                6.01ms ± 0%    6.03ms ± 0%    +0.35%   (p=0.000 n=10+10)
BinaryTree17           11.7s ± 0%     11.7s ± 0%     ~        (p=0.481 n=10+10)
Revcomp                669ms ± 0%     669ms ± 0%     ~        (p=0.315 n=10+10)
Fannkuch11             3.40s ± 0%     3.37s ± 0%     -0.92%   (p=0.000 n=10+10)
[Geo mean]             67.9µs         67.9µs         +0.02%

name                   old speed      new speed      delta
RegexpMatchEasy0_32    128MB/s ± 0%   128MB/s ± 0%   -0.08%   (p=0.003 n=8+10)
RegexpMatchEasy0_1K    1.75GB/s ± 0%  1.75GB/s ± 0%  ~        (p=0.642 n=8+10)
RegexpMatchEasy1_32    130MB/s ± 0%   130MB/s ± 0%   ~        (p=0.690 n=10+9)
RegexpMatchEasy1_1K    1.27GB/s ± 0%  1.27GB/s ± 0%  ~        (p=0.661 n=10+9)
RegexpMatchMedium_32   3.18MB/s ± 0%  3.18MB/s ± 0%  ~        (all equal)
RegexpMatchMedium_1K   19.7MB/s ± 0%  19.6MB/s ± 0%  ~        (p=0.190 n=10+9)
RegexpMatchHard_32     11.6MB/s ± 0%  11.6MB/s ± 1%  ~        (p=0.669 n=10+10)
RegexpMatchHard_1K     13.0MB/s ± 0%  13.0MB/s ± 0%  ~        (p=0.718 n=9+9)
Gzip                   44.4MB/s ± 0%  44.5MB/s ± 0%  +0.24%   (p=0.000 n=10+10)
JSONEncode             96.5MB/s ± 1%  96.1MB/s ± 1%  ~        (p=0.065 n=9+10)
JSONDecode             20.6MB/s ± 1%  20.7MB/s ± 1%  +0.42%   (p=0.041 n=10+10)
GobDecode              60.6MB/s ± 1%  60.0MB/s ± 2%  -0.92%   (p=0.016 n=10+10)
GobEncode              63.4MB/s ± 0%  63.6MB/s ± 0%  ~        (p=0.055 n=10+10)
Template               21.5MB/s ± 0%  21.4MB/s ± 0%  -0.30%   (p=0.000 n=9+10)
GoParse                9.64MB/s ± 0%  9.61MB/s ± 0%  -0.36%   (p=0.000 n=10+10)
Revcomp                380MB/s ± 0%   380MB/s ± 0%   ~        (p=0.323 n=10+10)
[Geo mean]             56.0MB/s       55.9MB/s       -0.07%

Change-Id: Ia732fa57fbcf4767d72382516d9f16705d177736
Reviewed-on: https://go-review.googlesource.com/96435
Run-TryBot: Cherry Zhang <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
1 parent 3a9e444 commit ecd9e8a

File tree

4 files changed

+376
-0
lines changed

4 files changed

+376
-0
lines changed

src/cmd/compile/internal/gc/asm_test.go

+226
Original file line number | Diff line number | Diff line change
@@ -2971,6 +2971,232 @@ var linuxARM64Tests = []*asmTest{
29712971
`,
29722972
pos: []string{"\tCSEL\t"},
29732973
},
2974+
// Check that zero stores are combined into larger stores
2975+
{
2976+
fn: `
2977+
func $(b []byte) {
2978+
_ = b[1] // early bounds check to guarantee safety of writes below
2979+
b[0] = 0
2980+
b[1] = 0
2981+
}
2982+
`,
2983+
pos: []string{"MOVH\tZR"},
2984+
neg: []string{"MOVB"},
2985+
},
2986+
{
2987+
fn: `
2988+
func $(b []byte) {
2989+
_ = b[1] // early bounds check to guarantee safety of writes below
2990+
b[1] = 0
2991+
b[0] = 0
2992+
}
2993+
`,
2994+
pos: []string{"MOVH\tZR"},
2995+
neg: []string{"MOVB"},
2996+
},
2997+
{
2998+
fn: `
2999+
func $(b []byte) {
3000+
_ = b[3] // early bounds check to guarantee safety of writes below
3001+
b[0] = 0
3002+
b[1] = 0
3003+
b[2] = 0
3004+
b[3] = 0
3005+
}
3006+
`,
3007+
pos: []string{"MOVW\tZR"},
3008+
neg: []string{"MOVB", "MOVH"},
3009+
},
3010+
{
3011+
fn: `
3012+
func $(b []byte) {
3013+
_ = b[3] // early bounds check to guarantee safety of writes below
3014+
b[2] = 0
3015+
b[3] = 0
3016+
b[1] = 0
3017+
b[0] = 0
3018+
}
3019+
`,
3020+
pos: []string{"MOVW\tZR"},
3021+
neg: []string{"MOVB", "MOVH"},
3022+
},
3023+
{
3024+
fn: `
3025+
func $(h []uint16) {
3026+
_ = h[1] // early bounds check to guarantee safety of writes below
3027+
h[0] = 0
3028+
h[1] = 0
3029+
}
3030+
`,
3031+
pos: []string{"MOVW\tZR"},
3032+
neg: []string{"MOVB", "MOVH"},
3033+
},
3034+
{
3035+
fn: `
3036+
func $(h []uint16) {
3037+
_ = h[1] // early bounds check to guarantee safety of writes below
3038+
h[1] = 0
3039+
h[0] = 0
3040+
}
3041+
`,
3042+
pos: []string{"MOVW\tZR"},
3043+
neg: []string{"MOVB", "MOVH"},
3044+
},
3045+
{
3046+
fn: `
3047+
func $(b []byte) {
3048+
_ = b[7] // early bounds check to guarantee safety of writes below
3049+
b[0] = 0
3050+
b[1] = 0
3051+
b[2] = 0
3052+
b[3] = 0
3053+
b[4] = 0
3054+
b[5] = 0
3055+
b[6] = 0
3056+
b[7] = 0
3057+
}
3058+
`,
3059+
pos: []string{"MOVD\tZR"},
3060+
neg: []string{"MOVB", "MOVH", "MOVW"},
3061+
},
3062+
{
3063+
fn: `
3064+
func $(h []uint16) {
3065+
_ = h[3] // early bounds check to guarantee safety of writes below
3066+
h[0] = 0
3067+
h[1] = 0
3068+
h[2] = 0
3069+
h[3] = 0
3070+
}
3071+
`,
3072+
pos: []string{"MOVD\tZR"},
3073+
neg: []string{"MOVB", "MOVH", "MOVW"},
3074+
},
3075+
{
3076+
fn: `
3077+
func $(h []uint16) {
3078+
_ = h[3] // early bounds check to guarantee safety of writes below
3079+
h[2] = 0
3080+
h[3] = 0
3081+
h[1] = 0
3082+
h[0] = 0
3083+
}
3084+
`,
3085+
pos: []string{"MOVD\tZR"},
3086+
neg: []string{"MOVB", "MOVH", "MOVW"},
3087+
},
3088+
{
3089+
fn: `
3090+
func $(w []uint32) {
3091+
_ = w[1] // early bounds check to guarantee safety of writes below
3092+
w[0] = 0
3093+
w[1] = 0
3094+
}
3095+
`,
3096+
pos: []string{"MOVD\tZR"},
3097+
neg: []string{"MOVB", "MOVH", "MOVW"},
3098+
},
3099+
{
3100+
fn: `
3101+
func $(w []uint32) {
3102+
_ = w[1] // early bounds check to guarantee safety of writes below
3103+
w[1] = 0
3104+
w[0] = 0
3105+
}
3106+
`,
3107+
pos: []string{"MOVD\tZR"},
3108+
neg: []string{"MOVB", "MOVH", "MOVW"},
3109+
},
3110+
{
3111+
fn: `
3112+
func $(b []byte) {
3113+
_ = b[15] // early bounds check to guarantee safety of writes below
3114+
b[0] = 0
3115+
b[1] = 0
3116+
b[2] = 0
3117+
b[3] = 0
3118+
b[4] = 0
3119+
b[5] = 0
3120+
b[6] = 0
3121+
b[7] = 0
3122+
b[8] = 0
3123+
b[9] = 0
3124+
b[10] = 0
3125+
b[11] = 0
3126+
b[12] = 0
3127+
b[13] = 0
3128+
b[15] = 0
3129+
b[14] = 0
3130+
}
3131+
`,
3132+
pos: []string{"STP"},
3133+
neg: []string{"MOVB", "MOVH", "MOVW"},
3134+
},
3135+
{
3136+
fn: `
3137+
func $(h []uint16) {
3138+
_ = h[7] // early bounds check to guarantee safety of writes below
3139+
h[0] = 0
3140+
h[1] = 0
3141+
h[2] = 0
3142+
h[3] = 0
3143+
h[4] = 0
3144+
h[5] = 0
3145+
h[6] = 0
3146+
h[7] = 0
3147+
}
3148+
`,
3149+
pos: []string{"STP"},
3150+
neg: []string{"MOVB", "MOVH"},
3151+
},
3152+
{
3153+
fn: `
3154+
func $(w []uint32) {
3155+
_ = w[3] // early bounds check to guarantee safety of writes below
3156+
w[0] = 0
3157+
w[1] = 0
3158+
w[2] = 0
3159+
w[3] = 0
3160+
}
3161+
`,
3162+
pos: []string{"STP"},
3163+
neg: []string{"MOVB", "MOVH"},
3164+
},
3165+
{
3166+
fn: `
3167+
func $(w []uint32) {
3168+
_ = w[3] // early bounds check to guarantee safety of writes below
3169+
w[1] = 0
3170+
w[0] = 0
3171+
w[3] = 0
3172+
w[2] = 0
3173+
}
3174+
`,
3175+
pos: []string{"STP"},
3176+
neg: []string{"MOVB", "MOVH"},
3177+
},
3178+
{
3179+
fn: `
3180+
func $(d []uint64) {
3181+
_ = d[1] // early bounds check to guarantee safety of writes below
3182+
d[0] = 0
3183+
d[1] = 0
3184+
}
3185+
`,
3186+
pos: []string{"STP"},
3187+
neg: []string{"MOVB", "MOVH"},
3188+
},
3189+
{
3190+
fn: `
3191+
func $(d []uint64) {
3192+
_ = d[1] // early bounds check to guarantee safety of writes below
3193+
d[1] = 0
3194+
d[0] = 0
3195+
}
3196+
`,
3197+
pos: []string{"STP"},
3198+
neg: []string{"MOVB", "MOVH"},
3199+
},
29743200
}
29753201

29763202
var linuxMIPSTests = []*asmTest{

src/cmd/compile/internal/ssa/gen/ARM64.rules

+30
Original file line number | Diff line number | Diff line change
@@ -1439,6 +1439,36 @@
14391439
&& clobber(o4) && clobber(o5) && clobber(s0)
14401440
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [i0] p) mem))
14411441

1442+
// Combine zero stores into larger (unaligned) stores.
//
// Each rule below matches a zero store whose memory argument x is another
// zero store of the same op, and replaces the pair with a single wider
// zero store. A rule fires only when all of the following hold:
//   - x has no other use (x.Uses == 1), so it can be clobbered;
//   - the offsets i and j differ by exactly the store size, in either
//     order (areAdjacentOffsets);
//   - both stores carry the same symbol {s} and isSamePtr(ptr0, ptr1);
//   - the lower offset, min(i,j), satisfies is32Bit.
// The merged store is issued at min(i,j) on the inner store's memory state.
1443+
// Two adjacent 1-byte zero stores -> one MOVHstorezero.
(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
1444+
&& x.Uses == 1
1445+
&& areAdjacentOffsets(i,j,1)
1446+
&& is32Bit(min(i,j))
1447+
&& isSamePtr(ptr0, ptr1)
1448+
&& clobber(x)
1449+
-> (MOVHstorezero [min(i,j)] {s} ptr0 mem)
1450+
// Two adjacent 2-byte zero stores -> one MOVWstorezero.
(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
1451+
&& x.Uses == 1
1452+
&& areAdjacentOffsets(i,j,2)
1453+
&& is32Bit(min(i,j))
1454+
&& isSamePtr(ptr0, ptr1)
1455+
&& clobber(x)
1456+
-> (MOVWstorezero [min(i,j)] {s} ptr0 mem)
1457+
// Two adjacent 4-byte zero stores -> one MOVDstorezero.
(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
1458+
&& x.Uses == 1
1459+
&& areAdjacentOffsets(i,j,4)
1460+
&& is32Bit(min(i,j))
1461+
&& isSamePtr(ptr0, ptr1)
1462+
&& clobber(x)
1463+
-> (MOVDstorezero [min(i,j)] {s} ptr0 mem)
1464+
// Two adjacent 8-byte zero stores -> one MOVQstorezero.
(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
1465+
&& x.Uses == 1
1466+
&& areAdjacentOffsets(i,j,8)
1467+
&& is32Bit(min(i,j))
1468+
&& isSamePtr(ptr0, ptr1)
1469+
&& clobber(x)
1470+
-> (MOVQstorezero [min(i,j)] {s} ptr0 mem)
1471+
14421472
// FP simplification
14431473
(FNEGS (FMULS x y)) -> (FNMULS x y)
14441474
(FNEGD (FMULD x y)) -> (FNMULD x y)

src/cmd/compile/internal/ssa/rewrite.go

+4
Original file line number | Diff line number | Diff line change
@@ -769,6 +769,10 @@ func overlap(offset1, size1, offset2, size2 int64) bool {
769769
return false
770770
}
771771

772+
// areAdjacentOffsets reports whether off1 and off2 are exactly size apart,
// in either order: it is true when off1+size == off2 or off2+size == off1.
func areAdjacentOffsets(off1, off2, size int64) bool {
773+
return off1+size == off2 || off1 == off2+size
774+
}
775+
772776
// check if value zeroes out upper 32-bit of 64-bit register.
773777
// depth limits recursion depth. In AMD64.rules 3 is used as limit,
774778
// because it catches same amount of cases as 4.

0 commit comments

Comments
 (0)