-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Description
The generated x86 code can be improved in some fairly simple ways - hoisting computed constants out of loop bodies, and avoiding unnecessary register moves - that have a significant performance impact on tight loops. In the following example those improvements produce a 35% speedup.
Here is an alternate, DFA-based implementation of utf8.Valid that I have been playing with:
func Valid(x []byte) bool {
state := uint64(1 * 6)
for _, b := range x {
state = dfa[b] >> (state & 63)
}
return (state & 63) == 1*6
}
const (
s00 = 0 | (1*6)<<(1*6)
sC0 = 0
sC2 = 0 | (2*6)<<(1*6)
sE0 = 0 | (3*6)<<(1*6)
sE1 = 0 | (4*6)<<(1*6)
sED = 0 | (5*6)<<(1*6)
sEE = sE1
sF0 = 0 | (6*6)<<(1*6)
sF1 = 0 | (7*6)<<(1*6)
sF4 = 0 | (8*6)<<(1*6)
sF5 = 0
s80 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(7*6) | (4*6)<<(8*6)
s90 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
sA0 = 0 | (1*6)<<(2*6) | (2*6)<<(3*6) | (2*6)<<(4*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
)
var dfa = [256]uint64{
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80,
s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90,
sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
sC0, sC0, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
sE0, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sED, sEE, sEE,
sF0, sF1, sF1, sF1, sF4, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5,
}
There are no big benchmarks of Valid in the package, but here are some that could be added:
var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)
func BenchmarkValid1K(b *testing.B) {
b.SetBytes(int64(len(valid1k)))
for i := 0; i < b.N; i++ {
Valid(valid1k)
}
}
func BenchmarkValid1M(b *testing.B) {
b.SetBytes(int64(len(valid1M)))
for i := 0; i < b.N; i++ {
Valid(valid1M)
}
}
The old Valid implementation runs at around 1450 MB/s.
The implementation above runs at around 1600 MB/s.
Better but not what I had hoped.
It compiles as follows:
% go test -c && go tool objdump -s 'utf8.Valid$' utf8.test
TEXT unicode/utf8.Valid(SB) /Users/rsc/go/src/unicode/utf8/utf8.go
utf8.go:647 0x10789c0 4889442408 MOVQ AX, 0x8(SP)
utf8.go:649 0x10789c5 31c9 XORL CX, CX
utf8.go:649 0x10789c7 ba06000000 MOVL $0x6, DX
utf8.go:649 0x10789cc eb1f JMP 0x10789ed
utf8.go:649 0x10789ce 0fb63408 MOVZX 0(AX)(CX*1), SI
utf8.go:649 0x10789d2 488d7901 LEAQ 0x1(CX), DI
utf8.go:650 0x10789d6 4c8d0543ef1600 LEAQ unicode/utf8.dfa(SB), R8
utf8.go:650 0x10789dd 498b34f0 MOVQ 0(R8)(SI*8), SI
utf8.go:650 0x10789e1 4889d1 MOVQ DX, CX
utf8.go:650 0x10789e4 48d3ee SHRQ CL, SI
utf8.go:649 0x10789e7 4889f9 MOVQ DI, CX
utf8.go:652 0x10789ea 4889f2 MOVQ SI, DX
utf8.go:649 0x10789ed 4839cb CMPQ CX, BX
utf8.go:649 0x10789f0 7fdc JG 0x10789ce
utf8.go:652 0x10789f2 4883e23f ANDQ $0x3f, DX
utf8.go:652 0x10789f6 4883fa06 CMPQ $0x6, DX
utf8.go:652 0x10789fa 0f94c0 SETE AL
utf8.go:652 0x10789fd c3 RET
:-1 0x10789fe cc INT $0x3
:-1 0x10789ff cc INT $0x3
%
Translating this to proper non-regabi assembly I get:
#include "textflag.h"
TEXT ·Valid(SB),NOSPLIT,$0
MOVQ x_base+0(FP),AX // p = &x[0]
MOVQ x_len+8(FP),BX // n = len(x)
XORL CX, CX // i = 0
MOVL $6, DX
JMP loop
body:
MOVBLZX (AX)(CX*1), SI // b = p[i]
LEAQ 1(CX), DI // j = i+1
LEAQ ·dfa(SB), R8
MOVQ (R8)(SI*8), SI // t = dfa[b]
MOVQ DX, CX // CX = state
SHRQ CX, SI // t >>= state&63
MOVQ DI, CX // i = j
MOVQ SI, DX // state = t
loop:
CMPQ CX, BX
JLT body
ANDL $0x3f, DX
CMPL DX, $6
SETEQ AL
MOVB AX, ret+24(FP)
RET
This runs also at about 1600 MB/s.
First optimization: the LEAQ ·dfa(SB), R8 should be hoisted out of the loop body.
(I tried to do this in the Go version with dfa := &dfa but it got constant propagated away!)
That change brings it up to 1750 MB/s.
Second optimization: use DI for i instead of CX, to avoid the pressure on CX.
This lets the LEAQ 1(CX), DI and the later MOVQ DI, CX collapse to just LEAQ 1(DI), DI.
That change brings it up to 1900 MB/s.
The body is now:
body:
MOVBLZX (AX)(DI*1), SI // b = p[i]
LEAQ 1(DI), DI // i++
MOVQ (R8)(SI*8), SI // t = dfa[b]
MOVQ DX, CX // CX = state
SHRQ CX, SI // t >>= state&63
MOVQ SI, DX // state = t
Third optimization: since DX is moving into CX, do that one instruction earlier, allowing the use of SI to be optimized into DX to eliminate the final MOVQ:
body:
MOVBLZX (AX)(DI*1), SI // b = p[i]
LEAQ 1(DI), DI // i++
MOVQ DX, CX // CX = state
MOVQ (R8)(SI*8), DX // state = dfa[b]
SHRQ CX, DX // state >>= CX&63
I think this ends up being just "compute the shift amount before the shifted value".
That change brings it up to 2150 MB/s.
This is still a direct translation of the Go code: there are no tricks the compiler couldn't do.
For this particular loop, the optimizations make the code run 35% faster.
Final assembly:
#include "textflag.h"
TEXT ·Valid(SB),NOSPLIT,$0
MOVQ x_base+0(FP),AX // p = &x[0]
MOVQ x_len+8(FP),BX // n = len(x)
XORL DI, DI // i = 0
MOVL $6, DX
LEAQ ·dfa(SB), R8
JMP loop
body:
MOVBLZX (AX)(DI*1), SI // b = p[i]
LEAQ 1(DI), DI // i++
MOVQ DX, CX // CX = state
MOVQ (R8)(SI*8), DX // t = dfa[b]
SHRQ CX, DX // t >>= state&63
loop:
CMPQ DI, BX
JLT body
ANDL $0x3f, DX
CMPL DX, $6
SETEQ AL
MOVB AX, ret+24(FP)
RET
Metadata
Metadata
Assignees
Labels
Type
Projects
Status