Skip to content

Commit c4807d4

Browse files
laboger and bradfitz
authored and committed
runtime: improve memmove performance ppc64,ppc64le
This change improves the performance of memmove on ppc64 & ppc64le mainly for moves >=32 bytes. In addition, the test to detect backward moves was enhanced to avoid backward moves if source and dest were in different types of storage, since backward moves might not always be efficient. Fixes #14507 The following shows some of the improvements from the test in the runtime package: BenchmarkMemmove32 4229.56 4717.13 1.12x BenchmarkMemmove64 6156.03 7810.42 1.27x BenchmarkMemmove128 7521.69 12468.54 1.66x BenchmarkMemmove256 6729.90 18260.33 2.71x BenchmarkMemmove512 8521.59 18033.81 2.12x BenchmarkMemmove1024 9760.92 25762.61 2.64x BenchmarkMemmove2048 10241.00 29584.94 2.89x BenchmarkMemmove4096 10399.37 31882.31 3.07x BenchmarkMemmoveUnalignedDst16 1943.69 2258.33 1.16x BenchmarkMemmoveUnalignedDst32 3885.08 3965.81 1.02x BenchmarkMemmoveUnalignedDst64 5121.63 6965.54 1.36x BenchmarkMemmoveUnalignedDst128 7212.34 11372.68 1.58x BenchmarkMemmoveUnalignedDst256 6564.52 16913.59 2.58x BenchmarkMemmoveUnalignedDst512 8364.35 17782.57 2.13x BenchmarkMemmoveUnalignedDst1024 9539.87 24914.72 2.61x BenchmarkMemmoveUnalignedDst2048 9199.23 21235.11 2.31x BenchmarkMemmoveUnalignedDst4096 10077.39 25231.99 2.50x BenchmarkMemmoveUnalignedSrc32 3249.83 3742.52 1.15x BenchmarkMemmoveUnalignedSrc64 5562.35 6627.96 1.19x BenchmarkMemmoveUnalignedSrc128 6023.98 10200.84 1.69x BenchmarkMemmoveUnalignedSrc256 6921.83 15258.43 2.20x BenchmarkMemmoveUnalignedSrc512 8593.13 16541.97 1.93x BenchmarkMemmoveUnalignedSrc1024 9730.95 22927.84 2.36x BenchmarkMemmoveUnalignedSrc2048 9793.28 21537.73 2.20x BenchmarkMemmoveUnalignedSrc4096 10132.96 26295.06 2.60x Change-Id: I73af59970d4c97c728deabb9708b31ec7e01bdf2 Reviewed-on: https://go-review.googlesource.com/21990 Reviewed-by: Bill O'Farrell <[email protected]> Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent 66afbf1 commit c4807d4

File tree

1 file changed

+74
-43
lines changed

1 file changed

+74
-43
lines changed

src/runtime/memmove_ppc64x.s

Lines changed: 74 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
1111
MOVD to+0(FP), R3
1212
MOVD from+8(FP), R4
1313
MOVD n+16(FP), R5
14-
CMP R5, $0
15-
BNE check
16-
RET
1714

15+
// Determine if there are doublewords to
16+
// copy so a more efficient move can be done
1817
check:
19-
ANDCC $7, R5, R7 // R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
20-
SRAD $3, R5, R6 // R6 is the number of words to copy
21-
CMP R6, $0, CR1 // CR1[EQ] is set if there are no words to copy.
22-
23-
CMP R3, R4, CR2
24-
BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward"
18+
ANDCC $7, R5, R7 // R7: bytes to copy
19+
SRAD $3, R5, R6 // R6: double words to copy
20+
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
2521

26-
// Copying forward proceeds by copying R6 words then copying R7 bytes.
27-
// R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
28-
// load/store, R3 and R4 point before the bytes that are to be copied.
22+
// Determine overlap by subtracting dest - src and comparing against the
23+
// length. This catches the cases where src and dest are in different types
24+
// of storage such as stack and static to avoid doing backward move when not
25+
// necessary.
2926

30-
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
31-
32-
MOVD R6, CTR
27+
SUB R4, R3, R8 // dest - src
28+
CMPU R8, R5, CR2 // < len?
29+
BC 12, 8, backward // BLT CR2 backward
3330

34-
SUB $8, R3
35-
SUB $8, R4
31+
// Copying forward if no overlap.
3632

37-
forwardlargeloop:
38-
MOVDU 8(R4), R8
39-
MOVDU R8, 8(R3)
40-
BC 16, 0, forwardlargeloop // "BDNZ"
41-
42-
ADD $8, R3
43-
ADD $8, R4
33+
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
34+
MOVD R6,CTR // R6 = number of double words
35+
SRADCC $2,R6,R8 // 32 byte chunks?
36+
BNE forward32setup //
37+
38+
// Move double words
39+
40+
forward8:
41+
MOVD 0(R4), R8 // double word
42+
ADD $8,R4
43+
MOVD R8, 0(R3) //
44+
ADD $8,R3
45+
BC 16, 0, forward8
46+
BR noforwardlarge // handle remainder
47+
48+
// Prepare for moves of 32 bytes at a time.
49+
50+
forward32setup:
51+
DCBTST (R3) // cache hint: touch the destination block for store
52+
DCBT (R4) // cache hint: touch the source block for load
53+
MOVD R8, CTR // CTR = number of 32-byte chunks (R8 was set by the SRADCC $2,R6,R8 above)
54+
55+
forward32:
56+
MOVD 0(R4), R8 // load 4 double words
57+
MOVD 8(R4), R9
58+
MOVD 16(R4), R14
59+
MOVD 24(R4), R15
60+
ADD $32,R4
61+
MOVD R8, 0(R3) // store those 4
62+
MOVD R9, 8(R3)
63+
MOVD R14,16(R3)
64+
MOVD R15,24(R3)
65+
ADD $32,R3 // bump up for next set
66+
BC 16, 0, forward32 // continue
67+
RLDCLCC $61,R5,$3,R6 // remaining doublewords
68+
BEQ noforwardlarge
69+
MOVD R6,CTR // set up the CTR
70+
BR forward8
4471

4572
noforwardlarge:
46-
BNE forwardtail // Tests the bit set by ANDCC above
47-
RET
73+
CMP R7,$0 // any remaining bytes
74+
BC 4, 1, LR
4875

4976
forwardtail:
50-
SUB $1, R3
51-
SUB $1, R4
52-
MOVD R7, CTR
77+
MOVD R7, CTR // move tail bytes
5378

5479
forwardtailloop:
55-
MOVBZU 1(R4), R8
56-
MOVBZU R8, 1(R3)
80+
MOVBZ 0(R4), R8 // move single bytes
81+
ADD $1,R4
82+
MOVBZ R8, 0(R3)
83+
ADD $1,R3
5784
BC 16, 0, forwardtailloop
5885
RET
5986

6087
backward:
61-
// Copying backwards proceeds by copying R7 bytes then copying R6 words.
88+
// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
6289
// R3 and R4 are advanced to the end of the destination/source buffers
6390
// respectively and moved back as we copy.
6491

65-
ADD R5, R4, R4
66-
ADD R3, R5, R3
92+
ADD R5, R4, R4 // end of source
93+
ADD R3, R5, R3 // end of dest
6794

68-
BEQ nobackwardtail
95+
BEQ nobackwardtail // earlier condition
6996

70-
MOVD R7, CTR
97+
MOVD R7, CTR // bytes to move
7198

7299
backwardtailloop:
73-
MOVBZU -1(R4), R8
74-
MOVBZU R8, -1(R3)
100+
MOVBZ -1(R4), R8 // point to last byte
101+
SUB $1,R4
102+
MOVBZ R8, -1(R3)
103+
SUB $1,R3
75104
BC 16, 0, backwardtailloop
76105

77106
nobackwardtail:
78-
BC 4, 6, backwardlarge // "BNE CR1"
79-
RET
107+
CMP R6,$0
108+
BC 4, 5, LR
80109

81110
backwardlarge:
82111
MOVD R6, CTR
83112

84113
backwardlargeloop:
85-
MOVDU -8(R4), R8
86-
MOVDU R8, -8(R3)
87-
BC 16, 0, backwardlargeloop // "BDNZ"
114+
MOVD -8(R4), R8
115+
SUB $8,R4
116+
MOVD R8, -8(R3)
117+
SUB $8,R3
118+
BC 16, 0, backwardlargeloop //
88119
RET

0 commit comments

Comments
 (0)