@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
11
11
MOVD to + 0 (FP) , R3
12
12
MOVD from + 8 (FP) , R4
13
13
MOVD n + 16 (FP) , R5
14
- CMP R5 , $ 0
15
- BNE check
16
- RET
17
14
15
+ // Determine if there are doublewords to
16
+ // copy so a more efficient move can be done
18
17
check:
19
- ANDCC $ 7 , R5 , R7 // R7 is the number of bytes to copy and CR0 [ EQ ] is set if there are none.
20
- SRAD $ 3 , R5 , R6 // R6 is the number of words to copy
21
- CMP R6 , $ 0 , CR1 // CR1 [ EQ ] is set if there are no words to copy.
22
-
23
- CMP R3 , R4 , CR2
24
- BC 12 , 9 , backward // I think you should be able to write this as "BGT CR2, backward"
18
+ ANDCC $ 7 , R5 , R7 // R7: bytes to copy
19
+ SRAD $ 3 , R5 , R6 // R6: double words to copy
20
+ CMP R6 , $ 0 , CR1 // CR1 [ EQ ] set if no double words to copy
25
21
26
- // Copying forward proceeds by copying R6 words then copying R7 bytes.
27
- // R3 and R4 are advanced as we copy. Because PPC64 lacks post - increment
28
- // load/store , R3 and R4 point before the bytes that are to be copied.
22
+ // Determine overlap by subtracting dest - src and comparing against the
23
+ // length. This catches the cases where src and dest are in different types
24
+ // of storage such as stack and static to avoid doing backward move when not
25
+ // necessary.
29
26
30
- BC 12 , 6 , noforwardlarge // "BEQ CR1, noforwardlarge"
31
-
32
- MOVD R6 , CTR
27
+ SUB R4 , R3 , R8 // dest - src
28
+ CMPU R8 , R5 , CR2 // < len?
29
+ BC 12 , 8 , backward // BLT CR2 backward
33
30
34
- SUB $ 8 , R3
35
- SUB $ 8 , R4
31
+ // Copying forward if no overlap.
36
32
37
- forwardlargeloop:
38
- MOVDU 8 (R4) , R8
39
- MOVDU R8 , 8 (R3)
40
- BC 16 , 0 , forwardlargeloop // "BDNZ"
41
-
42
- ADD $ 8 , R3
43
- ADD $ 8 , R4
33
+ BC 12 , 6 , noforwardlarge // "BEQ CR1, noforwardlarge"
34
+ MOVD R6 , CTR // R6 = number of double words
35
+ SRADCC $ 2 , R6 , R8 // 32 byte chunks?
36
+ BNE forward32setup //
37
+
38
+ // Move double words
39
+
40
+ forward8:
41
+ MOVD 0 (R4) , R8 // double word
42
+ ADD $ 8 , R4
43
+ MOVD R8 , 0 (R3) //
44
+ ADD $ 8 , R3
45
+ BC 16 , 0 , forward8
46
+ BR noforwardlarge // handle remainder
47
+
48
+ // Prepare for moves of 32 bytes at a time.
49
+
50
+ forward32setup:
51
+ DCBTST (R3) // prepare data cache
52
+ DCBT (R4)
53
+ MOVD R8 , CTR // CTR = number of 32 byte chunks (R6 >> 2)
54
+
55
+ forward32:
56
+ MOVD 0 (R4) , R8 // load 4 double words
57
+ MOVD 8 (R4) , R9
58
+ MOVD 16 (R4) , R14
59
+ MOVD 24 (R4) , R15
60
+ ADD $ 32 , R4
61
+ MOVD R8 , 0 (R3) // store those 4
62
+ MOVD R9 , 8 (R3)
63
+ MOVD R14 , 16 (R3)
64
+ MOVD R15 , 24 (R3)
65
+ ADD $ 32 , R3 // bump up for next set
66
+ BC 16 , 0 , forward32 // continue
67
+ RLDCLCC $ 61 , R5 , $ 3 , R6 // remaining doublewords
68
+ BEQ noforwardlarge
69
+ MOVD R6 , CTR // set up the CTR
70
+ BR forward8
44
71
45
72
noforwardlarge:
46
- BNE forwardtail // Tests the bit set by ANDCC above
47
- RET
73
+ CMP R7 , $ 0 // any remaining bytes
74
+ BC 4 , 1 , LR
48
75
49
76
forwardtail:
50
- SUB $ 1 , R3
51
- SUB $ 1 , R4
52
- MOVD R7 , CTR
77
+ MOVD R7 , CTR // move tail bytes
53
78
54
79
forwardtailloop:
55
- MOVBZU 1 (R4) , R8
56
- MOVBZU R8 , 1 (R3)
80
+ MOVBZ 0 (R4) , R8 // move single bytes
81
+ ADD $ 1 , R4
82
+ MOVBZ R8 , 0 (R3)
83
+ ADD $ 1 , R3
57
84
BC 16 , 0 , forwardtailloop
58
85
RET
59
86
60
87
backward:
61
- // Copying backwards proceeds by copying R7 bytes then copying R6 words.
88
+ // Copying backwards proceeds by copying R7 bytes then copying R6 double words.
62
89
// R3 and R4 are advanced to the end of the destination/source buffers
63
90
// respectively and moved back as we copy.
64
91
65
- ADD R5 , R4 , R4
66
- ADD R3 , R5 , R3
92
+ ADD R5 , R4 , R4 // end of source
93
+ ADD R3 , R5 , R3 // end of dest
67
94
68
- BEQ nobackwardtail
95
+ BEQ nobackwardtail // earlier condition
69
96
70
- MOVD R7 , CTR
97
+ MOVD R7 , CTR // bytes to move
71
98
72
99
backwardtailloop:
73
- MOVBZU - 1 (R4) , R8
74
- MOVBZU R8 , - 1 (R3)
100
+ MOVBZ - 1 (R4) , R8 // point to last byte
101
+ SUB $ 1 , R4
102
+ MOVBZ R8 , - 1 (R3)
103
+ SUB $ 1 , R3
75
104
BC 16 , 0 , backwardtailloop
76
105
77
106
nobackwardtail:
78
- BC 4 , 6 , backwardlarge // "BNE CR1"
79
- RET
107
+ CMP R6 , $ 0
108
+ BC 4 , 5 , LR
80
109
81
110
backwardlarge:
82
111
MOVD R6 , CTR
83
112
84
113
backwardlargeloop:
85
- MOVDU - 8 (R4) , R8
86
- MOVDU R8 , - 8 (R3)
87
- BC 16 , 0 , backwardlargeloop // "BDNZ"
114
+ MOVD - 8 (R4) , R8
115
+ SUB $ 8 , R4
116
+ MOVD R8 , - 8 (R3)
117
+ SUB $ 8 , R3
118
+ BC 16 , 0 , backwardlargeloop //
88
119
RET
0 commit comments