@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n

-	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
-	BLT	CR6, small	// Smaller than 8
-	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
+	BLE	CR6, small	// <= 8
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes

-	// Case for n ≥ 32 bytes
-preloop32:
-	SRD	$5, R6, R7	// Setup loop counter
+	// Case for n ≥ 64 bytes
+preloop64:
+	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	MOVD	$16, R10
-	ANDCC	$31, R6, R9	// Check for tailing bytes for later
-loop32:
-	LXVD2X	(R4)(R8), VS32	// VS32 = a[i,...,i+15]
-	LXVD2X	(R4)(R10), VS34
-	LXVD2X	(R5)(R8), VS33	// VS33 = b[i,...,i+15]
-	LXVD2X	(R5)(R10), VS35
-	XXLXOR	VS32, VS33, VS32 // VS34 = a[] ^ b[]
-	XXLXOR	VS34, VS35, VS34
-	STXVD2X	VS32, (R3)(R8)	// Store to dst
-	STXVD2X	VS34, (R3)(R10)
-	ADD	$32, R8		// Update index
-	ADD	$32, R10
-	BC	16, 0, loop32	// bdnz loop16
-
-	BEQ	CR0, done
-
-	MOVD	R9, R6
-	CMP	R6, $8
-	BLT	small
+	MOVD	$32, R14
+	MOVD	$48, R15
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
+	PCALIGN	$16
+	// Case for >= 64 bytes
+	// Process 64 bytes per iteration
+	// Load 4 vectors of a and b
+	// XOR the corresponding vectors
+	// from a and b and store the result
+loop64:
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R10), VS34
+	LXVD2X	(R4)(R14), VS36
+	LXVD2X	(R4)(R15), VS38
+	LXVD2X	(R5)(R8), VS33
+	LXVD2X	(R5)(R10), VS35
+	LXVD2X	(R5)(R14), VS37
+	LXVD2X	(R5)(R15), VS39
+	XXLXOR	VS32, VS33, VS32
+	XXLXOR	VS34, VS35, VS34
+	XXLXOR	VS36, VS37, VS36
+	XXLXOR	VS38, VS39, VS38
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS34, (R3)(R10)
+	STXVD2X	VS36, (R3)(R14)
+	STXVD2X	VS38, (R3)(R15)
+	ADD	$64, R8
+	ADD	$64, R10
+	ADD	$64, R14
+	ADD	$64, R15
+	BDNZ	loop64
+	BC	12, 2, LR	// BEQLR
+	MOVD	R9, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 64 bytes
+	// Process 32 bytes if available
+xor32:
+	CMP	R6, $32
+	BLT	xor16
+	ADD	$16, R8, R9
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R9), VS33
+	LXVD2X	(R5)(R8), VS34
+	LXVD2X	(R5)(R9), VS35
+	XXLXOR	VS32, VS34, VS32
+	XXLXOR	VS33, VS35, VS33
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS33, (R3)(R9)
+	ADD	$32, R8
+	ADD	$-32, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 32 bytes
+	// Process 16 bytes if available
 xor16:
-	CMP	R6, $16
-	BLT	xor8
-	LXVD2X	(R4)(R8), VS32
-	LXVD2X	(R5)(R8), VS33
-	XXLXOR	VS32, VS33, VS32
-	STXVD2X	VS32, (R3)(R8)
-	ADD	$16, R8
-	ADD	$-16, R6
-	CMP	R6, $8
-	BLT	small
+	CMP	R6, $16
+	BLT	xor8
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R5)(R8), VS33
+	XXLXOR	VS32, VS33, VS32
+	STXVD2X	VS32, (R3)(R8)
+	ADD	$16, R8
+	ADD	$-16, R6
+small:
+	CMP	R6, R0
+	BC	12, 2, LR	// BEQLR
 xor8:
-	// Case for 8 ≤ n < 16 bytes
-	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
-	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
-	XOR	R14, R15, R16	// R16 = a[] ^ b[]
-	SUB	$8, R6		// n = n - 8
-	MOVD	R16, (R3)(R8)	// Store to dst
-	ADD	$8, R8
-
-	// Check if we're finished
-	CMP	R6, R0
-	BGT	small
+#ifdef GOPPC64_power10
+	SLD	$56, R6, R17
+	ADD	R4, R8, R18
+	ADD	R5, R8, R19
+	ADD	R3, R8, R20
+	LXVL	R18, R17, V0
+	LXVL	R19, R17, V1
+	VXOR	V0, V1, V1
+	STXVL	V1, R20, R17
 	RET
-
-	// Case for n < 8 bytes and tailing bytes from the
-	// previous cases.
-small:
+#else
+	CMP	R6, $8
+	BLT	xor4
+	// Case for 8 ≤ n < 16 bytes
+	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
+	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
+	XOR	R14, R15, R16	// R16 = a[] ^ b[]
+	SUB	$8, R6		// n = n - 8
+	MOVD	R16, (R3)(R8)	// Store to dst
+	ADD	$8, R8
+xor4:
+	CMP	R6, $4
+	BLT	xor2
+	MOVWZ	(R4)(R8), R14
+	MOVWZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVW	R16, (R3)(R8)
+	ADD	$4, R8
+	ADD	$-4, R6
+xor2:
+	CMP	R6, $2
+	BLT	xor1
+	MOVHZ	(R4)(R8), R14
+	MOVHZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVH	R16, (R3)(R8)
+	ADD	$2, R8
+	ADD	$-2, R6
+xor1:
 	CMP	R6, R0
-	BEQ	done
-	MOVD	R6, CTR		// Setup loop counter
-
-loop:
+	BC	12, 2, LR	// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
-	ADD	$1, R8
-	BC	16, 0, loop	// bdnz loop
-
+#endif
 done:
 	RET
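For orientation, the new code falls through a fixed cascade of block sizes: a 64-byte vector main loop, then at most one 32-, 16-, 8-, 4-, 2- and 1-byte step for the tail (or, when built with GOPPC64_power10, a single length-controlled LXVL/STXVL pair that finishes the whole tail at once). The Go sketch below is only my reading of that control flow, not code from the commit; the name xorBytesSketch and the scalar inner loops are illustrative stand-ins for the VSX instructions.

// Illustrative sketch of the block-size cascade in the new assembly
// (assumption: plain scalar XOR stands in for the vector loads/stores).
package main

import "fmt"

func xorBytesSketch(dst, a, b []byte) {
	n := len(dst)
	i := 0
	// loop64: main loop, 64 bytes per iteration.
	for ; n >= 64; n, i = n-64, i+64 {
		for j := 0; j < 64; j++ {
			dst[i+j] = a[i+j] ^ b[i+j]
		}
	}
	// xor32, xor16, xor8, xor4, xor2, xor1: after the main loop n < 64,
	// so each smaller block size is needed at most once.
	for _, step := range []int{32, 16, 8, 4, 2, 1} {
		if n >= step {
			for j := 0; j < step; j++ {
				dst[i+j] = a[i+j] ^ b[i+j]
			}
			i += step
			n -= step
		}
	}
}

func main() {
	a := []byte("an example input that is longer than sixty-four bytes, so loop64 runs once")
	b := make([]byte, len(a)) // all zeros
	dst := make([]byte, len(a))
	xorBytesSketch(dst, a, b)
	fmt.Println(string(dst) == string(a)) // true: x ^ 0 == x
}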