diff --git a/src/crypto/cipher/xor_arm.go b/src/crypto/cipher/xor_arm.go
new file mode 100644
index 00000000000000..b24f178b0ccb3b
--- /dev/null
+++ b/src/crypto/cipher/xor_arm.go
@@ -0,0 +1,58 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cipher
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+
+var hasNEON = cpu.HWCap&(1<<12) != 0
+
+func isAligned(a *byte) bool {
+	return uintptr(unsafe.Pointer(a))%uintptr(wordSize) == 0
+}
+
+// xorBytes xors the bytes in a and b. The destination should have enough
+// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
+func xorBytes(dst, a, b []byte) int {
+	n := len(a)
+	if len(b) < n {
+		n = len(b)
+	}
+	if n == 0 {
+		return 0
+	}
+	// make sure dst has enough space
+	_ = dst[n-1]
+
+	if hasNEON {
+		xorBytesNEON32(&dst[0], &a[0], &b[0], n)
+	} else if isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0]) {
+		xorBytesARM32(&dst[0], &a[0], &b[0], n)
+	} else {
+		safeXORBytes(dst, a, b, n)
+	}
+	return n
+}
+
+// n needs to be smaller or equal than the length of a and b.
+func safeXORBytes(dst, a, b []byte, n int) {
+	for i := 0; i < n; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+}
+
+func xorWords(dst, a, b []byte) {
+	xorBytes(dst, a, b)
+}
+
+//go:noescape
+func xorBytesARM32(dst, a, b *byte, n int)
+
+//go:noescape
+func xorBytesNEON32(dst, a, b *byte, n int)
diff --git a/src/crypto/cipher/xor_arm.s b/src/crypto/cipher/xor_arm.s
new file mode 100644
index 00000000000000..7b471f12b8154a
--- /dev/null
+++ b/src/crypto/cipher/xor_arm.s
@@ -0,0 +1,114 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func xorBytesARM32(dst, a, b *byte, n int)
+TEXT ·xorBytesARM32(SB), NOSPLIT|NOFRAME, $0
+	MOVW	dst+0(FP), R0
+	MOVW	a+4(FP), R1
+	MOVW	b+8(FP), R2
+	MOVW	n+12(FP), R3
+	CMP	$4, R3
+	BLT	less_than4
+
+loop_4:
+	MOVW.P	4(R1), R4
+	MOVW.P	4(R2), R5
+	EOR	R4, R5, R5
+	MOVW.P	R5, 4(R0)
+
+	SUB	$4, R3
+	CMP	$4, R3
+	BGE	loop_4
+
+less_than4:
+	CMP	$2, R3
+	BLT	less_than2
+	MOVH.P	2(R1), R4
+	MOVH.P	2(R2), R5
+	EOR	R4, R5, R5
+	MOVH.P	R5, 2(R0)
+
+	SUB	$2, R3
+
+less_than2:
+	CMP	$0, R3
+	BEQ	end
+	MOVB	(R1), R4
+	MOVB	(R2), R5
+	EOR	R4, R5, R5
+	MOVB	R5, (R0)
+end:
+	RET
+
+// func xorBytesNEON32(dst, a, b *byte, n int)
+TEXT ·xorBytesNEON32(SB), NOSPLIT|NOFRAME, $0
+	MOVW	dst+0(FP), R0
+	MOVW	a+4(FP), R1
+	MOVW	b+8(FP), R2
+	MOVW	n+12(FP), R3
+	CMP	$32, R3
+	BLT	less_than32
+
+loop_32:
+	WORD	$0xF421020D	// vld1.u8 {q0, q1}, [r1]!
+	WORD	$0xF422420D	// vld1.u8 {q2, q3}, [r2]!
+	WORD	$0xF3004154	// veor q2, q0, q2
+	WORD	$0xF3026156	// veor q3, q1, q3
+	WORD	$0xF400420D	// vst1.u8 {q2, q3}, [r0]!
+
+	SUB	$32, R3
+	CMP	$32, R3
+	BGE	loop_32
+
+less_than32:
+	CMP	$16, R3
+	BLT	less_than16
+	WORD	$0xF4210A0D	// vld1.u8 q0, [r1]!
+	WORD	$0xF4222A0D	// vld1.u8 q1, [r2]!
+	WORD	$0xF3002152	// veor q1, q0, q1
+	WORD	$0xF4002A0D	// vst1.u8 {q1}, [r0]!
+
+	SUB	$16, R3
+
+less_than16:
+	CMP	$8, R3
+	BLT	less_than8
+	WORD	$0xF421070D	// vld1.u8 d0, [r1]!
+	WORD	$0xF422170D	// vld1.u8 d1, [r2]!
+	WORD	$0xF3001111	// veor d1, d0, d1
+	WORD	$0xF400170D	// vst1.u8 {d1}, [r0]!
+
+	SUB	$8, R3
+
+less_than8:
+	CMP	$4, R3
+	BLT	less_than4
+	MOVW.P	4(R1), R4
+	MOVW.P	4(R2), R5
+	EOR	R4, R5, R5
+	MOVW.P	R5, 4(R0)
+
+	SUB	$4, R3
+
+less_than4:
+	CMP	$2, R3
+	BLT	less_than2
+	MOVH.P	2(R1), R4
+	MOVH.P	2(R2), R5
+	EOR	R4, R5, R5
+	MOVH.P	R5, 2(R0)
+
+	SUB	$2, R3
+
+less_than2:
+	CMP	$0, R3
+	BEQ	end
+	MOVB	(R1), R4
+	MOVB	(R2), R5
+	EOR	R4, R5, R5
+	MOVB	R5, (R0)
+end:
+	RET
diff --git a/src/crypto/cipher/xor_generic.go b/src/crypto/cipher/xor_generic.go
index 43517a8e20329b..6c26a63a5bc2b3 100644
--- a/src/crypto/cipher/xor_generic.go
+++ b/src/crypto/cipher/xor_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !ppc64 && !ppc64le && !arm64
+//go:build !amd64 && !ppc64 && !ppc64le && !arm64 && !arm
 
 package cipher
 
@@ -23,14 +23,9 @@ func xorBytes(dst, a, b []byte) int {
 	}
 
 	switch {
-	case supportsUnaligned:
+	case supportsUnaligned || (isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0])):
 		fastXORBytes(dst, a, b, n)
 	default:
-		// TODO(hanwen): if (dst, a, b) have common alignment
-		// we could still try fastXORBytes. It is not clear
-		// how often this happens, and it's only worth it if
-		// the block encryption itself is hardware
-		// accelerated.
 		safeXORBytes(dst, a, b, n)
 	}
 	return n
@@ -39,8 +34,12 @@ func xorBytes(dst, a, b []byte) int {
 const wordSize = int(unsafe.Sizeof(uintptr(0)))
 const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
 
+func isAligned(a *byte) bool {
+	return uintptr(unsafe.Pointer(a))%uintptr(wordSize) == 0
+}
+
 // fastXORBytes xors in bulk. It only works on architectures that
-// support unaligned read/writes.
+// support unaligned read/writes, or if dst, a, b are all aligned.
 // n needs to be smaller or equal than the length of a and b.
 func fastXORBytes(dst, a, b []byte, n int) {
 	// Assert dst has enough space
@@ -83,7 +82,7 @@ func fastXORWords(dst, a, b []byte) {
 // fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
 // The slice arguments a and b are assumed to be of equal length.
 func xorWords(dst, a, b []byte) {
-	if supportsUnaligned {
+	if supportsUnaligned || (isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0])) {
 		fastXORWords(dst, a, b)
 	} else {
 		safeXORBytes(dst, a, b, len(b))
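A note for readers following the assembly without an ARM machine at hand: the sketch below mirrors the tiered shape of xorBytesNEON32 in portable Go, consuming the largest chunks first and falling through to ever smaller tail sizes. It is an illustration only, not part of the change; the function name is invented, and the real 32- and 16-byte NEON steps are collapsed into a single 8-byte word loop with encoding/binary standing in for the vector loads.

package main

import (
	"encoding/binary"
	"fmt"
)

// xorBytesTiered mirrors the structure of xorBytesNEON32: take the
// largest chunks first, then fall through to smaller tail sizes. The
// real code uses 32- and 16-byte NEON EORs; this sketch collapses
// every wide step into one 8-byte word so it stays portable.
func xorBytesTiered(dst, a, b []byte, n int) {
	i := 0
	for ; n-i >= 8; i += 8 { // stands in for the 32/16/8-byte vector steps
		x := binary.LittleEndian.Uint64(a[i:])
		y := binary.LittleEndian.Uint64(b[i:])
		binary.LittleEndian.PutUint64(dst[i:], x^y)
	}
	for ; i < n; i++ { // the 4/2/1-byte tails, collapsed to bytes
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := []byte("hello, world")
	b := []byte("HELLO, WORLD")
	dst := make([]byte, len(a))
	xorBytesTiered(dst, a, b, len(a))
	fmt.Printf("%x\n", dst) // each letter XORs down to its case bit, 0x20
}

One design point the sketch cannot show: vld1.u8/vst1.u8 tolerate unaligned addresses, which is why the Go side calls xorBytesNEON32 without any alignment check and reserves the word-aligned xorBytesARM32 path for CPUs without NEON.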
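The hasNEON probe tests bit 12 of cpu.HWCap, which internal/cpu fills in from the kernel's ELF auxiliary vector; on linux/arm that bit is HWCAP_NEON. As a hedged, standalone illustration of where the flag comes from, this sketch reads the same bit directly out of /proc/self/auxv (the constant names are mine, and the 4-byte entry layout assumes 32-bit ARM):

package main

import (
	"encoding/binary"
	"fmt"
	"os"
)

const (
	atHWCAP   = 16      // AT_HWCAP tag in the ELF auxiliary vector
	hwcapNEON = 1 << 12 // HWCAP_NEON on linux/arm, the bit tested above
)

func main() {
	buf, err := os.ReadFile("/proc/self/auxv")
	if err != nil {
		fmt.Println("auxv unavailable:", err)
		return
	}
	// On 32-bit ARM each auxv entry is a (tag, value) pair of 4-byte
	// little-endian words; other GOARCHes use a different entry size.
	for i := 0; i+8 <= len(buf); i += 8 {
		tag := binary.LittleEndian.Uint32(buf[i:])
		val := binary.LittleEndian.Uint32(buf[i+4:])
		if tag == atHWCAP {
			fmt.Println("NEON supported:", val&hwcapNEON != 0)
		}
	}
}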