diff --git a/src/crypto/cipher/xor_arm.go b/src/crypto/cipher/xor_arm.go
new file mode 100644
index 00000000000000..b24f178b0ccb3b
--- /dev/null
+++ b/src/crypto/cipher/xor_arm.go
@@ -0,0 +1,58 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cipher
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+
+var hasNEON = cpu.HWCap&(1<<12) != 0
+
+func isAligned(a *byte) bool {
+	return uintptr(unsafe.Pointer(a))%uintptr(wordSize) == 0
+}
+
+// xorBytes xors the bytes in a and b. The destination should have enough
+// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
+func xorBytes(dst, a, b []byte) int {
+	n := len(a)
+	if len(b) < n {
+		n = len(b)
+	}
+	if n == 0 {
+		return 0
+	}
+	// make sure dst has enough space
+	_ = dst[n-1]
+
+	if hasNEON {
+		xorBytesNEON32(&dst[0], &a[0], &b[0], n)
+	} else if isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0]) {
+		xorBytesARM32(&dst[0], &a[0], &b[0], n)
+	} else {
+		safeXORBytes(dst, a, b, n)
+	}
+	return n
+}
+
+// n needs to be smaller or equal than the length of a and b.
+func safeXORBytes(dst, a, b []byte, n int) {
+	for i := 0; i < n; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+}
+
+func xorWords(dst, a, b []byte) {
+	xorBytes(dst, a, b)
+}
+
+//go:noescape
+func xorBytesARM32(dst, a, b *byte, n int)
+
+//go:noescape
+func xorBytesNEON32(dst, a, b *byte, n int)
diff --git a/src/crypto/cipher/xor_arm.s b/src/crypto/cipher/xor_arm.s
new file mode 100644
index 00000000000000..7b471f12b8154a
--- /dev/null
+++ b/src/crypto/cipher/xor_arm.s
@@ -0,0 +1,114 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func xorBytesARM32(dst, a, b *byte, n int)
+TEXT ·xorBytesARM32(SB), NOSPLIT|NOFRAME, $0
+	MOVW	dst+0(FP), R0
+	MOVW	a+4(FP), R1
+	MOVW	b+8(FP), R2
+	MOVW	n+12(FP), R3
+	CMP	$4, R3
+	BLT	less_than4
+
+loop_4:
+	MOVW.P	4(R1), R4
+	MOVW.P	4(R2), R5
+	EOR	R4, R5, R5
+	MOVW.P	R5, 4(R0)
+
+	SUB	$4, R3
+	CMP	$4, R3
+	BGE	loop_4
+
+less_than4:
+	CMP	$2, R3
+	BLT	less_than2
+	MOVH.P	2(R1), R4
+	MOVH.P	2(R2), R5
+	EOR	R4, R5, R5
+	MOVH.P	R5, 2(R0)
+
+	SUB	$2, R3
+
+less_than2:
+	CMP	$0, R3
+	BEQ	end
+	MOVB	(R1), R4
+	MOVB	(R2), R5
+	EOR	R4, R5, R5
+	MOVB	R5, (R0)
+end:
+	RET
+
+// func xorBytesNEON32(dst, a, b *byte, n int)
+TEXT ·xorBytesNEON32(SB), NOSPLIT|NOFRAME, $0
+	MOVW	dst+0(FP), R0
+	MOVW	a+4(FP), R1
+	MOVW	b+8(FP), R2
+	MOVW	n+12(FP), R3
+	CMP	$32, R3
+	BLT	less_than32
+
+loop_32:
+	WORD	$0xF421020D	// vld1.u8 {q0, q1}, [r1]!
+	WORD	$0xF422420D	// vld1.u8 {q2, q3}, [r2]!
+	WORD	$0xF3004154	// veor q2, q0, q2
+	WORD	$0xF3026156	// veor q3, q1, q3
+	WORD	$0xF400420D	// vst1.u8 {q2, q3}, [r0]!
+
+	SUB	$32, R3
+	CMP	$32, R3
+	BGE	loop_32
+
+less_than32:
+	CMP	$16, R3
+	BLT	less_than16
+	WORD	$0xF4210A0D	// vld1.u8 q0, [r1]!
+	WORD	$0xF4222A0D	// vld1.u8 q1, [r2]!
+	WORD	$0xF3002152	// veor q1, q0, q1
+	WORD	$0xF4002A0D	// vst1.u8 {q1}, [r0]!
+
+	SUB	$16, R3
+
+less_than16:
+	CMP	$8, R3
+	BLT	less_than8
+	WORD	$0xF421070D	// vld1.u8 d0, [r1]!
+	WORD	$0xF422170D	// vld1.u8 d1, [r2]!
+	WORD	$0xF3001111	// veor d1, d0, d1
+	WORD	$0xF400170D	// vst1.u8 {d1}, [r0]!
+
+	SUB	$8, R3
+
+less_than8:
+	CMP	$4, R3
+	BLT	less_than4
+	MOVW.P	4(R1), R4
+	MOVW.P	4(R2), R5
+	EOR	R4, R5, R5
+	MOVW.P	R5, 4(R0)
+
+	SUB	$4, R3
+
+less_than4:
+	CMP	$2, R3
+	BLT	less_than2
+	MOVH.P	2(R1), R4
+	MOVH.P	2(R2), R5
+	EOR	R4, R5, R5
+	MOVH.P	R5, 2(R0)
+
+	SUB	$2, R3
+
+less_than2:
+	CMP	$0, R3
+	BEQ	end
+	MOVB	(R1), R4
+	MOVB	(R2), R5
+	EOR	R4, R5, R5
+	MOVB	R5, (R0)
+end:
+	RET
diff --git a/src/crypto/cipher/xor_generic.go b/src/crypto/cipher/xor_generic.go
index 43517a8e20329b..6c26a63a5bc2b3 100644
--- a/src/crypto/cipher/xor_generic.go
+++ b/src/crypto/cipher/xor_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !ppc64 && !ppc64le && !arm64
+//go:build !amd64 && !ppc64 && !ppc64le && !arm64 && !arm
 
 package cipher
 
@@ -23,14 +23,9 @@ func xorBytes(dst, a, b []byte) int {
 	}
 
 	switch {
-	case supportsUnaligned:
+	case supportsUnaligned || (isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0])):
 		fastXORBytes(dst, a, b, n)
 	default:
-		// TODO(hanwen): if (dst, a, b) have common alignment
-		// we could still try fastXORBytes. It is not clear
-		// how often this happens, and it's only worth it if
-		// the block encryption itself is hardware
-		// accelerated.
 		safeXORBytes(dst, a, b, n)
 	}
 	return n
@@ -39,8 +34,12 @@ func xorBytes(dst, a, b []byte) int {
 const wordSize = int(unsafe.Sizeof(uintptr(0)))
 const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
 
+func isAligned(a *byte) bool {
+	return uintptr(unsafe.Pointer(a))%uintptr(wordSize) == 0
+}
+
 // fastXORBytes xors in bulk. It only works on architectures that
-// support unaligned read/writes.
+// support unaligned read/writes, or if dst, a, b are all aligned.
 // n needs to be smaller or equal than the length of a and b.
 func fastXORBytes(dst, a, b []byte, n int) {
 	// Assert dst has enough space
@@ -83,7 +82,7 @@ func fastXORWords(dst, a, b []byte) {
 // fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
 // The slice arguments a and b are assumed to be of equal length.
 func xorWords(dst, a, b []byte) {
-	if supportsUnaligned {
+	if supportsUnaligned || (isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0])) {
 		fastXORWords(dst, a, b)
 	} else {
 		safeXORBytes(dst, a, b, len(b))
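A note for readers following the assembly without an ARM machine at hand: the sketch below mirrors the tiered shape of xorBytesNEON32 in portable Go, consuming the largest chunks first and falling through to ever smaller tail sizes. It is an illustration only, not part of the change; the function name is invented, and the real 32- and 16-byte NEON steps are collapsed into a single 8-byte word loop with encoding/binary standing in for the vector loads.

package main

import (
	"encoding/binary"
	"fmt"
)

// xorBytesTiered mirrors the structure of xorBytesNEON32: take the
// largest chunks first, then fall through to smaller tail sizes. The
// real code uses 32- and 16-byte NEON EORs; this sketch collapses
// every wide step into one 8-byte word so it stays portable.
func xorBytesTiered(dst, a, b []byte, n int) {
	i := 0
	for ; n-i >= 8; i += 8 { // stands in for the 32/16/8-byte vector steps
		x := binary.LittleEndian.Uint64(a[i:])
		y := binary.LittleEndian.Uint64(b[i:])
		binary.LittleEndian.PutUint64(dst[i:], x^y)
	}
	for ; i < n; i++ { // the 4/2/1-byte tails, collapsed to bytes
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := []byte("hello, world")
	b := []byte("HELLO, WORLD")
	dst := make([]byte, len(a))
	xorBytesTiered(dst, a, b, len(a))
	fmt.Printf("%x\n", dst) // each letter XORs down to its case bit, 0x20
}

One design point the sketch cannot show: vld1.u8/vst1.u8 tolerate unaligned addresses, which is why the Go side calls xorBytesNEON32 without any alignment check and reserves the word-aligned xorBytesARM32 path for CPUs without NEON.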
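The hasNEON probe tests bit 12 of cpu.HWCap, which internal/cpu fills in from the kernel's ELF auxiliary vector; on linux/arm that bit is HWCAP_NEON. As a hedged, standalone illustration of where the flag comes from, this sketch reads the same bit directly out of /proc/self/auxv (the constant names are mine, and the 4-byte entry layout assumes 32-bit ARM):

package main

import (
	"encoding/binary"
	"fmt"
	"os"
)

const (
	atHWCAP   = 16      // AT_HWCAP tag in the ELF auxiliary vector
	hwcapNEON = 1 << 12 // HWCAP_NEON on linux/arm, the bit tested above
)

func main() {
	buf, err := os.ReadFile("/proc/self/auxv")
	if err != nil {
		fmt.Println("auxv unavailable:", err)
		return
	}
	// On 32-bit ARM each auxv entry is a (tag, value) pair of 4-byte
	// little-endian words; other GOARCHes use a different entry size.
	for i := 0; i+8 <= len(buf); i += 8 {
		tag := binary.LittleEndian.Uint32(buf[i:])
		val := binary.LittleEndian.Uint32(buf[i+4:])
		if tag == atHWCAP {
			fmt.Println("NEON supported:", val&hwcapNEON != 0)
		}
	}
}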