From ac0378150bf3f1798995d6e17fbdb34fbea81e8c Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Mon, 25 Dec 2023 17:40:23 -0800 Subject: [PATCH 01/12] Add macOS .DS_Store files to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index efcf6e5..48c374b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ actual/ /docs/site/ /deps/aes-ni Manifest.toml +.DS_Store From 764d71cd0774ab4f9e8be1f1b0e0e7edf8011baf Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Mon, 25 Dec 2023 18:17:13 -0800 Subject: [PATCH 02/12] Isolate AES support for x86 --- src/Random123.jl | 22 ++++++++++------- src/{ => x86}/aesni.jl | 0 src/{ => x86}/aesni_common.jl | 0 src/{ => x86}/ars.jl | 0 test/runtests.jl | 45 +++++++++++++++++++++-------------- test/{ => x86}/aesni.jl | 4 ---- test/{ => x86}/ars.jl | 4 ---- 7 files changed, 41 insertions(+), 34 deletions(-) rename src/{ => x86}/aesni.jl (100%) rename src/{ => x86}/aesni_common.jl (100%) rename src/{ => x86}/ars.jl (100%) rename test/{ => x86}/aesni.jl (96%) rename test/{ => x86}/ars.jl (97%) diff --git a/src/Random123.jl b/src/Random123.jl index 98f3aad..f1053a1 100644 --- a/src/Random123.jl +++ b/src/Random123.jl @@ -29,10 +29,8 @@ include("philox.jl") export R123_USE_AESNI -"True when AES-NI has been enabled." -const R123_USE_AESNI = @static if Sys.isapple() && Sys.ARCH ≡ :aarch64 - false -else +"True when x86 AES-NI instructiona have been detected." +const R123_USE_X86_AES_NI::Bool = @static if Sys.ARCH ≡ :x86_64 || Sys.ARCH ≡ :i686 try cmd = Base.julia_cmd() push!( @@ -47,16 +45,24 @@ else catch e false end +else + false end +"True when AES-acceleration instructions have been detected." +const R123_USE_AESNI::Bool = R123_USE_X86_AES_NI + @static if R123_USE_AESNI export AESNI1x, AESNI4x, aesni export ARS1x, ARS4x, ars - include("./aesni_common.jl") - include("./aesni.jl") - include("./ars.jl") else - @warn "AES-NI instruction set is not enabled, so the related RNGs (AESNI and ARS) are not available." + @warn "AES-acceleration instructions have not been detected, so the related RNGs (AESNI and ARS) are not available." +end + +@static if R123_USE_X86_AES_NI + include("./x86/aesni_common.jl") + include("./x86/aesni.jl") + include("./x86/ars.jl") end end diff --git a/src/aesni.jl b/src/x86/aesni.jl similarity index 100% rename from src/aesni.jl rename to src/x86/aesni.jl diff --git a/src/aesni_common.jl b/src/x86/aesni_common.jl similarity index 100% rename from src/aesni_common.jl rename to src/x86/aesni_common.jl diff --git a/src/ars.jl b/src/x86/ars.jl similarity index 100% rename from src/ars.jl rename to src/x86/ars.jl diff --git a/test/runtests.jl b/test/runtests.jl index 17000d3..12b7a14 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,7 +12,7 @@ using Printf: @printf seed1 = 1 seed2 = (1,2) seed4 = (1,2,3,4) - for (rng, alg, options) in [ + alg_choices = [ (Threefry2x(UInt32, seed2) , threefry, (Val(20),)) , (Threefry2x(UInt64, seed2) , threefry, (Val(20),)) , (Threefry4x(UInt32, seed4) , threefry, (Val(20),)) , @@ -21,11 +21,16 @@ using Printf: @printf (Philox2x(UInt64 , seed1) , philox , (Val(10),)) , (Philox4x(UInt32 , seed2) , philox , (Val(10),)) , (Philox4x(UInt64 , seed2) , philox , (Val(10),)) , - (AESNI1x(seed1) , aesni , () ) , - (AESNI4x(seed4) , aesni , () ) , - (ARS1x(seed1) , ars , (Val(7),) ) , - (ARS4x(seed4) , ars , (Val(7),) ) , ] + if R123_USE_AESNI + append!(alg_choices, [ + (AESNI1x(seed1) , aesni , () ) , + (AESNI4x(seed4) , aesni , () ) , + (ARS1x(seed1) , ars , (Val(7),) ) , + (ARS4x(seed4) , ars , (Val(7),) ) , + ]) + end + for (rng, alg, options) in alg_choices key = @inferred get_key(rng) ctr = @inferred get_ctr(rng) @test isbitstype(typeof(key)) @@ -89,17 +94,19 @@ end @test x9 === y9 end - rng = ARS1x(1) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) - - rng = AESNI1x(1) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) - @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + if R123_USE_AESNI + rng = ARS1x(1) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + @test (rand(rng, UInt128),) === ars(get_key(rng), get_ctr(rng), Val(7)) + + rng = AESNI1x(1) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + @test (rand(rng, UInt128),) === aesni(get_key(rng), get_ctr(rng)) + end end @@ -164,5 +171,7 @@ redirect_stdout(stdout_) compare_dirs("expected", "actual") cd(pwd_) -include("aesni.jl") -include("ars.jl") +if Random123.R123_USE_X86_AES_NI + include("./x86/aesni.jl") + include("./x86/ars.jl") +end diff --git a/test/aesni.jl b/test/x86/aesni.jl similarity index 96% rename from test/aesni.jl rename to test/x86/aesni.jl index 7fc9ab4..79d1129 100644 --- a/test/aesni.jl +++ b/test/x86/aesni.jl @@ -1,5 +1,3 @@ -if R123_USE_AESNI - import Random: seed! using Test: @test @@ -26,5 +24,3 @@ r1 = AESNI4x(split_uint(key, UInt32)) set_counter!(r, 0) set_counter!(r1, 1) @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) - -end diff --git a/test/ars.jl b/test/x86/ars.jl similarity index 97% rename from test/ars.jl rename to test/x86/ars.jl index 648687d..1cbd9c9 100644 --- a/test/ars.jl +++ b/test/x86/ars.jl @@ -1,5 +1,3 @@ -if R123_USE_AESNI - import Random: seed! using Test: @test @@ -29,5 +27,3 @@ r1 = ARS4x(split_uint(key, UInt32)) set_counter!(r, 0) set_counter!(r1, 1) @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) - -end From c60b0e28b8889a5a6eb353a614c83871e5437f88 Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Mon, 25 Dec 2023 21:08:14 -0800 Subject: [PATCH 03/12] Introduce AES support for AArch64 --- src/Random123.jl | 29 ++++- src/aarch64/aesni.jl | 252 ++++++++++++++++++++++++++++++++++++ src/aarch64/aesni_common.jl | 149 +++++++++++++++++++++ src/aarch64/ars.jl | 163 +++++++++++++++++++++++ test/aarch64/aesni.jl | 26 ++++ test/aarch64/ars.jl | 29 +++++ test/runtests.jl | 8 +- 7 files changed, 653 insertions(+), 3 deletions(-) create mode 100644 src/aarch64/aesni.jl create mode 100644 src/aarch64/aesni_common.jl create mode 100644 src/aarch64/ars.jl create mode 100644 test/aarch64/aesni.jl create mode 100644 test/aarch64/ars.jl diff --git a/src/Random123.jl b/src/Random123.jl index f1053a1..ea67910 100644 --- a/src/Random123.jl +++ b/src/Random123.jl @@ -49,8 +49,31 @@ else false end +"True when AArch64 FEAT_AES intrinsics have been detected." +const R123_USE_AARCH64_FEAT_AES::Bool = if Sys.ARCH ≡ :aarch64 + try + cmd = Base.julia_cmd() + push!( + cmd.exec, + "-e", + "const uint8x16 = NTuple{16, VecElement{UInt8}};" * + "@assert ccall(\"llvm.aarch64.crypto.aesmc\", " * + "llvmcall, uint8x16, (uint8x16,), " * + "uint8x16((0x4a, 0x68, 0xbd, 0xe1, 0xfe, 0x16, 0x3d, " * + "0xec, 0xde, 0x06, 0x72, 0x86, 0xe3, 0x8c, 0x14, 0xd9))) ≡ " * + "uint8x16((0x70, 0xa7, 0x7b, 0xd2, 0x0c, 0x79, 0xbd, " * + "0xf1, 0x59, 0xc2, 0xad, 0x1a, 0x9f, 0x05, 0x37, 0x0f))", + ) + success(cmd) + catch e + false + end +else + false +end + "True when AES-acceleration instructions have been detected." -const R123_USE_AESNI::Bool = R123_USE_X86_AES_NI +const R123_USE_AESNI::Bool = R123_USE_X86_AES_NI || R123_USE_AARCH64_FEAT_AES @static if R123_USE_AESNI export AESNI1x, AESNI4x, aesni @@ -63,6 +86,10 @@ end include("./x86/aesni_common.jl") include("./x86/aesni.jl") include("./x86/ars.jl") +elseif R123_USE_AARCH64_FEAT_AES + include("./aarch64/aesni_common.jl") + include("./aarch64/aesni.jl") + include("./aarch64/ars.jl") end end diff --git a/src/aarch64/aesni.jl b/src/aarch64/aesni.jl new file mode 100644 index 0000000..b93a52e --- /dev/null +++ b/src/aarch64/aesni.jl @@ -0,0 +1,252 @@ +import Base: copy, copyto!, ==, llvmcall +import Random: rand, seed! +import RandomNumbers: gen_seed, union_uint, seed_type, unsafe_copyto!, unsafe_compare + + +"The key for AESNI." +mutable struct AESNIKey + key1::uint64x2 + key2::uint64x2 + key3::uint64x2 + key4::uint64x2 + key5::uint64x2 + key6::uint64x2 + key7::uint64x2 + key8::uint64x2 + key9::uint64x2 + key10::uint64x2 + key11::uint64x2 + AESNIKey() = new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) +end + +copyto!(dest::AESNIKey, src::AESNIKey) = unsafe_copyto!(dest, src, UInt128, 11) + +copy(src::AESNIKey) = copyto!(AESNIKey(), src) + +==(key1::AESNIKey, key2::AESNIKey) = unsafe_compare(key1, key2, UInt128, 11) + +""" +Assistant function for AES128. Originally compiled for x86 from the C++ source code: +```cpp +R123_STATIC_INLINE __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2) { + uint64x2 temp3; + temp2 = _mm_shuffle_epi32 (temp2 ,0xff); + temp3 = _mm_slli_si128 (temp1, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp3 = _mm_slli_si128 (temp3, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp3 = _mm_slli_si128 (temp3, 0x4); + temp1 = _mm_xor_si128 (temp1, temp3); + temp1 = _mm_xor_si128 (temp1, temp2); + return temp1; +} +``` +Then made architecture-agnostic as LLVM IR. +""" +_aes_128_assist(a::uint64x2, b::uint64x2) = llvmcall( + """%3 = bitcast <2 x i64> %1 to <4 x i32> + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> + %5 = bitcast <4 x i32> %4 to <2 x i64> + %6 = bitcast <2 x i64> %0 to <16 x i8> + %7 = shufflevector <16 x i8> , <16 x i8> %6, <16 x i32> + %8 = bitcast <16 x i8> %7 to <2 x i64> + %9 = xor <2 x i64> %8, %0 + %10 = shufflevector <16 x i8> , <16 x i8> %7, <16 x i32> + %11 = bitcast <16 x i8> %10 to <2 x i64> + %12 = xor <2 x i64> %9, %11 + %13 = shufflevector <16 x i8> , <16 x i8> %10, <16 x i32> + %14 = bitcast <16 x i8> %13 to <2 x i64> + %15 = xor <2 x i64> %12, %5 + %16 = xor <2 x i64> %15, %14 + ret <2 x i64> %16""", + uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, + a.data, b.data +) |> uint64x2 + +function _aesni_expand!(k::AESNIKey, rkey::uint64x2) + k.key1 = rkey + tmp = _aes_key_gen_assist(rkey, Val(0x1)) + rkey = _aes_128_assist(rkey, tmp) + k.key2 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x2)) + rkey = _aes_128_assist(rkey, tmp) + k.key3 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x4)) + rkey = _aes_128_assist(rkey, tmp) + k.key4 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x8)) + rkey = _aes_128_assist(rkey, tmp) + k.key5 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x10)) + rkey = _aes_128_assist(rkey, tmp) + k.key6 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x20)) + rkey = _aes_128_assist(rkey, tmp) + k.key7 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x40)) + rkey = _aes_128_assist(rkey, tmp) + k.key8 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x80)) + rkey = _aes_128_assist(rkey, tmp) + k.key9 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x1b)) + rkey = _aes_128_assist(rkey, tmp) + k.key10 = rkey + + tmp = _aes_key_gen_assist(rkey, Val(0x36)) + rkey = _aes_128_assist(rkey, tmp) + k.key11 = rkey + + k +end + +AESNIKey(key::UInt128) = _aesni_expand!(AESNIKey(), uint64x2(key)) + +""" +```julia +AESNI1x <: AbstractAESNI1x +AESNI1x([seed]) +``` + +AESNI1x is one kind of AESNI Counter-Based RNGs. It generates one `UInt128` number at a time. + +`seed` is an `Integer` which will be automatically converted to `UInt128`. + +Only available when [`R123_USE_AESNI`](@ref). +""" +mutable struct AESNI1x <: AbstractAESNI1x + x::uint64x2 + ctr::uint64x2 + key::AESNIKey +end + +function AESNI1x(seed::Integer=gen_seed(UInt128)) + r = AESNI1x(0, 0, AESNIKey()) + seed!(r, seed) + r +end + +function seed!(r::AESNI1x, seed::Integer=gen_seed(UInt128)) + r.x = zero(uint64x2) + r.ctr = zero(uint64x2) + _aesni_expand!(r.key, uint64x2(seed % UInt128)) + random123_r(r) + r +end + +seed_type(::Type{AESNI1x}) = UInt128 + +function copyto!(dest::AESNI1x, src::AESNI1x) + dest.x = src.x + dest.ctr = src.ctr + copyto!(dest.key, src.key) + dest +end + +copy(src::AESNI1x) = copyto!(AESNI1x(), src) + +==(r1::AESNI1x, r2::AESNI1x) = r1.x == r2.x && r1.key == r2.key && r1.ctr == r2.ctr + +""" +```julia +AESNI4x <: AbstractAESNI4x +AESNI4x([seed]) +``` + +AESNI4x is one kind of AESNI Counter-Based RNGs. It generates four `UInt32` numbers at a time. + +`seed` is a `Tuple` of four `Integer`s which will all be automatically converted to `UInt32`. + +Only available when [`R123_USE_AESNI`](@ref). +""" +mutable struct AESNI4x <: AbstractAESNI4x + x::uint64x2 + ctr1::uint64x2 + key::AESNIKey + p::Int +end + +function AESNI4x(seed::NTuple{4, Integer}=gen_seed(UInt32, 4)) + r = AESNI4x(zero(uint64x2), zero(uint64x2), AESNIKey(), 0) + seed!(r, seed) + r +end + +function seed!(r::AESNI4x, seed::NTuple{4, Integer}=gen_seed(UInt32, 4)) + key = union_uint(Tuple(x % UInt32 for x in seed)) + r.ctr1 = 0 + _aesni_expand!(r.key, uint64x2(key)) + r.p = 0 + random123_r(r) + r +end + +seed_type(::Type{AESNI4x}) = NTuple{4, UInt32} + +function copyto!(dest::AESNI4x, src::AESNI4x) + unsafe_copyto!(dest, src, UInt128, 2) + copyto!(dest.key, src.key) + dest.p = src.p + dest +end + +copy(src::AESNI4x) = copyto!(AESNI4x(), src) +==(r1::AESNI4x, r2::AESNI4x) = unsafe_compare(r1, r2, UInt128, 2) && + r1.key == r2.key && r1.p == r2.p + +function get_key_uint64x2(o::Union{AESNI1x, AESNI4x})::NTuple{11, uint64x2} + k = o.key + (k.key1,k.key2,k.key3,k.key4,k.key5,k.key6,k.key7,k.key8,k.key9,k.key10,k.key11) +end +get_ctr_uint64x2(o::AESNI4x)::Tuple{uint64x2} = (o.ctr1,) +get_ctr_uint64x2(o::AESNI1x)::Tuple{uint64x2} = (o.ctr,) +get_key(o::Union{AESNI1x, AESNI4x})::NTuple{11,UInt128} = map(UInt128, get_key_uint64x2(o)) +get_ctr(o::Union{AESNI1x, AESNI4x})::Tuple{UInt128} = map(UInt128, get_ctr_uint64x2(o)) + +@inline function aesni(key::NTuple{11,uint64x2}, ctr::Tuple{uint64x2})::Tuple{uint64x2} + key1, key2, key3, key4, key5, key6, key7, key8, key9, key10, key11 = key + ctr1 = only(ctr) + x = key1 ⊻ ctr1 + x = _aes_enc(x, key2) + x = _aes_enc(x, key3) + x = _aes_enc(x, key4) + x = _aes_enc(x, key5) + x = _aes_enc(x, key6) + x = _aes_enc(x, key7) + x = _aes_enc(x, key8) + x = _aes_enc(x, key9) + x = _aes_enc(x, key10) + x = _aes_enc_last(x, key11) + (x,) +end + +""" + aesni(key::NTuple{11,UInt128}, ctr::Tuple{UInt128})::Tuple{UInt128} + +Functional variant of [`AESNI1x`](@ref) and [`AESNI4x`](@ref). +This function if free of mutability and side effects. +""" +@inline function aesni(key::NTuple{11,UInt128}, ctr::Tuple{UInt128})::Tuple{UInt128} + k = map(uint64x2, key) + c = map(uint64x2, ctr) + map(UInt128,aesni(k,c)) +end + + +@inline function random123_r(r::AESNI1x) + r.x = only(aesni(get_key_uint64x2(r), get_ctr_uint64x2(r))) + (UInt128(r.x),) +end + +@inline function random123_r(r::AESNI4x) + r.x = only(aesni(get_key_uint64x2(r), get_ctr_uint64x2(r))) + split_uint(UInt128(r.x), UInt32) +end diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl new file mode 100644 index 0000000..11dfd82 --- /dev/null +++ b/src/aarch64/aesni_common.jl @@ -0,0 +1,149 @@ +using Base: llvmcall +import Base.(+) + +using ..Random123: R123Generator1x, R123Generator4x +import ..Random123: random123_r, set_counter! + +const LITTLE_ENDIAN::Bool = ENDIAN_BOM ≡ 0x04030201 + +const uint64x2_lvec = NTuple{2, VecElement{UInt64}} +struct uint64x2 + data::uint64x2_lvec +end +Base.convert(::Type{uint64x2}, x::UInt128) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +Base.convert(::Type{UInt128}, x::uint64x2) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +UInt128(x::uint64x2) = convert(UInt128, x) +uint64x2(x::UInt128) = convert(uint64x2, x) +Base.convert(::Type{uint64x2}, x::Union{Signed, Unsigned}) = convert(uint64x2, UInt128(x)) +Base.convert(::Type{T}, x::uint64x2) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +uint64x2(hi::UInt64, lo::UInt64) = if LITTLE_ENDIAN + uint64x2((VecElement(lo), VecElement(hi))) +else + uint64x2((VecElement(hi), VecElement(lo))) +end + +Base.zero(::Type{uint64x2}) = convert(uint64x2, 0) +Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64)) +Base.xor(a::uint64x2, b::uint64x2) = llvmcall( + """%3 = xor <2 x i64> %1, %0 + ret <2 x i64> %3""", + uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, + a.data, b.data, +) |> uint64x2 +(+)(a::uint64x2, b::uint64x2) = llvmcall( + """%3 = add <2 x i64> %1, %0 + ret <2 x i64> %3""", + uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, + a.data, b.data, +) |> uint64x2 +(+)(a::uint64x2, b::Integer) = a + uint64x2(UInt128(b)) + +const uint8x16_lvec = NTuple{16, VecElement{UInt8}} +struct uint8x16 + data::uint8x16_lvec +end +Base.convert(::Type{uint64x2}, x::uint8x16) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +Base.convert(::Type{uint8x16}, x::uint64x2) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +uint8x16(x::uint64x2) = convert(uint8x16, x) +uint64x2(x::uint8x16) = convert(uint64x2, x) +Base.convert(::Type{uint8x16}, x::UInt128) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +Base.convert(::Type{UInt128}, x::uint8x16) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +UInt128(x::uint8x16) = convert(UInt128, x) +uint8x16(x::UInt128) = convert(uint8x16, x) +Base.convert(::Type{uint8x16}, x::Union{Signed, Unsigned}) = convert(uint8x16, UInt128(x)) +Base.convert(::Type{T}, x::uint8x16) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +function uint8x16(bytes::Vararg{UInt8, 16}) + bytes_prepped = bytes + if LITTLE_ENDIAN + bytes_prepped = reverse(bytes_prepped) + end + bytes_vec::uint8x16_lvec = VecElement.(bytes_prepped) + return uint8x16(bytes_vec) +end + +Base.zero(::Type{uint8x16}) = convert(uint8x16, 0) +Base.xor(a::uint8x16, b::uint8x16) = llvmcall( + """%3 = xor <16 x i8> %1, %0 + ret <16 x i8> %3""", + uint8x16_lvec, Tuple{uint8x16_lvec, uint8x16_lvec}, + a.data, b.data, +) |> uint8x16 + +# Raw NEON instrinsics, provided by FEAT_AES +_vaese(a::uint8x16, b::uint8x16) = ccall( + "llvm.aarch64.crypto.aese", + llvmcall, + uint8x16_lvec, + (uint8x16_lvec, uint8x16_lvec), + a.data, b.data, +) |> uint8x16 +_vaesmc(a::uint8x16) = ccall( + "llvm.aarch64.crypto.aesmc", + llvmcall, + uint8x16_lvec, + (uint8x16_lvec,), + a.data, +) |> uint8x16 + +""" +Assistant function for AES keygen. Originally compiled for AArch64 from the C source code: +```cpp +uint8x16_t _mm_aeskeygenassist_helper(uint8x16_t a) +{ + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + a[0x4], a[0x1], a[0xE], a[0xB], // SubBytes(X1) + a[0x1], a[0xE], a[0xB], a[0x4], // ROT(SubBytes(X1)) + a[0xC], a[0x9], a[0x6], a[0x3], // SubBytes(X3) + a[0x9], a[0x6], a[0x3], a[0xC], // ROT(SubBytes(X3)) + }; + return dest; +} +``` +Then made architecture-agnostic as LLVM IR. +""" +_aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall( + """%2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> + ret <16 x i8> %2""", + uint8x16_lvec, Tuple{uint8x16_lvec}, + a.data, +) |> uint8x16 + +# Mimics of the x86 AES-NI instrinsics +# +# Algorithm translations courtesy of the SIMD Everywhere and SSE2NEON projects: +# https://github.com/simd-everywhere/simde/blob/v0.8.0-rc1/simde/x86/aes.h +# https://github.com/DLTcollab/sse2neon/blob/v1.6.0/sse2neon.h +function _aes_enc(a::uint64x2, round_key::uint64x2) + res = _vaesmc(_vaese(uint8x16(a), zero(uint8x16))) + return uint64x2(res) ⊻ round_key +end +function _aes_enc_last(a::uint64x2, round_key::uint64x2) + res = _vaese(uint8x16(a), zero(uint8x16)) + return uint64x2(res) ⊻ round_key +end + +function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R} + res = _aes_key_gen_shuffle_helper(_vaese(uint8x16(a), zero(uint8x16))) + r = R % UInt64 + return uint64x2(res) ⊻ uint64x2(r, r) +end + +"Abstract RNG that generates one number at a time and is based on AESNI." +abstract type AbstractAESNI1x <: R123Generator1x{UInt128} end +"Abstract RNG that generates four numbers at a time and is based on AESNI." +abstract type AbstractAESNI4x <: R123Generator4x{UInt32} end + +@inline function set_counter!( + r::AbstractAESNI4x, + ctr::NTuple{4, Integer} +) + r.p = 0 + r.ctr1 = union_uint(Tuple(x % UInt32 for x in ctr)) + random123_r(r) + r +end + +@inline inc_counter!(r::AbstractAESNI4x) = (r.ctr1 += one(uint64x2); r) diff --git a/src/aarch64/ars.jl b/src/aarch64/ars.jl new file mode 100644 index 0000000..8571c56 --- /dev/null +++ b/src/aarch64/ars.jl @@ -0,0 +1,163 @@ +import Base: copy, copyto!, == +import Random: rand, seed! +import RandomNumbers: gen_seed, split_uint, union_uint, seed_type, unsafe_copyto!, unsafe_compare + +""" +```julia +ARS1x{R} <: AbstractAESNI1x +ARS1x([seed, R=7]) +``` + +ARS1x is one kind of ARS Counter-Based RNGs. It generates one `UInt128` number at a time. + +`seed` is an `Integer` which will be automatically converted to `UInt128`. + +`R` denotes to the Rounds which should be at least 1 and no more than 10. With 7 rounds (by default), it has +a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has +excellent performance. + +Only available when [`R123_USE_AESNI`](@ref). +""" +mutable struct ARS1x{R} <: AbstractAESNI1x + x::uint64x2 + ctr::uint64x2 + key::uint64x2 +end + +function ARS1x(seed::Integer=gen_seed(UInt128), R::Integer = 7) + R = Int(R) + @assert 1 ≤ R ≤ 10 + m0 = zero(uint64x2) + r = ARS1x{R}(m0, m0, m0) + seed!(r, seed) +end + +function seed!(r::ARS1x, seed::Integer=gen_seed(UInt128)) + r.x = zero(uint64x2) + r.ctr = zero(uint64x2) + r.key = seed % UInt128 + random123_r(r) + r +end + +@inline seed_type(::Type{ARS1x{R}}) where R = UInt128 + +copyto!(dest::ARS1x{R}, src::ARS1x{R}) where R = unsafe_copyto!(dest, src, UInt128, 3) + +copy(src::ARS1x{R}) where R = ARS1x{R}(src.x, src.ctr, src.key) + +==(r1::ARS1x{R}, r2::ARS1x{R}) where R = unsafe_compare(r1, r2, UInt128, 3) + +""" +```julia +ARS4x{R} <: AbstractAESNI4x +ARS4x([seed, R=7]) +``` + +ARS4x is one kind of ARS Counter-Based RNGs. It generates four `UInt32` numbers at a time. + +`seed` is a `Tuple` of four `Integer`s which will all be automatically converted to `UInt32`. + +`R` denotes to the Rounds which must be at least 1 and no more than 10. With 7 rounds (by default), it has a +considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has +excellent performance. + +Only available when [`R123_USE_AESNI`](@ref). +""" +mutable struct ARS4x{R} <: AbstractAESNI4x + x::uint64x2 + ctr1::uint64x2 + key::uint64x2 + p::Int +end + +function ARS4x(seed::NTuple{4, Integer}=gen_seed(UInt32, 4), R::Integer=7) + R = Int(R) + @assert 1 ≤ R ≤ 10 + r = ARS4x{R}(zero(uint64x2), zero(uint64x2), zero(uint64x2), 0) + seed!(r, seed) +end + +function seed!(r::ARS4x, seed::NTuple{4, Integer}=gen_seed(UInt32, 4)) + r.ctr1 = zero(uint64x2) + r.key = union_uint(Tuple(x % UInt32 for x in seed)) + r.p = 0 + random123_r(r) + r +end + +@inline seed_type(::Type{ARS4x{R}}) where R = NTuple{4, UInt32} + +function copyto!(dest::ARS4x{R}, src::ARS4x{R}) where R + unsafe_copyto!(dest, src, UInt128, 3) + dest.p = src.p + dest +end + +copy(src::ARS4x{R}) where R = ARS4x{R}(src.x, src.ctr1, src.key, src.p) + +==(r1::ARS4x{R}, r2::ARS4x{R}) where R = unsafe_compare(r1, r2, UInt128, 3) && r1.p ≡ r2.p + +function expr_ars1xm128i(expr_key, expr_ctr, R) + @assert R isa Int && 1 ≤ R ≤ 10 + rounds = [quote + kk += kweyl + v = _aes_enc(v, kk) + end for _ in 2:R] + quote + ctr = $(expr_ctr) + key = $(expr_key) + kweyl = uint64x2(0xbb67ae8584caa73b, 0x9e3779b97f4a7c15) + kk = key + v = ctr ⊻ kk + q1 = UInt128(ctr) + q2 = UInt128(key) + $(rounds...) + kk += kweyl + ret = _aes_enc_last(v, kk) + end +end + +@generated function ars1xm128i(r::Union{ARS1x{R}, ARS4x{R}}) where R + expr_ctr = if r <: ARS1x + :(r.ctr) + elseif r <: ARS4x + :(r.ctr1) + else + :(error("Unreachable")) + end + expr_key = :(r.key) + expr_ars1xm128i(expr_key, expr_ctr, R) +end + +@generated function ars(key::Tuple{uint64x2}, ctr::Tuple{uint64x2}, ::Val{R})::Tuple{uint64x2} where {R} + :(($(expr_ars1xm128i(:(only(key)), :(only(ctr)), R)),)) +end + +""" + ars(key::Tuple{UInt128}, ctr::Tuple{UInt128}, rounds::Val{R})::Tuple{UInt128} where {R} + +Functional variant of [`ARS1x`](@ref) and [`ARS4x`](@ref). +This function if free of mutability and side effects. +""" +function ars(key::Tuple{UInt128}, ctr::Tuple{UInt128}, rounds::Val{R})::Tuple{UInt128} where {R} + k = map(uint64x2, key) + c = map(uint64x2, ctr) + map(UInt128,ars(k,c,rounds)) +end + +get_key(r::Union{ARS1x, ARS4x}) = (UInt128(r.key),) +get_ctr(r::ARS1x) = (UInt128(r.ctr),) +get_ctr(r::ARS4x) = (UInt128(r.ctr1),) + +@inline function random123_r(r::ARS1x{R}) where R + r.x = ars1xm128i(r) + (UInt128(r.x),) +end + +@inline function random123_r(r::ARS4x{R}) where R + r.x = ars1xm128i(r) + split_uint(UInt128(r.x), UInt32) +end + + diff --git a/test/aarch64/aesni.jl b/test/aarch64/aesni.jl new file mode 100644 index 0000000..fdbef74 --- /dev/null +++ b/test/aarch64/aesni.jl @@ -0,0 +1,26 @@ +import Random: seed! +using Test: @test + +using RandomNumbers +using Random123 + +import RandomNumbers: split_uint +import Random123: uint64x2, AESNIKey + +x = zero(uint64x2) +ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) +key = 0x07b8e4b6aa98ec245a7da274d3b8146a +aesni_key = AESNIKey(key) +@test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 + +r = AESNI1x(key) +r1 = AESNI4x(split_uint(key, UInt32)) +@test seed_type(r) ≡ UInt128 +@test seed_type(r1) ≡ NTuple{4, UInt32} +@test copyto!(copy(r), r) == r +@test copyto!(copy(r1), r1) == r1 +@test UInt128(r.x) ≡ rand(r1, UInt128) +@test rand(r, UInt128) ≡ rand(r1, UInt128) +set_counter!(r, 0) +set_counter!(r1, 1) +@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) diff --git a/test/aarch64/ars.jl b/test/aarch64/ars.jl new file mode 100644 index 0000000..181b487 --- /dev/null +++ b/test/aarch64/ars.jl @@ -0,0 +1,29 @@ +import Random: seed! +using Test: @test + +using RandomNumbers +using Random123 + +import RandomNumbers: split_uint +import Random123: uint64x2 + +x = zero(uint64x2) +ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) +key = uint64x2(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) +@test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef +@test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 +@test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xb6621a8b006319e8, 0x67c841642c32fc19)) +@test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xac35df44f996ed82, 0x4e287697bad2f9a2)) + +key = rand(UInt128) +r = ARS1x(key) +r1 = ARS4x(split_uint(key, UInt32)) +@test seed_type(r) ≡ UInt128 +@test seed_type(r1) ≡ NTuple{4, UInt32} +@test copyto!(copy(r), r) == r +@test copyto!(copy(r1), r1) == r1 +@test UInt128(r.x) ≡ rand(r1, UInt128) +@test rand(r, UInt128) ≡ rand(r1, UInt128) +set_counter!(r, 0) +set_counter!(r1, 1) +@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) diff --git a/test/runtests.jl b/test/runtests.jl index 12b7a14..5b092f6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,7 +12,8 @@ using Printf: @printf seed1 = 1 seed2 = (1,2) seed4 = (1,2,3,4) - alg_choices = [ + AlgChoice = Tuple{Random123.AbstractR123, Function, Union{Tuple{}, Tuple{Val}}} + alg_choices = AlgChoice[ (Threefry2x(UInt32, seed2) , threefry, (Val(20),)) , (Threefry2x(UInt64, seed2) , threefry, (Val(20),)) , (Threefry4x(UInt32, seed4) , threefry, (Val(20),)) , @@ -23,7 +24,7 @@ using Printf: @printf (Philox4x(UInt64 , seed2) , philox , (Val(10),)) , ] if R123_USE_AESNI - append!(alg_choices, [ + append!(alg_choices, AlgChoice[ (AESNI1x(seed1) , aesni , () ) , (AESNI4x(seed4) , aesni , () ) , (ARS1x(seed1) , ars , (Val(7),) ) , @@ -174,4 +175,7 @@ cd(pwd_) if Random123.R123_USE_X86_AES_NI include("./x86/aesni.jl") include("./x86/ars.jl") +elseif Random123.R123_USE_AARCH64_FEAT_AES + include("./aarch64/aesni.jl") + include("./aarch64/ars.jl") end From 2d2ae22963c6618b60465c4135f83c708deaaffe Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Mon, 25 Dec 2023 21:13:48 -0800 Subject: [PATCH 04/12] Fix typos --- src/Random123.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Random123.jl b/src/Random123.jl index ea67910..53c5dd1 100644 --- a/src/Random123.jl +++ b/src/Random123.jl @@ -29,7 +29,7 @@ include("philox.jl") export R123_USE_AESNI -"True when x86 AES-NI instructiona have been detected." +"True when x86 AES-NI instructions have been detected." const R123_USE_X86_AES_NI::Bool = @static if Sys.ARCH ≡ :x86_64 || Sys.ARCH ≡ :i686 try cmd = Base.julia_cmd() @@ -49,7 +49,7 @@ else false end -"True when AArch64 FEAT_AES intrinsics have been detected." +"True when AArch64 FEAT_AES instructions have been detected." const R123_USE_AARCH64_FEAT_AES::Bool = if Sys.ARCH ≡ :aarch64 try cmd = Base.julia_cmd() From 77e98ebbf9a175214d5cc09ec2cdda9fbfe3b52f Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Mon, 25 Dec 2023 22:03:39 -0800 Subject: [PATCH 05/12] Introduce and use `_aes_enc_full` --- src/aarch64/aesni.jl | 18 +------- src/aarch64/aesni_common.jl | 86 ++++++++++++++++++++++--------------- test/runtests.jl | 4 +- 3 files changed, 56 insertions(+), 52 deletions(-) diff --git a/src/aarch64/aesni.jl b/src/aarch64/aesni.jl index b93a52e..3238ee1 100644 --- a/src/aarch64/aesni.jl +++ b/src/aarch64/aesni.jl @@ -211,22 +211,8 @@ get_ctr_uint64x2(o::AESNI1x)::Tuple{uint64x2} = (o.ctr,) get_key(o::Union{AESNI1x, AESNI4x})::NTuple{11,UInt128} = map(UInt128, get_key_uint64x2(o)) get_ctr(o::Union{AESNI1x, AESNI4x})::Tuple{UInt128} = map(UInt128, get_ctr_uint64x2(o)) -@inline function aesni(key::NTuple{11,uint64x2}, ctr::Tuple{uint64x2})::Tuple{uint64x2} - key1, key2, key3, key4, key5, key6, key7, key8, key9, key10, key11 = key - ctr1 = only(ctr) - x = key1 ⊻ ctr1 - x = _aes_enc(x, key2) - x = _aes_enc(x, key3) - x = _aes_enc(x, key4) - x = _aes_enc(x, key5) - x = _aes_enc(x, key6) - x = _aes_enc(x, key7) - x = _aes_enc(x, key8) - x = _aes_enc(x, key9) - x = _aes_enc(x, key10) - x = _aes_enc_last(x, key11) - (x,) -end +@inline aesni(key::NTuple{11,uint64x2}, ctr::Tuple{uint64x2})::Tuple{uint64x2} = + (_aes_enc_full(only(ctr), key),) """ aesni(key::NTuple{11,UInt128}, ctr::Tuple{UInt128})::Tuple{UInt128} diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl index 11dfd82..0834ee7 100644 --- a/src/aarch64/aesni_common.jl +++ b/src/aarch64/aesni_common.jl @@ -10,51 +10,51 @@ const uint64x2_lvec = NTuple{2, VecElement{UInt64}} struct uint64x2 data::uint64x2_lvec end -Base.convert(::Type{uint64x2}, x::UInt128) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) -Base.convert(::Type{UInt128}, x::uint64x2) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) -UInt128(x::uint64x2) = convert(UInt128, x) -uint64x2(x::UInt128) = convert(uint64x2, x) -Base.convert(::Type{uint64x2}, x::Union{Signed, Unsigned}) = convert(uint64x2, UInt128(x)) -Base.convert(::Type{T}, x::uint64x2) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) - -uint64x2(hi::UInt64, lo::UInt64) = if LITTLE_ENDIAN +@inline Base.convert(::Type{uint64x2}, x::UInt128) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{UInt128}, x::uint64x2) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +@inline UInt128(x::uint64x2) = convert(UInt128, x) +@inline uint64x2(x::UInt128) = convert(uint64x2, x) +@inline Base.convert(::Type{uint64x2}, x::Union{Signed, Unsigned}) = convert(uint64x2, UInt128(x)) +@inline Base.convert(::Type{T}, x::uint64x2) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +@inline uint64x2(hi::UInt64, lo::UInt64) = @static if LITTLE_ENDIAN uint64x2((VecElement(lo), VecElement(hi))) else uint64x2((VecElement(hi), VecElement(lo))) end -Base.zero(::Type{uint64x2}) = convert(uint64x2, 0) -Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64)) -Base.xor(a::uint64x2, b::uint64x2) = llvmcall( +@inline Base.zero(::Type{uint64x2}) = convert(uint64x2, 0) +@inline Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64)) +@inline Base.xor(a::uint64x2, b::uint64x2) = llvmcall( """%3 = xor <2 x i64> %1, %0 ret <2 x i64> %3""", uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, a.data, b.data, ) |> uint64x2 -(+)(a::uint64x2, b::uint64x2) = llvmcall( +@inline (+)(a::uint64x2, b::uint64x2) = llvmcall( """%3 = add <2 x i64> %1, %0 ret <2 x i64> %3""", uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec}, a.data, b.data, ) |> uint64x2 -(+)(a::uint64x2, b::Integer) = a + uint64x2(UInt128(b)) +@inline (+)(a::uint64x2, b::Integer) = a + uint64x2(UInt128(b)) const uint8x16_lvec = NTuple{16, VecElement{UInt8}} struct uint8x16 data::uint8x16_lvec end -Base.convert(::Type{uint64x2}, x::uint8x16) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) -Base.convert(::Type{uint8x16}, x::uint64x2) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) -uint8x16(x::uint64x2) = convert(uint8x16, x) -uint64x2(x::uint8x16) = convert(uint64x2, x) -Base.convert(::Type{uint8x16}, x::UInt128) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) -Base.convert(::Type{UInt128}, x::uint8x16) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) -UInt128(x::uint8x16) = convert(UInt128, x) -uint8x16(x::UInt128) = convert(uint8x16, x) -Base.convert(::Type{uint8x16}, x::Union{Signed, Unsigned}) = convert(uint8x16, UInt128(x)) -Base.convert(::Type{T}, x::uint8x16) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) - -function uint8x16(bytes::Vararg{UInt8, 16}) +@inline Base.convert(::Type{uint64x2}, x::uint8x16) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{uint8x16}, x::uint64x2) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +@inline uint8x16(x::uint64x2) = convert(uint8x16, x) +@inline uint64x2(x::uint8x16) = convert(uint64x2, x) +@inline Base.convert(::Type{uint8x16}, x::UInt128) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{UInt128}, x::uint8x16) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +@inline UInt128(x::uint8x16) = convert(UInt128, x) +@inline uint8x16(x::UInt128) = convert(uint8x16, x) +@inline Base.convert(::Type{uint8x16}, x::Union{Signed, Unsigned}) = convert(uint8x16, UInt128(x)) +@inline Base.convert(::Type{T}, x::uint8x16) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +@inline function uint8x16(bytes::Vararg{UInt8, 16}) bytes_prepped = bytes if LITTLE_ENDIAN bytes_prepped = reverse(bytes_prepped) @@ -63,8 +63,8 @@ function uint8x16(bytes::Vararg{UInt8, 16}) return uint8x16(bytes_vec) end -Base.zero(::Type{uint8x16}) = convert(uint8x16, 0) -Base.xor(a::uint8x16, b::uint8x16) = llvmcall( +@inline Base.zero(::Type{uint8x16}) = convert(uint8x16, 0) +@inline Base.xor(a::uint8x16, b::uint8x16) = llvmcall( """%3 = xor <16 x i8> %1, %0 ret <16 x i8> %3""", uint8x16_lvec, Tuple{uint8x16_lvec, uint8x16_lvec}, @@ -72,14 +72,14 @@ Base.xor(a::uint8x16, b::uint8x16) = llvmcall( ) |> uint8x16 # Raw NEON instrinsics, provided by FEAT_AES -_vaese(a::uint8x16, b::uint8x16) = ccall( +@inline _vaese(a::uint8x16, b::uint8x16) = ccall( "llvm.aarch64.crypto.aese", llvmcall, uint8x16_lvec, (uint8x16_lvec, uint8x16_lvec), a.data, b.data, ) |> uint8x16 -_vaesmc(a::uint8x16) = ccall( +@inline _vaesmc(a::uint8x16) = ccall( "llvm.aarch64.crypto.aesmc", llvmcall, uint8x16_lvec, @@ -104,7 +104,7 @@ uint8x16_t _mm_aeskeygenassist_helper(uint8x16_t a) ``` Then made architecture-agnostic as LLVM IR. """ -_aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall( +@inline _aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall( """%2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> ret <16 x i8> %2""", uint8x16_lvec, Tuple{uint8x16_lvec}, @@ -116,21 +116,39 @@ _aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall( # Algorithm translations courtesy of the SIMD Everywhere and SSE2NEON projects: # https://github.com/simd-everywhere/simde/blob/v0.8.0-rc1/simde/x86/aes.h # https://github.com/DLTcollab/sse2neon/blob/v1.6.0/sse2neon.h -function _aes_enc(a::uint64x2, round_key::uint64x2) +@inline function _aes_enc(a::uint64x2, round_key::uint64x2) res = _vaesmc(_vaese(uint8x16(a), zero(uint8x16))) return uint64x2(res) ⊻ round_key end -function _aes_enc_last(a::uint64x2, round_key::uint64x2) +@inline function _aes_enc_last(a::uint64x2, round_key::uint64x2) res = _vaese(uint8x16(a), zero(uint8x16)) return uint64x2(res) ⊻ round_key end - -function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R} +@inline function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R} res = _aes_key_gen_shuffle_helper(_vaese(uint8x16(a), zero(uint8x16))) r = R % UInt64 return uint64x2(res) ⊻ uint64x2(r, r) end +""" + _aes_enc_full(a::uint64x2, round_keys::NTuple{N,uint64x2})::uint64x2 where {N} + +Full AES encryption flow for N rounds. +""" +@inline function _aes_enc_full(a::uint64x2, round_keys::NTuple{N,uint64x2})::uint64x2 where {N} + res = uint8x16(a) + for (i, key) in enumerate(round_keys) + if i ≢ N + res = _vaese(res, uint8x16(key)) + if i ≢ N - 1 + res = _vaesmc(res) + end + else + return uint64x2(res ⊻ uint8x16(key)) + end + end +end + "Abstract RNG that generates one number at a time and is based on AESNI." abstract type AbstractAESNI1x <: R123Generator1x{UInt128} end "Abstract RNG that generates four numbers at a time and is based on AESNI." diff --git a/test/runtests.jl b/test/runtests.jl index 5b092f6..913ad19 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -23,7 +23,7 @@ using Printf: @printf (Philox4x(UInt32 , seed2) , philox , (Val(10),)) , (Philox4x(UInt64 , seed2) , philox , (Val(10),)) , ] - if R123_USE_AESNI + @static if R123_USE_AESNI append!(alg_choices, AlgChoice[ (AESNI1x(seed1) , aesni , () ) , (AESNI4x(seed4) , aesni , () ) , @@ -172,7 +172,7 @@ redirect_stdout(stdout_) compare_dirs("expected", "actual") cd(pwd_) -if Random123.R123_USE_X86_AES_NI +@static if Random123.R123_USE_X86_AES_NI include("./x86/aesni.jl") include("./x86/ars.jl") elseif Random123.R123_USE_AARCH64_FEAT_AES From b26bf4b3fe1fd7cdea0c50edb403ab43e52c87c4 Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Wed, 27 Dec 2023 08:22:24 -0800 Subject: [PATCH 06/12] Handle 0-round case for _aes_enc_full --- src/aarch64/aesni_common.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl index 0834ee7..449c4dc 100644 --- a/src/aarch64/aesni_common.jl +++ b/src/aarch64/aesni_common.jl @@ -147,6 +147,7 @@ Full AES encryption flow for N rounds. return uint64x2(res ⊻ uint8x16(key)) end end + return a # pathological 0-round case end "Abstract RNG that generates one number at a time and is based on AESNI." From dcc8df6a3a8a60e782b7d9d55747cf980f3e29dd Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Wed, 27 Dec 2023 19:36:52 -0800 Subject: [PATCH 07/12] Wrap AES acceleration tests in testsets --- test/aarch64/aesni.jl | 36 +++++++++++++++++++----------------- test/aarch64/ars.jl | 42 ++++++++++++++++++++++-------------------- test/x86/aesni.jl | 36 +++++++++++++++++++----------------- test/x86/ars.jl | 42 ++++++++++++++++++++++-------------------- 4 files changed, 82 insertions(+), 74 deletions(-) diff --git a/test/aarch64/aesni.jl b/test/aarch64/aesni.jl index fdbef74..8a83fcb 100644 --- a/test/aarch64/aesni.jl +++ b/test/aarch64/aesni.jl @@ -1,5 +1,5 @@ import Random: seed! -using Test: @test +using Test: @test, @testset using RandomNumbers using Random123 @@ -7,20 +7,22 @@ using Random123 import RandomNumbers: split_uint import Random123: uint64x2, AESNIKey -x = zero(uint64x2) -ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) -key = 0x07b8e4b6aa98ec245a7da274d3b8146a -aesni_key = AESNIKey(key) -@test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 +@testset "Accelerated AESNI" begin + x = zero(uint64x2) + ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + key = 0x07b8e4b6aa98ec245a7da274d3b8146a + aesni_key = AESNIKey(key) + @test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 -r = AESNI1x(key) -r1 = AESNI4x(split_uint(key, UInt32)) -@test seed_type(r) ≡ UInt128 -@test seed_type(r1) ≡ NTuple{4, UInt32} -@test copyto!(copy(r), r) == r -@test copyto!(copy(r1), r1) == r1 -@test UInt128(r.x) ≡ rand(r1, UInt128) -@test rand(r, UInt128) ≡ rand(r1, UInt128) -set_counter!(r, 0) -set_counter!(r1, 1) -@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) + r = AESNI1x(key) + r1 = AESNI4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end diff --git a/test/aarch64/ars.jl b/test/aarch64/ars.jl index 181b487..426dde5 100644 --- a/test/aarch64/ars.jl +++ b/test/aarch64/ars.jl @@ -1,5 +1,5 @@ import Random: seed! -using Test: @test +using Test: @test, @testset using RandomNumbers using Random123 @@ -7,23 +7,25 @@ using Random123 import RandomNumbers: split_uint import Random123: uint64x2 -x = zero(uint64x2) -ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) -key = uint64x2(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) -@test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef -@test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 -@test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xb6621a8b006319e8, 0x67c841642c32fc19)) -@test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xac35df44f996ed82, 0x4e287697bad2f9a2)) +@testset "Accelerated ARS" begin + x = zero(uint64x2) + ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + key = uint64x2(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) + @test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef + @test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 + @test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xb6621a8b006319e8, 0x67c841642c32fc19)) + @test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(uint64x2(0xac35df44f996ed82, 0x4e287697bad2f9a2)) -key = rand(UInt128) -r = ARS1x(key) -r1 = ARS4x(split_uint(key, UInt32)) -@test seed_type(r) ≡ UInt128 -@test seed_type(r1) ≡ NTuple{4, UInt32} -@test copyto!(copy(r), r) == r -@test copyto!(copy(r1), r1) == r1 -@test UInt128(r.x) ≡ rand(r1, UInt128) -@test rand(r, UInt128) ≡ rand(r1, UInt128) -set_counter!(r, 0) -set_counter!(r1, 1) -@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) + key = rand(UInt128) + r = ARS1x(key) + r1 = ARS4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end diff --git a/test/x86/aesni.jl b/test/x86/aesni.jl index 79d1129..5adc6a5 100644 --- a/test/x86/aesni.jl +++ b/test/x86/aesni.jl @@ -1,5 +1,5 @@ import Random: seed! -using Test: @test +using Test: @test, @testset using RandomNumbers using Random123 @@ -7,20 +7,22 @@ using Random123 import RandomNumbers: split_uint import Random123: __m128i, AESNIKey -x = zero(__m128i) -ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) -key = 0x07b8e4b6aa98ec245a7da274d3b8146a -aesni_key = AESNIKey(key) -@test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 +@testset "Accelerated AESNI" begin + x = zero(__m128i) + ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + key = 0x07b8e4b6aa98ec245a7da274d3b8146a + aesni_key = AESNIKey(key) + @test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 -r = AESNI1x(key) -r1 = AESNI4x(split_uint(key, UInt32)) -@test seed_type(r) ≡ UInt128 -@test seed_type(r1) ≡ NTuple{4, UInt32} -@test copyto!(copy(r), r) == r -@test copyto!(copy(r1), r1) == r1 -@test UInt128(r.x) ≡ rand(r1, UInt128) -@test rand(r, UInt128) ≡ rand(r1, UInt128) -set_counter!(r, 0) -set_counter!(r1, 1) -@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) + r = AESNI1x(key) + r1 = AESNI4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end diff --git a/test/x86/ars.jl b/test/x86/ars.jl index 1cbd9c9..38355d0 100644 --- a/test/x86/ars.jl +++ b/test/x86/ars.jl @@ -1,5 +1,5 @@ import Random: seed! -using Test: @test +using Test: @test, @testset using RandomNumbers using Random123 @@ -7,23 +7,25 @@ using Random123 import RandomNumbers: split_uint import Random123: __m128i -x = zero(__m128i) -ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) -key = __m128i(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) -@test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef -@test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 -@test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xb6621a8b006319e8, 0x67c841642c32fc19)) -@test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xac35df44f996ed82, 0x4e287697bad2f9a2)) +@testset "Accelerated ARS" begin + x = zero(__m128i) + ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + key = __m128i(0x07b8e4b6aa98ec24, 0x5a7da274d3b8146a) + @test rand(ARS1x{1}(x, ctr, key), UInt128) ≡ 0x1a0b14c707b64224e548ef12331396ef + @test rand(ARS1x{2}(x, ctr, key), UInt128) ≡ 0x3ced8e0970690f718336318ba22e8ae1 + @test rand(ARS1x{3}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xb6621a8b006319e8, 0x67c841642c32fc19)) + @test rand(ARS1x{10}(x, ctr, key), UInt128) ≡ UInt128(__m128i(0xac35df44f996ed82, 0x4e287697bad2f9a2)) -key = rand(UInt128) -r = ARS1x(key) -r1 = ARS4x(split_uint(key, UInt32)) -@test seed_type(r) ≡ UInt128 -@test seed_type(r1) ≡ NTuple{4, UInt32} -@test copyto!(copy(r), r) == r -@test copyto!(copy(r1), r1) == r1 -@test UInt128(r.x) ≡ rand(r1, UInt128) -@test rand(r, UInt128) ≡ rand(r1, UInt128) -set_counter!(r, 0) -set_counter!(r1, 1) -@test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) + key = rand(UInt128) + r = ARS1x(key) + r1 = ARS4x(split_uint(key, UInt32)) + @test seed_type(r) ≡ UInt128 + @test seed_type(r1) ≡ NTuple{4, UInt32} + @test copyto!(copy(r), r) == r + @test copyto!(copy(r1), r1) == r1 + @test UInt128(r.x) ≡ rand(r1, UInt128) + @test rand(r, UInt128) ≡ rand(r1, UInt128) + set_counter!(r, 0) + set_counter!(r1, 1) + @test rand(r, Tuple{UInt128})[1] ≡ rand(r1, UInt128) +end From cfa870413bc45ae16e6d057e333c689caa1e4835 Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Wed, 27 Dec 2023 19:43:21 -0800 Subject: [PATCH 08/12] Endian-agnostic exploitation of conversion (changes nothing) --- test/aarch64/aesni.jl | 2 +- test/x86/aesni.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/aarch64/aesni.jl b/test/aarch64/aesni.jl index 8a83fcb..256df60 100644 --- a/test/aarch64/aesni.jl +++ b/test/aarch64/aesni.jl @@ -9,7 +9,7 @@ import Random123: uint64x2, AESNIKey @testset "Accelerated AESNI" begin x = zero(uint64x2) - ctr = uint64x2(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + ctr = uint64x2(0x9799b5d54f7b9227b47607190d0dfefb) key = 0x07b8e4b6aa98ec245a7da274d3b8146a aesni_key = AESNIKey(key) @test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 diff --git a/test/x86/aesni.jl b/test/x86/aesni.jl index 5adc6a5..0c112fe 100644 --- a/test/x86/aesni.jl +++ b/test/x86/aesni.jl @@ -9,7 +9,7 @@ import Random123: __m128i, AESNIKey @testset "Accelerated AESNI" begin x = zero(__m128i) - ctr = __m128i(0x9799b5d54f7b9227, 0xb47607190d0dfefb) + ctr = __m128i(0x9799b5d54f7b9227b47607190d0dfefb) key = 0x07b8e4b6aa98ec245a7da274d3b8146a aesni_key = AESNIKey(key) @test rand(AESNI1x(x, ctr, aesni_key), UInt128) ≡ 0x60f4c27fe48fe1b8c5f4568a585b0dc0 From 6605004360e29006292631c65f6c749c46fc06e7 Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Sat, 17 Feb 2024 16:56:44 -0800 Subject: [PATCH 09/12] Make all endianness decisions static --- src/aarch64/aesni_common.jl | 2 +- src/x86/aesni_common.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl index 449c4dc..8042426 100644 --- a/src/aarch64/aesni_common.jl +++ b/src/aarch64/aesni_common.jl @@ -56,7 +56,7 @@ end @inline function uint8x16(bytes::Vararg{UInt8, 16}) bytes_prepped = bytes - if LITTLE_ENDIAN + @static if LITTLE_ENDIAN bytes_prepped = reverse(bytes_prepped) end bytes_vec::uint8x16_lvec = VecElement.(bytes_prepped) diff --git a/src/x86/aesni_common.jl b/src/x86/aesni_common.jl index fbb440d..6d70829 100644 --- a/src/x86/aesni_common.jl +++ b/src/x86/aesni_common.jl @@ -16,7 +16,7 @@ Base.convert(::Type{__m128i}, x::Union{Signed, Unsigned}) = convert(__m128i, UIn Base.convert(::Type{T}, x::__m128i) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) const LITTLE_ENDIAN = ENDIAN_BOM ≡ 0x04030201 -__m128i(hi::UInt64, lo::UInt64) = LITTLE_ENDIAN ? __m128i((VecElement(lo), VecElement(hi))) : __m128i((VecElement(hi), VecElement(lo))) +__m128i(hi::UInt64, lo::UInt64) = @static LITTLE_ENDIAN ? __m128i((VecElement(lo), VecElement(hi))) : __m128i((VecElement(hi), VecElement(lo))) Base.zero(::Type{__m128i}) = __m128i(zero(UInt64), zero(UInt64)) Base.one(::Type{__m128i}) = __m128i(zero(UInt64), one(UInt64)) From ee0f5344e4534dc874bca0801f2acb17232702c8 Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Sat, 17 Feb 2024 16:58:09 -0800 Subject: [PATCH 10/12] Fix keygenassist emulation There was a weird internal endianness issue. --- src/aarch64/aesni_common.jl | 41 +++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl index 8042426..3739760 100644 --- a/src/aarch64/aesni_common.jl +++ b/src/aarch64/aesni_common.jl @@ -71,6 +71,42 @@ end a.data, b.data, ) |> uint8x16 +const uint32x4_lvec = NTuple{4, VecElement{UInt32}} +struct uint32x4 + data::uint32x4_lvec +end +@inline Base.convert(::Type{uint64x2}, x::uint32x4) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{uint32x4}, x::uint64x2) = unsafe_load(Ptr{uint32x4}(pointer_from_objref(Ref(x)))) +@inline uint32x4(x::uint64x2) = convert(uint32x4, x) +@inline uint64x2(x::uint32x4) = convert(uint64x2, x) +@inline Base.convert(::Type{uint8x16}, x::uint32x4) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{uint32x4}, x::uint8x16) = unsafe_load(Ptr{uint32x4}(pointer_from_objref(Ref(x)))) +@inline uint32x4(x::uint8x16) = convert(uint32x4, x) +@inline uint8x16(x::uint32x4) = convert(uint8x16, x) +@inline Base.convert(::Type{uint32x4}, x::UInt128) = unsafe_load(Ptr{uint32x4}(pointer_from_objref(Ref(x)))) +@inline Base.convert(::Type{UInt128}, x::uint32x4) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x)))) +@inline UInt128(x::uint32x4) = convert(UInt128, x) +@inline uint32x4(x::UInt128) = convert(uint32x4, x) +@inline Base.convert(::Type{uint32x4}, x::Union{Signed, Unsigned}) = convert(uint32x4, UInt128(x)) +@inline Base.convert(::Type{T}, x::uint32x4) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x)) + +@inline function uint32x4(bytes::Vararg{UInt32, 4}) + bytes_prepped = bytes + @static if LITTLE_ENDIAN + bytes_prepped = reverse(bytes_prepped) + end + bytes_vec::uint32x4_lvec = VecElement.(bytes_prepped) + return uint32x4(bytes_vec) +end + +@inline Base.zero(::Type{uint32x4}) = convert(uint32x4, 0) +@inline Base.xor(a::uint32x4, b::uint32x4) = llvmcall( + """%3 = xor <4 x i32> %1, %0 + ret <4 x i32> %3""", + uint32x4_lvec, Tuple{uint32x4_lvec, uint32x4_lvec}, + a.data, b.data, +) |> uint32x4 + # Raw NEON instrinsics, provided by FEAT_AES @inline _vaese(a::uint8x16, b::uint8x16) = ccall( "llvm.aarch64.crypto.aese", @@ -126,8 +162,9 @@ end end @inline function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R} res = _aes_key_gen_shuffle_helper(_vaese(uint8x16(a), zero(uint8x16))) - r = R % UInt64 - return uint64x2(res) ⊻ uint64x2(r, r) + r = R % UInt32 + z = zero(UInt32) + return uint64x2(res) ⊻ uint64x2(uint32x4(r, z, r, z)) end """ From e66aed9304f6181912890c0762f2b5ba5132e75a Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Sat, 17 Feb 2024 17:14:25 -0800 Subject: [PATCH 11/12] Fix docs typo --- src/aarch64/aesni.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aarch64/aesni.jl b/src/aarch64/aesni.jl index 3238ee1..fa3c4fb 100644 --- a/src/aarch64/aesni.jl +++ b/src/aarch64/aesni.jl @@ -29,7 +29,7 @@ copy(src::AESNIKey) = copyto!(AESNIKey(), src) Assistant function for AES128. Originally compiled for x86 from the C++ source code: ```cpp R123_STATIC_INLINE __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2) { - uint64x2 temp3; + __m128i temp3; temp2 = _mm_shuffle_epi32 (temp2 ,0xff); temp3 = _mm_slli_si128 (temp1, 0x4); temp1 = _mm_xor_si128 (temp1, temp3); From 4e4b57ecfb8158671a53c92b5596d2c841e738e2 Mon Sep 17 00:00:00 2001 From: Laine Taffin Altman Date: Sat, 17 Feb 2024 22:19:09 -0800 Subject: [PATCH 12/12] Minor compilation efficiency thing Doesn't affect runtime behavior --- src/aarch64/aesni_common.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl index 3739760..020cee6 100644 --- a/src/aarch64/aesni_common.jl +++ b/src/aarch64/aesni_common.jl @@ -23,7 +23,7 @@ else uint64x2((VecElement(hi), VecElement(lo))) end -@inline Base.zero(::Type{uint64x2}) = convert(uint64x2, 0) +@inline Base.zero(::Type{uint64x2}) = convert(uint64x2, zero(UInt128)) @inline Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64)) @inline Base.xor(a::uint64x2, b::uint64x2) = llvmcall( """%3 = xor <2 x i64> %1, %0 @@ -63,7 +63,7 @@ end return uint8x16(bytes_vec) end -@inline Base.zero(::Type{uint8x16}) = convert(uint8x16, 0) +@inline Base.zero(::Type{uint8x16}) = convert(uint8x16, zero(UInt128)) @inline Base.xor(a::uint8x16, b::uint8x16) = llvmcall( """%3 = xor <16 x i8> %1, %0 ret <16 x i8> %3""", @@ -99,7 +99,7 @@ end return uint32x4(bytes_vec) end -@inline Base.zero(::Type{uint32x4}) = convert(uint32x4, 0) +@inline Base.zero(::Type{uint32x4}) = convert(uint32x4, zero(UInt128)) @inline Base.xor(a::uint32x4, b::uint32x4) = llvmcall( """%3 = xor <4 x i32> %1, %0 ret <4 x i32> %3""",