-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
Description
One of the fun things you can do with Cartesian is easily generate unrolled matrix multiplication algorithms. This might be particularly useful for a fixed-size array type (see also ImmutableArrays; the major difference is that ImmutableArrays generates types called Matrix2x3 etc, whereas here the size is encoded as a parameter).
Here's a short proof-of-concept demo (which takes many shortcuts in the interest of brevity):
module FixedArrays
using Base.Cartesian
import Base: A_mul_B!
type FixedArray{T,N,SZ,L} <: DenseArray{T,N}
data::NTuple{L,T}
end
function fixedvector{T}(::Type{T}, len::Integer, data)
length(data) == len || error("The vector length $len is inconsistent with $(length(data)) elements")
FixedArray{T,1,(int(len),),int(len)}(ntuple(length(x), i->convert(T, data[i]))) # needs to be faster than this
end
fixedvector(data) = fixedvector(typeof(data[1]), length(data), data)
function fixedmatrix{T}(::Type{T}, cols::Integer, rows::Integer, data)
rows*cols == length(data) || error("The number $(length(data)) of elements supplied is not equal to $(cols*rows)")
FixedArray{T,2,(int(cols),int(rows)),int(rows)*int(cols)}(ntuple(length(data), i->convert(T, data[i])))
end
fixedmatrix(data::AbstractMatrix) = fixedmatrix(eltype(data), size(data,1), size(data,2), data)
# For now, output is an array to bypass the slow FixedArray constructor
for N=1:4, K=1:4, M=1:4
@eval begin
function A_mul_B_FA!{TA,TB}(dest::AbstractMatrix, A::FixedArray{TA,2,($M,$K)}, B::FixedArray{TB,2,($K,$N)})
TP = typeof(one(TA)*one(TB) + one(TA)*one(TB))
@nexprs $K d2->(@nexprs $M d1->A_d1_d2 = A.data[(d2-1)*$M+d1])
@nexprs $N d2->(@nexprs $K d1->B_d1_d2 = B.data[(d2-1)*$K+d1])
@nexprs $N n->(@nexprs $M m->begin
tmp = zero(TP)
@nexprs $K k->(tmp += A_m_k * B_k_n)
dest[m,n] = tmp
end)
dest
end
end
end
end
and the results in a session:
julia> AA = rand(2,2)
2x2 Array{Float64,2}:
0.73037 0.525699
0.581208 0.973336
julia> BB = rand(2,3)
2x3 Array{Float64,2}:
0.527632 0.0111126 0.701888
0.840294 0.00627326 0.450177
julia> include("FixedArrays.jl")
julia> A = FixedArrays.fixedmatrix(AA);
julia> B = FixedArrays.fixedmatrix(BB);
julia> C = Array(Float64, 2, 3);
julia> FixedArrays.A_mul_B_FA!(C, A, B)
2x3 Array{Float64,2}:
0.827108 0.0114142 0.749295
1.12455 0.0125647 0.846116
julia> AA*BB
2x3 Array{Float64,2}:
0.827108 0.0114142 0.749295
1.12455 0.0125647 0.846116
julia> @time (for i = 1:10^5; FixedArrays.A_mul_B_FA!(C, A, B); end)
elapsed time: 0.004008817 seconds (0 bytes allocated)
julia> @time (for i = 1:10^5; FixedArrays.A_mul_B_FA!(C, A, B); end)
elapsed time: 0.004217504 seconds (0 bytes allocated)
julia> @time (for i = 1:10^5; A_mul_B!(C, AA, BB); end)
elapsed time: 0.099574707 seconds (3856 bytes allocated)
julia> @time (for i = 1:10^5; A_mul_B!(C, AA, BB); end)
elapsed time: 0.098794949 seconds (0 bytes allocated)
Because of the use of globals, this 20-fold improvement may even underestimate the advantage this would confer.
Do we want this? If so, I recommend it as a GSoC project. It would actually be quite a lot of work, especially to get these working in conjunction with more traditional array types---the number of required methods is not small and indeed somewhat scary.
For the record, here's what the generated code looks like for the case of 2x2 multiplication (look Ma, no loops!):
julia> using Base.Cartesian
julia> M = K = N = 2
2
julia> macroexpand(:(function A_mul_B_FA!{TA,TB}(dest::AbstractMatrix, A::FixedArray{TA,2,($M,$K)}, B::FixedArray{TB,2,($K,$N)})
TP = typeof(one(TA)*one(TB) + one(TA)*one(TB))
@nexprs $K d2->(@nexprs $M d1->A_d1_d2 = A.data[(d2-1)*$M+d1])
@nexprs $N d2->(@nexprs $K d1->B_d1_d2 = B.data[(d2-1)*$K+d1])
@nexprs $N n->(@nexprs $M m->begin
tmp = zero(TP)
@nexprs $K k->(tmp += A_m_k * B_k_n)
dest[m,n] = tmp
end)
dest
end))
:(function A_mul_B_FA!{TA,TB}(dest::AbstractMatrix,A::FixedArray{TA,2,(2,2)},B::FixedArray{TB,2,(2,2)}) # none, line 2:
TP = typeof(one(TA) * one(TB) + one(TA) * one(TB)) # line 3:
begin
begin
A_1_1 = A.data[1]
A_2_1 = A.data[2]
end
begin
A_1_2 = A.data[3]
A_2_2 = A.data[4]
end
end # line 4:
begin
begin
B_1_1 = B.data[1]
B_2_1 = B.data[2]
end
begin
B_1_2 = B.data[3]
B_2_2 = B.data[4]
end
end # line 5:
begin
begin
begin # none, line 6:
tmp = zero(TP) # line 7:
begin
tmp += A_1_1 * B_1_1
tmp += A_1_2 * B_2_1
end # line 8:
dest[1,1] = tmp
end
begin # none, line 6:
tmp = zero(TP) # line 7:
begin
tmp += A_2_1 * B_1_1
tmp += A_2_2 * B_2_1
end # line 8:
dest[2,1] = tmp
end
end
begin
begin # none, line 6:
tmp = zero(TP) # line 7:
begin
tmp += A_1_1 * B_1_2
tmp += A_1_2 * B_2_2
end # line 8:
dest[1,2] = tmp
end
begin # none, line 6:
tmp = zero(TP) # line 7:
begin
tmp += A_2_1 * B_1_2
tmp += A_2_2 * B_2_2
end # line 8:
dest[2,2] = tmp
end
end
end # line 10:
dest
end)