diff --git a/libff/CMakeLists.txt b/libff/CMakeLists.txt
index 4995b754..586571be 100755
--- a/libff/CMakeLists.txt
+++ b/libff/CMakeLists.txt
@@ -74,6 +74,18 @@ install(
   TARGETS ff DESTINATION lib
 )
 
+add_executable(
+  multiexp_profile
+  EXCLUDE_FROM_ALL  # profiling tool: kept out of the default "all" target, build it explicitly
+
+  algebra/scalar_multiplication/multiexp_profile.cpp
+)
+target_link_libraries(
+  multiexp_profile
+
+  ff
+)
+
 # Tests
 add_executable(
   algebra_bilinearity_test
diff --git a/libff/algebra/curves/alt_bn128/alt_bn128_g1.cpp b/libff/algebra/curves/alt_bn128/alt_bn128_g1.cpp
index cd5bb778..214e96c2 100755
--- a/libff/algebra/curves/alt_bn128/alt_bn128_g1.cpp
+++ b/libff/algebra/curves/alt_bn128/alt_bn128_g1.cpp
@@ -496,8 +496,7 @@ std::istream& operator>>(std::istream& in, std::vector<alt_bn128_G1> &v)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<alt_bn128_G1>(std::vector<alt_bn128_G1> &vec)
+void alt_bn128_G1::batch_to_special_all_non_zeros(std::vector<alt_bn128_G1> &vec)
 {
     std::vector<alt_bn128_Fq> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/alt_bn128/alt_bn128_g1.hpp b/libff/algebra/curves/alt_bn128/alt_bn128_g1.hpp
index 46726ac1..bfeed20f 100755
--- a/libff/algebra/curves/alt_bn128/alt_bn128_g1.hpp
+++ b/libff/algebra/curves/alt_bn128/alt_bn128_g1.hpp
@@ -70,6 +70,8 @@ class alt_bn128_G1 {
 
     friend std::ostream& operator<<(std::ostream &out, const alt_bn128_G1 &g);
     friend std::istream& operator>>(std::istream &in, alt_bn128_G1 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<alt_bn128_G1> &vec);
 };
 
 template<mp_size_t m>
@@ -87,10 +89,5 @@ alt_bn128_G1 operator*(const Fp_model<m,modulus_p> &lhs, const alt_bn128_G1 &rhs
 std::ostream& operator<<(std::ostream& out, const std::vector<alt_bn128_G1> &v);
 std::istream& operator>>(std::istream& in, std::vector<alt_bn128_G1> &v);
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<alt_bn128_G1>(std::vector<alt_bn128_G1> &vec);
-
 } // libff
 #endif // ALT_BN128_G1_HPP_
diff --git a/libff/algebra/curves/alt_bn128/alt_bn128_g2.cpp b/libff/algebra/curves/alt_bn128/alt_bn128_g2.cpp
index 8887347e..4e8c9944 100755
--- a/libff/algebra/curves/alt_bn128/alt_bn128_g2.cpp
+++ b/libff/algebra/curves/alt_bn128/alt_bn128_g2.cpp
@@ -477,8 +477,7 @@ std::istream& operator>>(std::istream &in, alt_bn128_G2 &g)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<alt_bn128_G2>(std::vector<alt_bn128_G2> &vec)
+void alt_bn128_G2::batch_to_special_all_non_zeros(std::vector<alt_bn128_G2> &vec)
 {
     std::vector<alt_bn128_Fq2> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/alt_bn128/alt_bn128_g2.hpp b/libff/algebra/curves/alt_bn128/alt_bn128_g2.hpp
index 54bd6a46..2b628ebf 100755
--- a/libff/algebra/curves/alt_bn128/alt_bn128_g2.hpp
+++ b/libff/algebra/curves/alt_bn128/alt_bn128_g2.hpp
@@ -74,6 +74,8 @@ class alt_bn128_G2 {
 
     friend std::ostream& operator<<(std::ostream &out, const alt_bn128_G2 &g);
     friend std::istream& operator>>(std::istream &in, alt_bn128_G2 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<alt_bn128_G2> &vec);
 };
 
 template<mp_size_t m>
@@ -88,10 +90,6 @@ alt_bn128_G2 operator*(const Fp_model<m,modulus_p> &lhs, const alt_bn128_G2 &rhs
     return scalar_mul<alt_bn128_G2, m>(rhs, lhs.as_bigint());
 }
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<alt_bn128_G2>(std::vector<alt_bn128_G2> &vec);
 
 } // libff
 #endif // ALT_BN128_G2_HPP_
diff --git a/libff/algebra/curves/bn128/bn128_g1.cpp b/libff/algebra/curves/bn128/bn128_g1.cpp
index 44685782..b8389e68 100755
--- a/libff/algebra/curves/bn128/bn128_g1.cpp
+++ b/libff/algebra/curves/bn128/bn128_g1.cpp
@@ -491,8 +491,7 @@ std::istream& operator>>(std::istream& in, std::vector<bn128_G1> &v)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<bn128_G1>(std::vector<bn128_G1> &vec)
+void bn128_G1::batch_to_special_all_non_zeros(std::vector<bn128_G1> &vec)
 {
     std::vector<bn::Fp> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/bn128/bn128_g1.hpp b/libff/algebra/curves/bn128/bn128_g1.hpp
index 5049b592..fe2c8976 100755
--- a/libff/algebra/curves/bn128/bn128_g1.hpp
+++ b/libff/algebra/curves/bn128/bn128_g1.hpp
@@ -70,6 +70,8 @@ class bn128_G1 {
 
     friend std::ostream& operator<<(std::ostream &out, const bn128_G1 &g);
     friend std::istream& operator>>(std::istream &in, bn128_G1 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<bn128_G1> &vec);
 };
 
 template<mp_size_t m>
@@ -87,10 +89,6 @@ bn128_G1 operator*(const Fp_model<m,modulus_p> &lhs, const bn128_G1 &rhs)
 std::ostream& operator<<(std::ostream& out, const std::vector<bn128_G1> &v);
 std::istream& operator>>(std::istream& in, std::vector<bn128_G1> &v);
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<bn128_G1>(std::vector<bn128_G1> &vec);
 
 } // libff
 #endif // BN128_G1_HPP_
diff --git a/libff/algebra/curves/bn128/bn128_g2.cpp b/libff/algebra/curves/bn128/bn128_g2.cpp
index 7a6c3476..cd8eb72f 100755
--- a/libff/algebra/curves/bn128/bn128_g2.cpp
+++ b/libff/algebra/curves/bn128/bn128_g2.cpp
@@ -474,8 +474,7 @@ std::istream& operator>>(std::istream &in, bn128_G2 &g)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<bn128_G2>(std::vector<bn128_G2> &vec)
+void bn128_G2::batch_to_special_all_non_zeros(std::vector<bn128_G2> &vec)
 {
     std::vector<bn::Fp2> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/bn128/bn128_g2.hpp b/libff/algebra/curves/bn128/bn128_g2.hpp
index 45e4e5c2..3b5fccc5 100755
--- a/libff/algebra/curves/bn128/bn128_g2.hpp
+++ b/libff/algebra/curves/bn128/bn128_g2.hpp
@@ -71,6 +71,8 @@ class bn128_G2 {
 
     friend std::ostream& operator<<(std::ostream &out, const bn128_G2 &g);
     friend std::istream& operator>>(std::istream &in, bn128_G2 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<bn128_G2> &vec);
 };
 
 template<mp_size_t m>
@@ -85,10 +87,5 @@ bn128_G2 operator*(const Fp_model<m, modulus_p> &lhs, const bn128_G2 &rhs)
     return scalar_mul<bn128_G2, m>(rhs, lhs.as_bigint());
 }
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<bn128_G2>(std::vector<bn128_G2> &vec);
-
 } // libff
 #endif // BN128_G2_HPP_
diff --git a/libff/algebra/curves/edwards/edwards_g1.cpp b/libff/algebra/curves/edwards/edwards_g1.cpp
index 9db382a1..d7749ada 100755
--- a/libff/algebra/curves/edwards/edwards_g1.cpp
+++ b/libff/algebra/curves/edwards/edwards_g1.cpp
@@ -388,8 +388,7 @@ std::istream& operator>>(std::istream& in, std::vector<edwards_G1> &v)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<edwards_G1>(std::vector<edwards_G1> &vec)
+void edwards_G1::batch_to_special_all_non_zeros(std::vector<edwards_G1> &vec)
 {
     std::vector<edwards_Fq> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/edwards/edwards_g1.hpp b/libff/algebra/curves/edwards/edwards_g1.hpp
index 33ee926c..91cc4d18 100755
--- a/libff/algebra/curves/edwards/edwards_g1.hpp
+++ b/libff/algebra/curves/edwards/edwards_g1.hpp
@@ -72,6 +72,8 @@ class edwards_G1 {
 
     friend std::ostream& operator<<(std::ostream &out, const edwards_G1 &g);
     friend std::istream& operator>>(std::istream &in, edwards_G1 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<edwards_G1> &vec);
 };
 
 template<mp_size_t m>
@@ -89,10 +91,5 @@ edwards_G1 operator*(const Fp_model<m,modulus_p> &lhs, const edwards_G1 &rhs)
 std::ostream& operator<<(std::ostream& out, const std::vector<edwards_G1> &v);
 std::istream& operator>>(std::istream& in, std::vector<edwards_G1> &v);
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<edwards_G1>(std::vector<edwards_G1> &vec);
-
 } // libff
 #endif // EDWARDS_G1_HPP_
diff --git a/libff/algebra/curves/edwards/edwards_g2.cpp b/libff/algebra/curves/edwards/edwards_g2.cpp
index 118a23d4..923a5451 100755
--- a/libff/algebra/curves/edwards/edwards_g2.cpp
+++ b/libff/algebra/curves/edwards/edwards_g2.cpp
@@ -389,10 +389,7 @@ std::istream& operator>>(std::istream &in, edwards_G2 &g)
     return in;
 }
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<edwards_G2>(std::vector<edwards_G2> &vec)
+void edwards_G2::batch_to_special_all_non_zeros(std::vector<edwards_G2> &vec)
 {
     std::vector<edwards_Fq3> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/edwards/edwards_g2.hpp b/libff/algebra/curves/edwards/edwards_g2.hpp
index b6390b0b..09d2319d 100755
--- a/libff/algebra/curves/edwards/edwards_g2.hpp
+++ b/libff/algebra/curves/edwards/edwards_g2.hpp
@@ -78,6 +78,8 @@ class edwards_G2 {
 
     friend std::ostream& operator<<(std::ostream &out, const edwards_G2 &g);
     friend std::istream& operator>>(std::istream &in, edwards_G2 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<edwards_G2> &vec);
 };
 
 template<mp_size_t m>
@@ -92,10 +94,5 @@ edwards_G2 operator*(const Fp_model<m, modulus_p> &lhs, const edwards_G2 &rhs)
    return scalar_mul<edwards_G2, m>(rhs, lhs.as_bigint());
 }
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<edwards_G2>(std::vector<edwards_G2> &vec);
-
 } // libff
 #endif // EDWARDS_G2_HPP_
diff --git a/libff/algebra/curves/mnt/mnt4/mnt4_g1.cpp b/libff/algebra/curves/mnt/mnt4/mnt4_g1.cpp
index d0ef8e0c..ef061caa 100755
--- a/libff/algebra/curves/mnt/mnt4/mnt4_g1.cpp
+++ b/libff/algebra/curves/mnt/mnt4/mnt4_g1.cpp
@@ -482,8 +482,7 @@ std::istream& operator>>(std::istream& in, std::vector<mnt4_G1> &v)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<mnt4_G1>(std::vector<mnt4_G1> &vec)
+void mnt4_G1::batch_to_special_all_non_zeros(std::vector<mnt4_G1> &vec)
 {
     std::vector<mnt4_Fq> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/mnt/mnt4/mnt4_g1.hpp b/libff/algebra/curves/mnt/mnt4/mnt4_g1.hpp
index 7dcf9832..a4dfc7e7 100755
--- a/libff/algebra/curves/mnt/mnt4/mnt4_g1.hpp
+++ b/libff/algebra/curves/mnt/mnt4/mnt4_g1.hpp
@@ -82,6 +82,8 @@ class mnt4_G1 {
 
     friend std::ostream& operator<<(std::ostream &out, const mnt4_G1 &g);
     friend std::istream& operator>>(std::istream &in, mnt4_G1 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<mnt4_G1> &vec);
 };
 
 template<mp_size_t m>
@@ -99,11 +101,6 @@ mnt4_G1 operator*(const Fp_model<m,modulus_p> &lhs, const mnt4_G1 &rhs)
 std::ostream& operator<<(std::ostream& out, const std::vector<mnt4_G1> &v);
 std::istream& operator>>(std::istream& in, std::vector<mnt4_G1> &v);
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<mnt4_G1>(std::vector<mnt4_G1> &vec);
-
 } // libff
 
 #endif // MNT4_G1_HPP_
diff --git a/libff/algebra/curves/mnt/mnt4/mnt4_g2.cpp b/libff/algebra/curves/mnt/mnt4/mnt4_g2.cpp
index 4d3856e2..757bba37 100755
--- a/libff/algebra/curves/mnt/mnt4/mnt4_g2.cpp
+++ b/libff/algebra/curves/mnt/mnt4/mnt4_g2.cpp
@@ -473,8 +473,7 @@ std::istream& operator>>(std::istream &in, mnt4_G2 &g)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<mnt4_G2>(std::vector<mnt4_G2> &vec)
+void mnt4_G2::batch_to_special_all_non_zeros(std::vector<mnt4_G2> &vec)
 {
     std::vector<mnt4_Fq2> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/mnt/mnt4/mnt4_g2.hpp b/libff/algebra/curves/mnt/mnt4/mnt4_g2.hpp
index fb794e34..9d403214 100755
--- a/libff/algebra/curves/mnt/mnt4/mnt4_g2.hpp
+++ b/libff/algebra/curves/mnt/mnt4/mnt4_g2.hpp
@@ -87,6 +87,8 @@ class mnt4_G2 {
 
     friend std::ostream& operator<<(std::ostream &out, const mnt4_G2 &g);
     friend std::istream& operator>>(std::istream &in, mnt4_G2 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<mnt4_G2> &vec);
 };
 
 template<mp_size_t m>
@@ -101,11 +103,6 @@ mnt4_G2 operator*(const Fp_model<m,modulus_p> &lhs, const mnt4_G2 &rhs)
     return scalar_mul<mnt4_G2, m>(rhs, lhs.as_bigint());
 }
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<mnt4_G2>(std::vector<mnt4_G2> &vec);
-
 } // libff
 
 #endif // MNT4_G2_HPP_
diff --git a/libff/algebra/curves/mnt/mnt6/mnt6_g1.cpp b/libff/algebra/curves/mnt/mnt6/mnt6_g1.cpp
index 3c719786..aea73e4d 100755
--- a/libff/algebra/curves/mnt/mnt6/mnt6_g1.cpp
+++ b/libff/algebra/curves/mnt/mnt6/mnt6_g1.cpp
@@ -481,8 +481,7 @@ std::istream& operator>>(std::istream& in, std::vector<mnt6_G1> &v)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<mnt6_G1>(std::vector<mnt6_G1> &vec)
+void mnt6_G1::batch_to_special_all_non_zeros(std::vector<mnt6_G1> &vec)
 {
     std::vector<mnt6_Fq> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/mnt/mnt6/mnt6_g1.hpp b/libff/algebra/curves/mnt/mnt6/mnt6_g1.hpp
index 59ec8755..a2a08ea7 100755
--- a/libff/algebra/curves/mnt/mnt6/mnt6_g1.hpp
+++ b/libff/algebra/curves/mnt/mnt6/mnt6_g1.hpp
@@ -82,6 +82,8 @@ class mnt6_G1 {
 
     friend std::ostream& operator<<(std::ostream &out, const mnt6_G1 &g);
     friend std::istream& operator>>(std::istream &in, mnt6_G1 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<mnt6_G1> &vec);
 };
 
 template<mp_size_t m>
@@ -99,11 +101,6 @@ mnt6_G1 operator*(const Fp_model<m,modulus_p> &lhs, const mnt6_G1 &rhs)
 std::ostream& operator<<(std::ostream& out, const std::vector<mnt6_G1> &v);
 std::istream& operator>>(std::istream& in, std::vector<mnt6_G1> &v);
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<mnt6_G1>(std::vector<mnt6_G1> &vec);
-
 } // libff
 
 #endif // MNT6_G1_HPP_
diff --git a/libff/algebra/curves/mnt/mnt6/mnt6_g2.cpp b/libff/algebra/curves/mnt/mnt6/mnt6_g2.cpp
index 76103e50..1bcc4dd3 100755
--- a/libff/algebra/curves/mnt/mnt6/mnt6_g2.cpp
+++ b/libff/algebra/curves/mnt/mnt6/mnt6_g2.cpp
@@ -480,8 +480,7 @@ std::istream& operator>>(std::istream &in, mnt6_G2 &g)
     return in;
 }
 
-template<>
-void batch_to_special_all_non_zeros<mnt6_G2>(std::vector<mnt6_G2> &vec)
+void mnt6_G2::batch_to_special_all_non_zeros(std::vector<mnt6_G2> &vec)
 {
     std::vector<mnt6_Fq3> Z_vec;
     Z_vec.reserve(vec.size());
diff --git a/libff/algebra/curves/mnt/mnt6/mnt6_g2.hpp b/libff/algebra/curves/mnt/mnt6/mnt6_g2.hpp
index 80053193..2dc11e34 100755
--- a/libff/algebra/curves/mnt/mnt6/mnt6_g2.hpp
+++ b/libff/algebra/curves/mnt/mnt6/mnt6_g2.hpp
@@ -87,6 +87,8 @@ class mnt6_G2 {
 
     friend std::ostream& operator<<(std::ostream &out, const mnt6_G2 &g);
     friend std::istream& operator>>(std::istream &in, mnt6_G2 &g);
+
+    static void batch_to_special_all_non_zeros(std::vector<mnt6_G2> &vec);
 };
 
 template<mp_size_t m>
@@ -101,11 +103,6 @@ mnt6_G2 operator*(const Fp_model<m,modulus_p> &lhs, const mnt6_G2 &rhs)
     return scalar_mul<mnt6_G2, m>(rhs, lhs.as_bigint());
 }
 
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-template<>
-void batch_to_special_all_non_zeros<mnt6_G2>(std::vector<mnt6_G2> &vec);
-
 } // libff
 
 #endif // MNT6_G2_HPP_
diff --git a/libff/algebra/scalar_multiplication/multiexp.hpp b/libff/algebra/scalar_multiplication/multiexp.hpp
index 1838f20e..bbbdb040 100755
--- a/libff/algebra/scalar_multiplication/multiexp.hpp
+++ b/libff/algebra/scalar_multiplication/multiexp.hpp
@@ -12,52 +12,77 @@
 #ifndef MULTIEXP_HPP_
 #define MULTIEXP_HPP_
 
+#include <cstddef>
 #include <vector>
 
 namespace libff {
 
-/**
- * Naive multi-exponentiation individually multiplies each base by the
- * corresponding scalar and adds up the results.
- */
-template<typename T, typename FieldT>
-T naive_exp(typename std::vector<T>::const_iterator vec_start,
-            typename std::vector<T>::const_iterator vec_end,
-            typename std::vector<FieldT>::const_iterator scalar_start,
-            typename std::vector<FieldT>::const_iterator scalar_end);
-
-template<typename T, typename FieldT>
-T naive_plain_exp(typename std::vector<T>::const_iterator vec_start,
-                  typename std::vector<T>::const_iterator vec_end,
-                  typename std::vector<FieldT>::const_iterator scalar_start,
-                  typename std::vector<FieldT>::const_iterator scalar_end);
+enum multi_exp_method { // selects the algorithm used by multi_exp (template argument Method)
+ /**
+  * Naive multi-exponentiation individually multiplies each base by the
+  * corresponding scalar and adds up the results.
+  * multi_exp_method_naive uses opt_window_wnaf_exp for exponentiation,
+  * while multi_exp_method_naive_plain uses operator *.
+  */
+ multi_exp_method_naive,
+ multi_exp_method_naive_plain,
+ /**
+  * A variant of the Bos-Coster algorithm [1],
+  * with implementation suggestions from [2].
+  *
+  * [1] = Bos and Coster, "Addition chain heuristics", CRYPTO '89
+  * [2] = Bernstein, Duif, Lange, Schwabe, and Yang, "High-speed high-security signatures", CHES '11
+  */
+ multi_exp_method_bos_coster,
+ /**
+  * A special case of Pippenger's algorithm from Page 15 of
+  * Bernstein, Doumen, Lange, Oosterwijk,
+  * "Faster batch forgery identification", INDOCRYPT 2012
+  * (https://eprint.iacr.org/2012/549.pdf)
+  * When compiled with USE_MIXED_ADDITION, assumes input is in special form.
+  * Requires that T implements .dbl() (and, if USE_MIXED_ADDITION is defined,
+  * .to_special(), .mixed_add(), and static batch_to_special_all_non_zeros()).
+  */
+ multi_exp_method_BDLO12
+};
 
 /**
- * Naive multi-exponentiation uses a variant of the Bos-Coster algorithm [1],
- * and implementation suggestions from [2].
- *
- * [1] = Bos and Coster, "Addition chain heuristics", CRYPTO '89
- * [2] = Bernstein, Duif, Lange, Schwabe, and Yang, "High-speed high-security signatures", CHES '11
+ * Computes the sum
+ * \sum_i scalar_start[i] * vec_start[i]
+ * using the selected method.
+ * Input is split into the given number of chunks, and, when compiled with
+ * MULTICORE, the chunks are processed in parallel.
  */
-template<typename T, typename FieldT>
+template<typename T, typename FieldT, multi_exp_method Method>
 T multi_exp(typename std::vector<T>::const_iterator vec_start,
             typename std::vector<T>::const_iterator vec_end,
             typename std::vector<FieldT>::const_iterator scalar_start,
             typename std::vector<FieldT>::const_iterator scalar_end,
-            const size_t chunks,
-            const bool use_multiexp=false);
+            const size_t chunks);
 
 
 /**
- * A variant of multi_exp that takes advantage of the method mixed_add (instead of the operator '+').
+ * A variant of multi_exp that takes advantage of the method mixed_add (instead
+ * of the operator '+').
+ * Assumes input is in special form, and includes special pre-processing for
+ * scalars equal to 0 or 1.
  */
-template<typename T, typename FieldT>
+template<typename T, typename FieldT, multi_exp_method Method>
 T multi_exp_with_mixed_addition(typename std::vector<T>::const_iterator vec_start,
-                                  typename std::vector<T>::const_iterator vec_end,
-                                  typename std::vector<FieldT>::const_iterator scalar_start,
-                                  typename std::vector<FieldT>::const_iterator scalar_end,
-                                  const size_t chunks,
-                                  const bool use_multiexp);
+                                typename std::vector<T>::const_iterator vec_end,
+                                typename std::vector<FieldT>::const_iterator scalar_start,
+                                typename std::vector<FieldT>::const_iterator scalar_end,
+                                const size_t chunks);
+
+/**
+ * A convenience function for calculating a pure inner product, where the
+ * more complicated methods are not required.
+ */
+template <typename T>
+T inner_product(typename std::vector<T>::const_iterator a_start,
+                typename std::vector<T>::const_iterator a_end,
+                typename std::vector<T>::const_iterator b_start,
+                typename std::vector<T>::const_iterator b_end);
 
 /**
  * A window table stores window sizes for different instance sizes for fixed-base multi-scalar multiplications.
@@ -98,10 +123,6 @@ std::vector<T> batch_exp_with_coeff(const size_t scalar_size,
                                     const FieldT &coeff,
                                     const std::vector<FieldT> &v);
 
-// defined in every curve
-template<typename T>
-void batch_to_special_all_non_zeros(std::vector<T> &vec);
-
 template<typename T>
 void batch_to_special(std::vector<T> &vec);
 
diff --git a/libff/algebra/scalar_multiplication/multiexp.tcc b/libff/algebra/scalar_multiplication/multiexp.tcc
index 9ffc4422..69eb184d 100755
--- a/libff/algebra/scalar_multiplication/multiexp.tcc
+++ b/libff/algebra/scalar_multiplication/multiexp.tcc
@@ -18,7 +18,9 @@
 #include <cassert>
 #include <type_traits>
 
+#include <libff/algebra/fields/bigint.hpp>
 #include <libff/algebra/fields/fp_aux.tcc>
+#include <libff/algebra/scalar_multiplication/multiexp.hpp>
 #include <libff/algebra/scalar_multiplication/wnaf.hpp>
 #include <libff/common/profiling.hpp>
 #include <libff/common/utils.hpp>
@@ -102,11 +104,26 @@ public:
     }
 };
 
-template<typename T, typename FieldT>
-T naive_exp(typename std::vector<T>::const_iterator vec_start,
-            typename std::vector<T>::const_iterator vec_end,
-            typename std::vector<FieldT>::const_iterator scalar_start,
-            typename std::vector<FieldT>::const_iterator scalar_end)
+/**
+ * multi_exp_inner<T, FieldT, Method>() implements the specified
+ * multiexponentiation method.
+ * this implementation relies on some rather arcane template magic:
+ * function templates cannot be partially specialized, so we cannot just write
+ *     template<typename T, typename FieldT>
+ *     T multi_exp_inner<T, FieldT, multi_exp_method_naive>
+ * thus we resort to using std::enable_if. the basic idea is that *overloading*
+ * is what's actually happening here, it's just that, for any given value of
+ * Method, only one of the templates will be valid, and thus the correct
+ * implementation will be used.
+ */
+
+template<typename T, typename FieldT, multi_exp_method Method,
+    typename std::enable_if<(Method == multi_exp_method_naive), int>::type = 0>
+T multi_exp_inner(
+    typename std::vector<T>::const_iterator vec_start,
+    typename std::vector<T>::const_iterator vec_end,
+    typename std::vector<FieldT>::const_iterator scalar_start,
+    typename std::vector<FieldT>::const_iterator scalar_end)
 {
     T result(T::zero());
 
@@ -123,11 +140,13 @@ T naive_exp(typename std::vector<T>::const_iterator vec_start,
     return result;
 }
 
-template<typename T, typename FieldT>
-T naive_plain_exp(typename std::vector<T>::const_iterator vec_start,
-                  typename std::vector<T>::const_iterator vec_end,
-                  typename std::vector<FieldT>::const_iterator scalar_start,
-                  typename std::vector<FieldT>::const_iterator scalar_end)
+template<typename T, typename FieldT, multi_exp_method Method,
+    typename std::enable_if<(Method == multi_exp_method_naive_plain), int>::type = 0>
+T multi_exp_inner(
+    typename std::vector<T>::const_iterator vec_start,
+    typename std::vector<T>::const_iterator vec_end,
+    typename std::vector<FieldT>::const_iterator scalar_start,
+    typename std::vector<FieldT>::const_iterator scalar_end)
 {
     T result(T::zero());
 
@@ -143,17 +162,132 @@ T naive_plain_exp(typename std::vector<T>::const_iterator vec_start,
     return result;
 }
 
-/*
-  The multi-exponentiation algorithm below is a variant of the Bos-Coster algorithm
-  [Bos and Coster, "Addition chain heuristics", CRYPTO '89].
-  The implementation uses suggestions from
-  [Bernstein, Duif, Lange, Schwabe, and Yang, "High-speed high-security signatures", CHES '11].
-*/
-template<typename T, typename FieldT>
-T multi_exp_inner(typename std::vector<T>::const_iterator vec_start,
-                  typename std::vector<T>::const_iterator vec_end,
-                  typename std::vector<FieldT>::const_iterator scalar_start,
-                  typename std::vector<FieldT>::const_iterator scalar_end)
+/**
+ * Bucket method (multi_exp_method_BDLO12): returns \sum_i exponents[i] * bases[i],
+ * or T::zero() if no term contributes. With USE_MIXED_ADDITION, bases must be special.
+ */
+template<typename T, typename FieldT, multi_exp_method Method,
+    typename std::enable_if<(Method == multi_exp_method_BDLO12), int>::type = 0>
+T multi_exp_inner(
+    typename std::vector<T>::const_iterator bases,
+    typename std::vector<T>::const_iterator bases_end,
+    typename std::vector<FieldT>::const_iterator exponents,
+    typename std::vector<FieldT>::const_iterator exponents_end)
+{
+    UNUSED(exponents_end);
+    size_t length = bases_end - bases;
+
+    // empirically, this seems to be a decent estimate of the optimal value of c
+    size_t log2_length = log2(length);
+    size_t c = log2_length - log2_length / 3 + 2;
+    const size_t num_buckets = size_t(1) << c;
+
+    const mp_size_t exp_num_limbs = std::remove_reference<decltype(*exponents)>::type::num_limbs;
+    std::vector<bigint<exp_num_limbs> > bn_exponents(length);
+    size_t num_bits = 0;
+
+    for (size_t i = 0; i < length; i++)
+    {
+        bn_exponents[i] = exponents[i].as_bigint();
+        num_bits = std::max(num_bits, bn_exponents[i].num_bits());
+    }
+
+    size_t num_groups = (num_bits + c - 1) / c;
+
+    // Identity start value: returned as-is for empty input or all-zero exponents.
+    T result = T::zero();
+    bool result_nonzero = false;
+
+    for (size_t k = num_groups; k-- > 0; )
+    {
+        // Shift the accumulated result up by one c-bit window before adding window k.
+        for (size_t i = 0; result_nonzero && i < c; i++)
+        {
+            result = result.dbl();
+        }
+
+        std::vector<T> buckets(num_buckets);
+        std::vector<bool> bucket_nonzero(num_buckets);
+
+        for (size_t i = 0; i < length; i++)
+        {
+            size_t id = 0;
+            for (size_t j = 0; j < c; j++)
+            {
+                id |= size_t(bn_exponents[i].test_bit(k*c + j) ? 1 : 0) << j;
+            }
+
+            if (id == 0)
+            {
+                continue;
+            }
+
+            if (bucket_nonzero[id])
+            {
+#ifdef USE_MIXED_ADDITION
+                buckets[id] = buckets[id].mixed_add(bases[i]);
+#else
+                buckets[id] = buckets[id] + bases[i];
+#endif
+            }
+            else
+            {
+                buckets[id] = bases[i];
+                bucket_nonzero[id] = true;
+            }
+        }
+
+#ifdef USE_MIXED_ADDITION
+        batch_to_special(buckets);
+#endif
+
+        T running_sum;
+        bool running_sum_nonzero = false;
+
+        for (size_t i = num_buckets - 1; i > 0; i--)
+        {
+            if (bucket_nonzero[i])
+            {
+                if (running_sum_nonzero)
+                {
+#ifdef USE_MIXED_ADDITION
+                    running_sum = running_sum.mixed_add(buckets[i]);
+#else
+                    running_sum = running_sum + buckets[i];
+#endif
+                }
+                else
+                {
+                    running_sum = buckets[i];
+                    running_sum_nonzero = true;
+                }
+            }
+
+            if (running_sum_nonzero)
+            {
+                if (result_nonzero)
+                {
+                    result = result + running_sum;
+                }
+                else
+                {
+                    result = running_sum;
+                    result_nonzero = true;
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+template<typename T, typename FieldT, multi_exp_method Method,
+    typename std::enable_if<(Method == multi_exp_method_bos_coster), int>::type = 0>
+T multi_exp_inner(
+    typename std::vector<T>::const_iterator vec_start,
+    typename std::vector<T>::const_iterator vec_end,
+    typename std::vector<FieldT>::const_iterator scalar_start,
+    typename std::vector<FieldT>::const_iterator scalar_end)
 {
     const mp_size_t n = std::remove_reference<decltype(*scalar_start)>::type::num_limbs;
 
@@ -265,49 +399,35 @@ T multi_exp_inner(typename std::vector<T>::const_iterator vec_start,
     return opt_result;
 }
 
-template<typename T, typename FieldT>
+template<typename T, typename FieldT, multi_exp_method Method>
 T multi_exp(typename std::vector<T>::const_iterator vec_start,
             typename std::vector<T>::const_iterator vec_end,
             typename std::vector<FieldT>::const_iterator scalar_start,
             typename std::vector<FieldT>::const_iterator scalar_end,
-            const size_t chunks,
-            const bool use_multiexp)
+            const size_t chunks)
 {
     const size_t total = vec_end - vec_start;
-    if (total < chunks)
+    if ((total < chunks) || (chunks == 1))
     {
-        return naive_exp<T, FieldT>(vec_start, vec_end, scalar_start, scalar_end);
+        // no need to split into "chunks", can call implementation directly
+        return multi_exp_inner<T, FieldT, Method>(
+            vec_start, vec_end, scalar_start, scalar_end);
     }
 
     const size_t one = total/chunks;
 
     std::vector<T> partial(chunks, T::zero());
 
-    if (use_multiexp)
-    {
 #ifdef MULTICORE
 #pragma omp parallel for
 #endif
-        for (size_t i = 0; i < chunks; ++i)
-        {
-            partial[i] = multi_exp_inner<T, FieldT>(vec_start + i*one,
-                                                    (i == chunks-1 ? vec_end : vec_start + (i+1)*one),
-                                                    scalar_start + i*one,
-                                                    (i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
-        }
-    }
-    else
+    for (size_t i = 0; i < chunks; ++i)
     {
-#ifdef MULTICORE
-#pragma omp parallel for
-#endif
-        for (size_t i = 0; i < chunks; ++i)
-        {
-            partial[i] = naive_exp<T, FieldT>(vec_start + i*one,
-                                              (i == chunks-1 ? vec_end : vec_start + (i+1)*one),
-                                              scalar_start + i*one,
-                                              (i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
-        }
+        partial[i] = multi_exp_inner<T, FieldT, Method>(
+             vec_start + i*one,
+             (i == chunks-1 ? vec_end : vec_start + (i+1)*one),
+             scalar_start + i*one,
+             (i == chunks-1 ? scalar_end : scalar_start + (i+1)*one));
     }
 
     T final = T::zero();
@@ -320,13 +440,12 @@ T multi_exp(typename std::vector<T>::const_iterator vec_start,
     return final;
 }
 
-template<typename T, typename FieldT>
+template<typename T, typename FieldT, multi_exp_method Method>
 T multi_exp_with_mixed_addition(typename std::vector<T>::const_iterator vec_start,
                                 typename std::vector<T>::const_iterator vec_end,
                                 typename std::vector<FieldT>::const_iterator scalar_start,
                                 typename std::vector<FieldT>::const_iterator scalar_end,
-                                const size_t chunks,
-                                const bool use_multiexp)
+                                const size_t chunks)
 {
     assert(std::distance(vec_start, vec_end) == std::distance(scalar_start, scalar_end));
     enter_block("Process scalar vector");
@@ -373,7 +492,18 @@ T multi_exp_with_mixed_addition(typename std::vector<T>::const_iterator vec_star
 
     leave_block("Process scalar vector");
 
-    return acc + multi_exp<T, FieldT>(g.begin(), g.end(), p.begin(), p.end(), chunks, use_multiexp);
+    return acc + multi_exp<T, FieldT, Method>(g.begin(), g.end(), p.begin(), p.end(), chunks);
+}
+
+// Inner product of [a_start, a_end) with [b_start, b_end): forwards to the
+// multi_exp_method_naive_plain variant of multi_exp using a single chunk.
+template <typename T>
+T inner_product(typename std::vector<T>::const_iterator a_start,
+                typename std::vector<T>::const_iterator a_end,
+                typename std::vector<T>::const_iterator b_start,
+                typename std::vector<T>::const_iterator b_end)
+{
+    return multi_exp<T, T, multi_exp_method_naive_plain>(a_start, a_end, b_start, b_end, 1);
+}
 
 template<typename T>
@@ -564,7 +694,7 @@ void batch_to_special(std::vector<T> &vec)
         }
     }
 
-    batch_to_special_all_non_zeros<T>(non_zero_vec);
+    T::batch_to_special_all_non_zeros(non_zero_vec);
     auto it = non_zero_vec.begin();
     T zero_special = T::zero();
     zero_special.to_special();
diff --git a/libff/algebra/scalar_multiplication/multiexp_profile.cpp b/libff/algebra/scalar_multiplication/multiexp_profile.cpp
new file mode 100644
index 00000000..0434bb25
--- /dev/null
+++ b/libff/algebra/scalar_multiplication/multiexp_profile.cpp
@@ -0,0 +1,128 @@
+#include <cstdio>
+#include <vector>
+
+#include <libff/algebra/curves/bn128/bn128_pp.hpp>
+#include <libff/algebra/scalar_multiplication/multiexp.hpp>
+#include <libff/common/profiling.hpp>
+#include <libff/common/rng.hpp>
+
+using namespace libff;
+
+// Outcome of one profiling run: (elapsed time, one answer per test instance).
+template <typename GroupT> using run_result_t = std::pair<long long, std::vector<GroupT> >;
+
+// A batch of test instances; every instance is itself a vector of T.
+template <typename T> using test_instances_t = std::vector<std::vector<T> >;
+
+// Builds `count` test instances holding `size` group elements each.
+// Generating a random group element is expensive, so each instance draws a
+// single random element and repeats it `size` times rather than drawing
+// `size` independent ones.
+template<typename GroupT>
+test_instances_t<GroupT> generate_group_elements(size_t count, size_t size)
+{
+    test_instances_t<GroupT> result(count);
+
+    for (std::vector<GroupT> &instance : result) {
+        GroupT x = GroupT::random_element();
+        x.to_special(); // djb requires input to be in special form
+        // assign() sizes the vector once; no regrowth from repeated push_back
+        instance.assign(size, x);
+    }
+
+    return result;
+}
+
+// Builds `count` instances of `size` scalars; element j of instance i is
+// SHA512_rng<FieldT>(i * size + j), much faster than FieldT::random_element().
+template<typename FieldT>
+test_instances_t<FieldT> generate_scalars(size_t count, size_t size)
+{
+    test_instances_t<FieldT> result(count);
+    for (size_t i = 0; i < count; i++) {
+        result[i].reserve(size); // one allocation instead of repeated regrowth
+        for (size_t j = 0; j < size; j++) {
+            result[i].push_back(SHA512_rng<FieldT>(i * size + j));
+        }
+    }
+
+    return result;
+}
+
+// Runs multi_exp<GroupT, FieldT, Method> (one chunk) on every instance and returns
+// (elapsed get_nsec_time() delta, answers). Inputs are taken by const reference:
+// by-value parameters deep-copied every instance on each call.
+template<typename GroupT, typename FieldT, multi_exp_method Method>
+run_result_t<GroupT> profile_multiexp(
+    const test_instances_t<GroupT> &group_elements,
+    const test_instances_t<FieldT> &scalars)
+{
+    std::vector<GroupT> answers;
+    answers.reserve(group_elements.size()); // allocate before the clock starts
+    const long long start_time = get_nsec_time();
+    for (size_t i = 0; i < group_elements.size(); i++) {
+        answers.push_back(multi_exp<GroupT, FieldT, Method>(
+            group_elements[i].cbegin(), group_elements[i].cend(),
+            scalars[i].cbegin(), scalars[i].cend(), 1));
+    }
+    const long long time_delta = get_nsec_time() - start_time;
+    return run_result_t<GroupT>(time_delta, answers);
+}
+
+// Prints one tab-separated row per exponent in [expn_start, expn_end_fast]: the
+// exponent, then the time reported for bos_coster and BDLO12 (plus naive while
+// expn <= expn_end_naive). compare_answers reports result mismatches on stderr.
+template<typename GroupT, typename FieldT>
+void print_performance_csv(
+    size_t expn_start,
+    size_t expn_end_fast,
+    size_t expn_end_naive,
+    bool compare_answers)
+{
+    for (size_t expn = expn_start; expn <= expn_end_fast; expn++) {
+        printf("%zu", expn); fflush(stdout); // %zu is the size_t conversion
+        const size_t instance_size = static_cast<size_t>(1) << expn; // size_t shift: no int overflow
+        test_instances_t<GroupT> group_elements =
+            generate_group_elements<GroupT>(10, instance_size);
+        test_instances_t<FieldT> scalars =
+            generate_scalars<FieldT>(10, instance_size);
+
+        run_result_t<GroupT> result_bos_coster =
+            profile_multiexp<GroupT, FieldT, multi_exp_method_bos_coster>(
+                group_elements, scalars);
+        printf("\t%lld", result_bos_coster.first); fflush(stdout);
+
+        run_result_t<GroupT> result_djb =
+            profile_multiexp<GroupT, FieldT, multi_exp_method_BDLO12>(
+                group_elements, scalars);
+        printf("\t%lld", result_djb.first); fflush(stdout);
+        if (compare_answers && (result_bos_coster.second != result_djb.second)) {
+            fprintf(stderr, "Answers NOT MATCHING (bos coster != djb)\n");
+        }
+
+        if (expn <= expn_end_naive) {
+            run_result_t<GroupT> result_naive =
+                profile_multiexp<GroupT, FieldT, multi_exp_method_naive>(
+                    group_elements, scalars);
+            printf("\t%lld", result_naive.first); fflush(stdout);
+            if (compare_answers && (result_bos_coster.second != result_naive.second)) {
+                fprintf(stderr, "Answers NOT MATCHING (bos coster != naive)\n");
+            }
+        }
+        printf("\n");
+    }
+}
+
+int main(void)
+{
+    const size_t first = 2;       // smallest instances have 2^2 elements
+    const size_t last_fast = 20;  // last exponent for bos_coster / BDLO12
+    const size_t last_naive = 14; // last exponent that also runs naive
+    print_compilation_info();
+    printf("Profiling BN128_G1\n");
+    bn128_pp::init_public_params();
+    print_performance_csv<G1<bn128_pp>, Fr<bn128_pp> >(first, last_fast, last_naive, true);
+    printf("Profiling BN128_G2\n");
+    print_performance_csv<G2<bn128_pp>, Fr<bn128_pp> >(first, last_fast, last_naive, true);
+    return 0;
+}