From 9a3743c7c444cbc264d8ba0417c7bd7106d2665a Mon Sep 17 00:00:00 2001
From: Sean Bowe <ewillbefull@gmail.com>
Date: Mon, 3 Apr 2017 21:41:38 -0600
Subject: [PATCH] Various improvements to BLS implementation:

* Switch from rayon to crossbeam
* Allow window tables to be reused across batch exponentiations
* Allow batch_baseexp to take its scalars by value
* Allow access to thread-local engine context
* Allow cloning of Engine
* Clean up wNAF abstractions to reduce heap allocation
---
 Cargo.toml                     |   3 +-
 src/curves/bls381/ec.rs        |  21 +++-
 src/curves/bls381/fp.rs        |   2 +
 src/curves/bls381/mod.rs       | 201 ++++++++++++++++++---------------
 src/curves/bls381/tests/mod.rs |   4 +-
 src/curves/mod.rs              |  63 +++++++++--
 src/curves/tests/mod.rs        |  12 +-
 7 files changed, 201 insertions(+), 105 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 31092f6..85aaa76 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,9 +10,10 @@ version = "0.0.1"
 
 [dependencies]
 rand = "0.3.*"
-rayon = "0.6.*"
 byteorder = "1.*"
 serde = "0.9.*"
+crossbeam = "0.2"
+num_cpus = "1.0"
 
 [dev-dependencies.bincode]
 git = "https://github.com/TyOverby/bincode.git"
diff --git a/src/curves/bls381/ec.rs b/src/curves/bls381/ec.rs
index 0820b9d..f0a7652 100644
--- a/src/curves/bls381/ec.rs
+++ b/src/curves/bls381/ec.rs
@@ -26,11 +26,13 @@ macro_rules! curve_impl {
             z: $basefield
         }
 
+        #[derive(Clone)]
         struct $params_name {
             zero: $name,
             one: $name,
             coeff_b: $basefield,
-            windows: Vec<usize>
+            windows: Vec<usize>,
+            batch_windows: (usize, Vec<usize>)
         }
 
         impl Convert<$name_affine, $engine> for $name {
@@ -130,6 +132,23 @@ macro_rules! curve_impl {
                 None
             }
 
+            fn optimal_window_batch(&self, engine: &$engine, scalars: usize) -> WindowTable<$engine, $name, Vec<$name>> {
+                let mut window = engine.$params_field.batch_windows.0;
+
+                for i in &engine.$params_field.batch_windows.1 {
+                    if scalars >= *i {
+                        window += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                let mut table = WindowTable::new();
+                table.set_base(engine, self, window);
+
+                table
+            }
+
             fn zero(engine: &$engine) -> Self {
                 engine.$params_field.zero
             }
diff --git a/src/curves/bls381/fp.rs b/src/curves/bls381/fp.rs
index ee4a45d..0612e1a 100644
--- a/src/curves/bls381/fp.rs
+++ b/src/curves/bls381/fp.rs
@@ -11,6 +11,7 @@ macro_rules! fp_params_impl {
         modulus_minus_1_over_2 = $modulus_minus_1_over_2:expr,
         inv = $inv:expr
     ) => {
+        #[derive(Clone)]
         struct $params_name {
             modulus: [u64; $limbs],
             r1: $name,
@@ -57,6 +58,7 @@ macro_rules! fp_params_impl {
         t_plus_1_over_2 = $t_plus_1_over_2:expr,
         inv = $inv:expr
     ) => {
+        #[derive(Clone)]
         struct $params_name {
             modulus: [u64; $limbs],
             r1: $name,
diff --git a/src/curves/bls381/mod.rs b/src/curves/bls381/mod.rs
index 05e843a..7a2a8ee 100644
--- a/src/curves/bls381/mod.rs
+++ b/src/curves/bls381/mod.rs
@@ -3,6 +3,7 @@ use std::fmt;
 
 use std::borrow::Borrow;
 use super::{
+    WindowTable,
     Engine,
     Group,
     GroupAffine,
@@ -690,6 +691,7 @@ impl G2Prepared {
     }
 }
 
+#[derive(Clone)]
 pub struct Bls381 {
     fqparams: FqParams,
     frparams: FrParams,
@@ -724,6 +726,12 @@ impl Engine for Bls381 {
     type G1 = G1;
     type G2 = G2;
 
+    fn with<R, F: for<'a> FnOnce(&'a Self) -> R>(cb: F) -> R {
+        ENGINE.with(|e| {
+            cb(e)
+        })
+    }
+
     fn new() -> Bls381 {
         let mut tmp = Bls381 {
             fqparams: FqParams::partial_init(),
@@ -732,13 +740,15 @@ impl Engine for Bls381 {
                 zero: G1 { x: Fq::zero(), y: Fq::zero(), z: Fq::zero() },
                 one: G1 { x: Fq::zero(), y: Fq::zero(), z: Fq::zero() },
                 coeff_b: Fq::zero(),
-                windows: vec![11, 35, 110]
+                windows: vec![11, 35, 110],
+                batch_windows: (4, vec![2, 3, 10, 20, 53, 111, 266, 426, 1273, 4742, 6054, 6054, 6054])
             },
             g2params: G2Params {
                 zero: G2 { x: Fq2::zero(), y: Fq2::zero(), z: Fq2::zero() },
                 one: G2 { x: Fq2::zero(), y: Fq2::zero(), z: Fq2::zero() },
                 coeff_b: Fq2::zero(),
-                windows: vec![11, 35, 114]
+                windows: vec![11, 35, 114],
+                batch_windows: (4, vec![2, 4, 10, 29, 54, 120, 314, 314, 314, 314])
             },
             frobenius_coeff_fq2: [Fq::zero(); 2],
             frobenius_coeff_fq6_c1: [Fq2::zero(); 6],
@@ -999,36 +1009,58 @@ impl Engine for Bls381 {
         f
     }
 
-    fn batch_baseexp<G: Group<Self>>(&self, base: &G, s: &[Fr]) -> Vec<G::Affine>
+    fn batch_baseexp<G: Group<Self>, S: AsRef<[Self::Fr]>>(&self, table: &WindowTable<Self, G, Vec<G>>, s: S) -> Vec<G::Affine>
     {
-        // TODO: pick an optimal window size based on number of elements and
-        // considering the exact group
-        const WINDOW_SIZE_BASE: usize = 18;
+        use crossbeam;
+        use num_cpus;
+
+        let s = s.as_ref();
+        let mut ret = vec![G::zero(self).to_affine(self); s.len()];
 
-        use rayon::prelude::*;
+        crossbeam::scope(|scope| {
+            let chunk = (s.len() / num_cpus::get()) + 1;
 
-        let mut table = vec![];
-        window_table(self, WINDOW_SIZE_BASE, base, &mut table);
+            for (s, b) in s.chunks(chunk).zip(ret.chunks_mut(chunk)) {
+                let mut table = table.shared();
 
-        s.par_iter().map(|s| {
-            let mut b = G::zero(self);
-            windowed_exp(self, WINDOW_SIZE_BASE, &table, &mut b, &s.into_repr(self));
-            b.to_affine(self)
-        }).collect()
+                scope.spawn(move || {
+                    for (s, b) in s.iter().zip(b.iter_mut()) {
+                        let mut tmp = G::zero(self);
+                        table.exp(self, &mut tmp, s.into_repr(self));
+                        *b = tmp.to_affine(self);
+                    }
+                });
+            }
+        });
+
+        ret
     }
 
-    fn multiexp<G: Group<Self>>(&self, g: &[G::Affine], s: &[Fr]) -> G {
-        use rayon::prelude::*;
-        use rayon::par_iter::zip::ZipIter;
-
-        return ZipIter::new(
-            g.par_chunks((g.len() / 32) + 1),
-            s.par_chunks((g.len() / 32) + 1)
-        ).map(|(g, s)| {
-            multiexp_inner::<G>(self, g, s)
-        }).reduce(|| G::zero(self), |mut a, b| {
-            a.add_assign(self, &b);
-            a
+    fn multiexp<G: Group<Self>>(&self, g: &[G::Affine], s: &[Fr]) -> Result<G, ()> {
+        if g.len() != s.len() {
+            return Err(());
+        }
+
+        use crossbeam;
+        use num_cpus;
+
+        return crossbeam::scope(|scope| {
+            let mut threads = vec![];
+
+            let chunk = (s.len() / num_cpus::get()) + 1;
+
+            for (g, s) in g.chunks(chunk).zip(s.chunks(chunk)) {
+                threads.push(scope.spawn(move || {
+                    multiexp_inner(self, g, s)
+                }));
+            }
+
+            let mut acc = G::zero(self);
+            for t in threads {
+                acc.add_assign(self, &t.join());
+            }
+
+            Ok(acc)
         });
 
         fn multiexp_inner<G: Group<Bls381>>(engine: &Bls381, g: &[G::Affine], s: &[Fr]) -> G
@@ -1132,7 +1164,7 @@ impl Engine for Bls381 {
                 });
             }
 
-            let mut table_space = vec![];
+            let mut table = WindowTable::new();
 
             while let Some(mut greatest) = heap.pop() {
                 {
@@ -1140,7 +1172,7 @@ impl Engine for Bls381 {
                     if second_greatest.is_none() || greatest.justexp(second_greatest.unwrap()) {
                         // Either this is the last value or multiplying is considered more efficient than
                         // rewriting and reinsertion into the heap.
-                        opt_exp(engine, &mut elements[greatest.index], &greatest.value, &mut table_space);
+                        opt_exp(engine, &mut elements[greatest.index], greatest.value, &mut table);
                         result.add_assign(engine, &elements[greatest.index]);
                         continue;
                     } else {
@@ -1164,87 +1196,72 @@ impl Engine for Bls381 {
     }
 }
 
-// Converts a scalar into wNAF form based on given window size.
-// TODO: instead of a function, and allocating a vector, create a smart
-// iterator.
-fn wnaf(e: &Bls381, window: usize, s: &<Fr as PrimeField<Bls381>>::Repr) -> Vec<i64>
-{
-    use std::default::Default;
-    let mut res = Vec::with_capacity(Fr::num_bits(e) + 1);
-    let mut c = *s;
+impl<G: Group<Bls381>, B: Borrow<[G]>> WindowTable<Bls381, G, B> {
+    fn exp(&mut self, e: &Bls381, into: &mut G, mut c: <Fr as PrimeField<Bls381>>::Repr) {
+        assert!(self.window > 1);
 
-    let mut tmp = <Fr as PrimeField<Bls381>>::Repr::default();
+        self.wnaf.truncate(0);
+        self.wnaf.reserve(Fr::num_bits(e) + 1);
 
-    while !c.iter().all(|&e| e==0) {
-        let mut u;
-        if fr_arith::odd(&c) {
-            u = (c[0] % (1 << (window+1))) as i64;
+        // Convert the scalar `c` into wNAF form.
+        {
+            use std::default::Default;
+            let mut tmp = <Fr as PrimeField<Bls381>>::Repr::default();
 
-            if u > (1 << window) {
-                u -= 1 << (window+1);
-            }
+            while !c.iter().all(|&e| e==0) {
+                let mut u;
+                if fr_arith::odd(&c) {
+                    u = (c[0] % (1 << (self.window+1))) as i64;
+
+                    if u > (1 << self.window) {
+                        u -= 1 << (self.window+1);
+                    }
+
+                    if u > 0 {
+                        tmp[0] = u as u64;
+                        fr_arith::sub_noborrow(&mut c, &tmp);
+                    } else {
+                        tmp[0] = (-u) as u64;
+                        fr_arith::add_nocarry(&mut c, &tmp);
+                    }
+                } else {
+                    u = 0;
+                }
+
+                self.wnaf.push(u);
 
-            if u > 0 {
-                tmp[0] = u as u64;
-                fr_arith::sub_noborrow(&mut c, &tmp);
-            } else {
-                tmp[0] = (-u) as u64;
-                fr_arith::add_nocarry(&mut c, &tmp);
+                fr_arith::div2(&mut c);
             }
-        } else {
-            u = 0;
         }
 
-        res.push(u);
+        // Perform wNAF exponentiation.
+        *into = G::zero(e);
 
-        fr_arith::div2(&mut c);
-    }
+        for n in self.wnaf.iter().rev() {
+            into.double(e);
 
-    res
+            if *n != 0 {
+                if *n > 0 {
+                    into.add_assign(e, &self.table.borrow()[(n/2) as usize]);
+                } else {
+                    into.sub_assign(e, &self.table.borrow()[((-n)/2) as usize]);
+                }
+            }
+        }
+    }
 }
 
 // Performs optimal exponentiation
-fn opt_exp<G: Group<Bls381>>(e: &Bls381, base: &mut G, scalar: &<Fr as PrimeField<Bls381>>::Repr, table: &mut Vec<G>)
+fn opt_exp<G: Group<Bls381>>(e: &Bls381, base: &mut G, scalar: <Fr as PrimeField<Bls381>>::Repr, table: &mut WindowTable<Bls381, G, Vec<G>>)
 {
-    let bits = fr_arith::num_bits(scalar);
+    let bits = fr_arith::num_bits(&scalar);
     match G::optimal_window(e, bits) {
         Some(window) => {
-            window_table(e, window, base, table);
-            windowed_exp(e, window, &table, base, scalar);
+            table.set_base(e, base, window);
+            table.exp(e, base, scalar);
         },
         None => {
-            base.mul_assign(e, scalar);
-        }
-    }
-}
-
-fn window_table<G: Group<Bls381>>(e: &Bls381, window: usize, base: &G, table: &mut Vec<G>)
-{
-    table.truncate(0);
-
-    let mut tmp = *base;
-    let mut dbl = tmp;
-    dbl.double(e);
-
-    for _ in 0..(1 << (window-1)) {
-        table.push(tmp);
-        tmp.add_assign(e, &dbl);
-    }
-}
-
-fn windowed_exp<G: Group<Bls381>>(e: &Bls381, window: usize, table: &[G], base: &mut G, scalar: &<Fr as PrimeField<Bls381>>::Repr)
-{
-    *base = G::zero(e);
-
-    for n in wnaf(e, window, scalar).into_iter().rev() {
-        base.double(e);
-
-        if n != 0 {
-            if n > 0 {
-                base.add_assign(e, &table[(n/2) as usize]);
-            } else {
-                base.sub_assign(e, &table[((-n)/2) as usize]);
-            }
+            base.mul_assign(e, &scalar);
         }
     }
 }
diff --git a/src/curves/bls381/tests/mod.rs b/src/curves/bls381/tests/mod.rs
index befe4b7..99aca36 100644
--- a/src/curves/bls381/tests/mod.rs
+++ b/src/curves/bls381/tests/mod.rs
@@ -12,12 +12,12 @@ fn test_vectors<E: Engine, G: Group<E>>(e: &E, expected: &[u8]) {
         {
             let acc = acc.to_affine(e);
             let exp: <G::Affine as GroupAffine<E, G>>::Uncompressed =
-                bincode::deserialize_from(&mut expected_reader, bincode::SizeLimit::Infinite).unwrap();
+                bincode::deserialize_from(&mut expected_reader, bincode::Infinite).unwrap();
 
             assert!(acc == exp.to_affine(e).unwrap());
 
             let acc = acc.to_uncompressed(e);
-            bincode::serialize_into(&mut bytes, &acc, bincode::SizeLimit::Infinite).unwrap();
+            bincode::serialize_into(&mut bytes, &acc, bincode::Infinite).unwrap();
         }
         acc.double(e);
         acc.add_assign(e, &G::one(e));
diff --git a/src/curves/mod.rs b/src/curves/mod.rs
index f4c4309..21b93b7 100644
--- a/src/curves/mod.rs
+++ b/src/curves/mod.rs
@@ -1,11 +1,14 @@
 use rand;
 use std::fmt;
 
+use std::ops::Deref;
+use std::borrow::Borrow;
+use std::marker::PhantomData;
 use serde::{Serialize, Deserialize};
 
 pub mod bls381;
 
-pub trait Engine: Sized
+pub trait Engine: Sized + Clone
 {
     type Fq: PrimeField<Self>;
     type Fr: SnarkField<Self>;
@@ -16,6 +19,9 @@ pub trait Engine: Sized
 
     fn new() -> Self;
 
+    /// Invoke the given closure with a reference to the thread-local engine instance and return its result.
+    fn with<R, F: for<'a> FnOnce(&'a Self) -> R>(F) -> R;
+
     fn pairing<G1, G2>(&self, p: &G1, q: &G2) -> Self::Fqk
         where G1: Convert<<Self::G1 as Group<Self>>::Affine, Self>,
               G2: Convert<<Self::G2 as Group<Self>>::Affine, Self>
@@ -34,8 +40,9 @@ pub trait Engine: Sized
                                )>;
     fn final_exponentiation(&self, &Self::Fqk) -> Self::Fqk;
 
-    fn multiexp<G: Group<Self>>(&self, &[G::Affine], &[Self::Fr]) -> G;
-    fn batch_baseexp<G: Group<Self>>(&self, base: &G, scalars: &[Self::Fr]) -> Vec<G::Affine>;
+    /// Perform multi-exponentiation. `g` and `s` must have the same length; a mismatch yields `Err(())`.
+    fn multiexp<G: Group<Self>>(&self, g: &[G::Affine], s: &[Self::Fr]) -> Result<G, ()>;
+    fn batch_baseexp<G: Group<Self>, S: AsRef<[Self::Fr]>>(&self, table: &WindowTable<Self, G, Vec<G>>, scalars: S) -> Vec<G::Affine>;
 }
 
 pub trait Group<E: Engine>: Sized +
@@ -67,6 +74,7 @@ pub trait Group<E: Engine>: Sized +
     fn mul_assign<S: Convert<[u64], E>>(&mut self, &E, other: &S);
 
     fn optimal_window(&E, scalar_bits: usize) -> Option<usize>;
+    fn optimal_window_batch(&self, &E, scalars: usize) -> WindowTable<E, Self, Vec<Self>>;
 }
 
 pub trait GroupAffine<E: Engine, G: Group<E>>: Copy +
@@ -163,7 +171,7 @@ pub trait SqrtField<E: Engine>: Field<E>
 pub trait PrimeField<E: Engine>: SqrtField<E> + Convert<[u64], E>
 {
     /// Little endian representation of a field element.
-    type Repr: Convert<[u64], E>;
+    type Repr: Convert<[u64], E> + Eq + Clone;
     fn from_u64(&E, u64) -> Self;
     fn from_str(&E, s: &str) -> Result<Self, ()>;
     fn from_repr(&E, Self::Repr) -> Result<Self, ()>;
@@ -196,6 +204,50 @@ pub struct BitIterator<T> {
     n: usize
 }
 
+pub struct WindowTable<E, G, Table: Borrow<[G]>> {
+    table: Table,
+    wnaf: Vec<i64>,
+    window: usize,
+    _marker: PhantomData<(E, G)>
+}
+
+impl<E: Engine, G: Group<E>> WindowTable<E, G, Vec<G>> {
+    fn new() -> Self {
+        WindowTable {
+            table: vec![],
+            wnaf: vec![],
+            window: 0,
+            _marker: PhantomData
+        }
+    }
+
+    fn set_base(&mut self, e: &E, base: &G, window: usize) {
+        assert!(window > 1);
+
+        self.window = window;
+        self.table.truncate(0);
+        self.table.reserve(1 << (window-1));
+
+        let mut tmp = *base;
+        let mut dbl = tmp;
+        dbl.double(e);
+
+        for _ in 0..(1 << (window-1)) {
+            self.table.push(tmp);
+            tmp.add_assign(e, &dbl);
+        }
+    }
+
+    fn shared(&self) -> WindowTable<E, G, &[G]> {
+        WindowTable {
+            table: &self.table[..],
+            wnaf: vec![],
+            window: self.window,
+            _marker: PhantomData
+        }
+    }
+}
+
 impl<T: AsRef<[u64]>> Iterator for BitIterator<T> {
     type Item = bool;
 
@@ -224,9 +276,6 @@ impl<'a> From<&'a [u64]> for BitIterator<&'a [u64]>
     }
 }
 
-use std::ops::Deref;
-use std::borrow::Borrow;
-
 pub enum Cow<'a, T: 'a> {
     Owned(T),
     Borrowed(&'a T)
diff --git a/src/curves/tests/mod.rs b/src/curves/tests/mod.rs
index cced874..dec568f 100644
--- a/src/curves/tests/mod.rs
+++ b/src/curves/tests/mod.rs
@@ -24,7 +24,7 @@ fn test_multiexp<E: Engine, G: Group<E>>(e: &E) {
         let s: Vec<E::Fr> = (0..1000).map(|_| E::Fr::random(e, rng)).collect();
 
         let naive = naiveexp::<E, G>(e, &g, &s);
-        let multi = e.multiexp::<G>(&g, &s);
+        let multi = e.multiexp::<G>(&g, &s).unwrap();
 
         assert!(naive.is_equal(e, &multi));
         assert!(multi.is_equal(e, &naive));
@@ -36,11 +36,19 @@ fn test_multiexp<E: Engine, G: Group<E>>(e: &E) {
         let s = vec![E::Fr::from_str(e, "3435973836800000000000000000000000").unwrap(), E::Fr::from_str(e, "3435973836700000000000000000000000").unwrap()];
 
         let naive = naiveexp::<E, G>(e, &g, &s);
-        let multi = e.multiexp::<G>(&g, &s);
+        let multi = e.multiexp::<G>(&g, &s).unwrap();
 
         assert!(naive.is_equal(e, &multi));
         assert!(multi.is_equal(e, &naive));
     }
+
+    {
+        let rng = &mut rand::thread_rng();
+        let s = vec![E::Fr::one(e); 100];
+        let g = vec![G::random(e, rng).to_affine(e); 101];
+
+        assert!(e.multiexp::<G>(&g, &s).is_err());
+    }
 }
 
 fn test_bilinearity<E: Engine>(e: &E) {