diff --git a/Cargo.toml b/Cargo.toml index e671beb..713dac6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,11 +9,17 @@ repository = "https://github.com/ebfull/bellman" version = "0.0.3" [dependencies] -#rand = "0.3.*" -#byteorder = "1.*" -#serde = "1.0" -#crossbeam = "0.2" -#num_cpus = "1.0" +rand = "0.3" +bit-vec = "0.4.4" +futures = "0.1" +futures-cpupool = "0.1" +num_cpus = "1.6" +crossbeam = "0.3" -[dev-dependencies] -#bincode = "0.8.0" +[dependencies.pairing] +version = "0.11" +features = ["unstable-wnaf"] + +[features] +default = ["u128-support"] +u128-support = ["pairing/u128-support"] diff --git a/src/domain.rs b/src/domain.rs new file mode 100644 index 0000000..40a0f20 --- /dev/null +++ b/src/domain.rs @@ -0,0 +1,457 @@ +use pairing::*; +use super::{ + Error +}; +use crossbeam; +use num_cpus; +use multicore; + +const LARGEST_POLYNOMIAL_DEGREE: usize = 1 << 28; + +pub struct EvaluationDomain> { + coeffs: Vec, + exp: u32, + omega: E::Fr, + omegainv: E::Fr, + geninv: E::Fr, + minv: E::Fr +} + +impl> EvaluationDomain { + pub fn as_ref(&self) -> &[G] { + &self.coeffs + } + + pub fn into_coeffs(self) -> Vec { + self.coeffs + } + + pub fn as_mut(&mut self) -> &mut [G] { + &mut self.coeffs + } + + pub fn from_coeffs(mut coeffs: Vec) -> Result, Error> + { + if coeffs.len() > LARGEST_POLYNOMIAL_DEGREE { + return Err(Error::PolynomialDegreeTooLarge) + } + + let mut m = 1; + let mut exp = 0; + while m < coeffs.len() { + m *= 2; + exp += 1; + + if exp >= E::Fr::s() { + return Err(Error::PolynomialDegreeTooLarge) + } + } + + let mut omega = E::Fr::root_of_unity(); + for _ in exp..E::Fr::s() { + omega.square(); + } + + coeffs.resize(m, G::group_zero()); + + Ok(EvaluationDomain { + coeffs: coeffs, + exp: exp, + omega: omega, + omegainv: omega.inverse().unwrap(), + geninv: E::Fr::multiplicative_generator().inverse().unwrap(), + minv: E::Fr::from_str(&format!("{}", m)).unwrap().inverse().unwrap() + }) + } + + pub fn fft(&mut self) + { + best_fft(&mut self.coeffs, &self.omega, self.exp); + } + + pub fn ifft(&mut self) + { + best_fft(&mut self.coeffs, &self.omegainv, self.exp); + + multicore::scope(self.coeffs.len(), |scope, chunk| { + let minv = self.minv; + + for v in self.coeffs.chunks_mut(chunk) { + scope.spawn(move || { + for v in v { + v.group_mul_assign(&minv); + } + }); + } + }); + } + + fn mul_coset(&mut self, g: E::Fr) + { + multicore::scope(self.coeffs.len(), |scope, chunk| { + for (i, v) in self.coeffs.chunks_mut(chunk).enumerate() { + scope.spawn(move || { + let mut u = g.pow(&[(i * chunk) as u64]); + for v in v.iter_mut() { + v.group_mul_assign(&u); + u.mul_assign(&g); + } + }); + } + }); + } + + pub fn coset_fft(&mut self) + { + self.mul_coset(E::Fr::multiplicative_generator()); + self.fft(); + } + + pub fn icoset_fft(&mut self) + { + let geninv = self.geninv; + + self.ifft(); + self.mul_coset(geninv); + } + + pub fn z(&self, tau: &E::Fr) -> E::Fr { + let mut tmp = tau.pow(&[self.coeffs.len() as u64]); + tmp.sub_assign(&E::Fr::one()); + + tmp + } + + pub fn divide_by_z_on_coset(&mut self) + { + let i = self.z(&E::Fr::multiplicative_generator()).inverse().unwrap(); + + multicore::scope(self.coeffs.len(), |scope, chunk| { + for v in self.coeffs.chunks_mut(chunk) { + scope.spawn(move || { + for v in v { + v.group_mul_assign(&i); + } + }); + } + }); + } + + pub fn mul_assign(&mut self, other: &EvaluationDomain>) { + assert_eq!(self.coeffs.len(), other.coeffs.len()); + + multicore::scope(self.coeffs.len(), |scope, chunk| { + for (a, b) in 
self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) { + scope.spawn(move || { + for (a, b) in a.iter_mut().zip(b.iter()) { + a.group_mul_assign(&b.0); + } + }); + } + }); + } + + pub fn sub_assign(&mut self, other: &EvaluationDomain) { + assert_eq!(self.coeffs.len(), other.coeffs.len()); + + multicore::scope(self.coeffs.len(), |scope, chunk| { + for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) { + scope.spawn(move || { + for (a, b) in a.iter_mut().zip(b.iter()) { + a.group_sub_assign(&b); + } + }); + } + }); + } +} + +pub trait Group: Sized + Copy + Clone + Send + Sync { + fn group_zero() -> Self; + fn group_mul_assign(&mut self, by: &E::Fr); + fn group_add_assign(&mut self, other: &Self); + fn group_sub_assign(&mut self, other: &Self); +} + +pub struct Scalar(pub E::Fr); + +impl PartialEq for Scalar { + fn eq(&self, other: &Scalar) -> bool { + self.0 == other.0 + } +} + +impl Copy for Scalar { } + +impl Clone for Scalar { + fn clone(&self) -> Scalar { + *self + } +} + +impl Group for Scalar { + fn group_zero() -> Self { + Scalar(E::Fr::zero()) + } + fn group_mul_assign(&mut self, by: &E::Fr) { + self.0.mul_assign(by); + } + fn group_add_assign(&mut self, other: &Self) { + self.0.add_assign(&other.0); + } + fn group_sub_assign(&mut self, other: &Self) { + self.0.sub_assign(&other.0); + } +} + +fn get_log_cpus() -> u32 { + let num = num_cpus::get(); + log2_floor(num) +} + +fn log2_floor(num: usize) -> u32 { + assert!(num > 0); + + let mut pow = 0; + + while (1 << (pow+1)) <= num { + pow += 1; + } + + pow +} + +#[test] +fn test_log2_floor() { + assert_eq!(log2_floor(1), 0); + assert_eq!(log2_floor(2), 1); + assert_eq!(log2_floor(3), 1); + assert_eq!(log2_floor(4), 2); + assert_eq!(log2_floor(5), 2); + assert_eq!(log2_floor(6), 2); + assert_eq!(log2_floor(7), 2); + assert_eq!(log2_floor(8), 3); +} + +fn best_fft>(a: &mut [T], omega: &E::Fr, log_n: u32) +{ + let log_cpus = get_log_cpus(); + + if log_n < log_cpus { + serial_fft(a, omega, log_n); + } else { + parallel_fft(a, omega, log_n, log_cpus); + } +} + +fn serial_fft>(a: &mut [T], omega: &E::Fr, log_n: u32) +{ + fn bitreverse(mut n: u32, l: u32) -> u32 { + let mut r = 0; + for _ in 0..l { + r = (r << 1) | (n & 1); + n >>= 1; + } + r + } + + let n = a.len() as u32; + assert_eq!(n, 1 << log_n); + + for k in 0..n { + let rk = bitreverse(k, log_n); + if k < rk { + a.swap(rk as usize, k as usize); + } + } + + let mut m = 1; + for _ in 0..log_n { + let w_m = omega.pow(&[(n / (2*m)) as u64]); + + let mut k = 0; + while k < n { + let mut w = E::Fr::one(); + for j in 0..m { + let mut t = a[(k+j+m) as usize]; + t.group_mul_assign(&w); + let mut tmp = a[(k+j) as usize]; + tmp.group_sub_assign(&t); + a[(k+j+m) as usize] = tmp; + a[(k+j) as usize].group_add_assign(&t); + w.mul_assign(&w_m); + } + + k += 2*m; + } + + m *= 2; + } +} + +fn parallel_fft>(a: &mut [T], omega: &E::Fr, log_n: u32, log_cpus: u32) +{ + assert!(log_n >= log_cpus); + + let num_cpus = 1 << log_cpus; + let log_new_n = log_n - log_cpus; + let mut tmp = vec![vec![T::group_zero(); 1 << log_new_n]; num_cpus]; + let new_omega = omega.pow(&[num_cpus as u64]); + + crossbeam::scope(|scope| { + let a = &*a; + + for (j, tmp) in tmp.iter_mut().enumerate() { + scope.spawn(move || { + // Shuffle into a sub-FFT + let omega_j = omega.pow(&[j as u64]); + let omega_step = omega.pow(&[(j as u64) << log_new_n]); + + let mut elt = E::Fr::one(); + for i in 0..(1 << log_new_n) { + for s in 0..num_cpus { + let idx = (i + (s << log_new_n)) % (1 << log_n); + let mut t = 
a[idx]; + t.group_mul_assign(&elt); + tmp[i].group_add_assign(&t); + elt.mul_assign(&omega_step); + } + elt.mul_assign(&omega_j); + } + + // Perform sub-FFT + serial_fft(tmp, &new_omega, log_new_n); + }); + } + }); + + // TODO: does this hurt or help? + multicore::scope(a.len(), |scope, chunk| { + let tmp = &tmp; + + for (idx, a) in a.chunks_mut(chunk).enumerate() { + scope.spawn(move || { + let mut idx = idx * chunk; + let mask = (1 << log_cpus) - 1; + for a in a { + *a = tmp[idx & mask][idx >> log_cpus]; + idx += 1; + } + }); + } + }); +} + +// Test multiplying various (low degree) polynomials together and +// comparing with naive evaluations. +#[test] +fn polynomial_arith() { + use pairing::bls12_381::Bls12; + use rand::{self, Rand}; + + fn test_mul(rng: &mut R) + { + for coeffs_a in 0..70 { + for coeffs_b in 0..70 { + let mut a: Vec<_> = (0..coeffs_a).map(|_| Scalar::(E::Fr::rand(rng))).collect(); + let mut b: Vec<_> = (0..coeffs_b).map(|_| Scalar::(E::Fr::rand(rng))).collect(); + + // naive evaluation + let mut naive = vec![Scalar(E::Fr::zero()); coeffs_a + coeffs_b]; + for (i1, a) in a.iter().enumerate() { + for (i2, b) in b.iter().enumerate() { + let mut prod = *a; + prod.group_mul_assign(&b.0); + naive[i1 + i2].group_add_assign(&prod); + } + } + + a.resize(coeffs_a + coeffs_b, Scalar(E::Fr::zero())); + b.resize(coeffs_a + coeffs_b, Scalar(E::Fr::zero())); + + let mut a = EvaluationDomain::from_coeffs(a).unwrap(); + let mut b = EvaluationDomain::from_coeffs(b).unwrap(); + + a.fft(); + b.fft(); + a.mul_assign(&b); + a.ifft(); + + for (naive, fft) in naive.iter().zip(a.coeffs.iter()) { + assert!(naive == fft); + } + } + } + } + + let rng = &mut rand::thread_rng(); + + test_mul::(rng); +} + +#[test] +fn fft_composition() { + use pairing::bls12_381::Bls12; + use rand; + + fn test_comp(rng: &mut R) + { + for coeffs in 0..10 { + let coeffs = 1 << coeffs; + + let mut v = vec![]; + for _ in 0..coeffs { + v.push(Scalar::(rng.gen())); + } + + let mut domain = EvaluationDomain::from_coeffs(v.clone()).unwrap(); + domain.ifft(); + domain.fft(); + assert!(v == domain.coeffs); + domain.fft(); + domain.ifft(); + assert!(v == domain.coeffs); + domain.icoset_fft(); + domain.coset_fft(); + assert!(v == domain.coeffs); + domain.coset_fft(); + domain.icoset_fft(); + assert!(v == domain.coeffs); + } + } + + let rng = &mut rand::thread_rng(); + + test_comp::(rng); +} + +#[test] +fn parallel_fft_consistency() { + use pairing::bls12_381::Bls12; + use rand::{self, Rand}; + use std::cmp::min; + + fn test_consistency(rng: &mut R) + { + for _ in 0..5 { + for log_d in 0..10 { + let d = 1 << log_d; + + let v1 = (0..d).map(|_| Scalar::(E::Fr::rand(rng))).collect::>(); + let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap(); + let mut v2 = EvaluationDomain::from_coeffs(v1.coeffs.clone()).unwrap(); + + for log_cpus in 0..min(log_d, 3) { + parallel_fft(&mut v1.coeffs, &v1.omega, log_d, log_cpus); + serial_fft(&mut v2.coeffs, &v2.omega, log_d); + + assert!(v1.coeffs == v2.coeffs); + } + } + } + } + + let rng = &mut rand::thread_rng(); + + test_consistency::(rng); +} diff --git a/src/groth16/generator.rs b/src/groth16/generator.rs new file mode 100644 index 0000000..f7960bb --- /dev/null +++ b/src/groth16/generator.rs @@ -0,0 +1,448 @@ +use pairing::*; +use pairing::wnaf::*; +use ::{ + Input, + Error, + LinearCombination, + Index, + Circuit, + Variable, + ConstraintSystem, + PublicConstraintSystem +}; +use super::{VerifyingKey, Parameters}; +use domain::{Scalar, EvaluationDomain}; +use rand::Rng; +use multicore; +use 
std::sync::Arc; + +pub fn generate_random_parameters( + circuit: C, + rng: &mut R +) -> Result, Error> + where E: Engine, C: Circuit, R: Rng +{ + let g1 = rng.gen(); + let g2 = rng.gen(); + let alpha = rng.gen(); + let beta = rng.gen(); + let gamma = rng.gen(); + let delta = rng.gen(); + let tau = rng.gen(); + + generate_parameters::( + circuit, + g1, + g2, + alpha, + beta, + gamma, + delta, + tau + ) +} + +/// Create parameters for a circuit, given some trapdoors. +pub fn generate_parameters( + circuit: C, + g1: E::G1, + g2: E::G2, + alpha: E::Fr, + beta: E::Fr, + gamma: E::Fr, + delta: E::Fr, + tau: E::Fr +) -> Result, Error> + where E: Engine, C: Circuit +{ + // This is our assembly structure that we'll use to synthesize the + // circuit into a QAP. + struct KeypairAssembly { + num_inputs: usize, + num_aux: usize, + num_constraints: usize, + at_inputs: Vec>, + bt_inputs: Vec>, + ct_inputs: Vec>, + at_aux: Vec>, + bt_aux: Vec>, + ct_aux: Vec> + } + + impl PublicConstraintSystem for KeypairAssembly { + fn alloc_input Result>(&mut self, f: F) -> Result { + // In this context, we don't have an assignment. + let _ = f(); + + let index = self.num_inputs; + self.num_inputs += 1; + + self.at_inputs.push(vec![]); + self.bt_inputs.push(vec![]); + self.ct_inputs.push(vec![]); + + Ok(Variable(Index::Input(index))) + } + } + + impl ConstraintSystem for KeypairAssembly { + fn alloc Result>(&mut self, f: F) -> Result { + // In this context, we don't have an assignment. + let _ = f(); + + let index = self.num_aux; + self.num_aux += 1; + + self.at_aux.push(vec![]); + self.bt_aux.push(vec![]); + self.ct_aux.push(vec![]); + + Ok(Variable(Index::Aux(index))) + } + + fn enforce( + &mut self, + a: LinearCombination, + b: LinearCombination, + c: LinearCombination + ) + { + fn qap_eval( + l: LinearCombination, + inputs: &mut [Vec<(E::Fr, usize)>], + aux: &mut [Vec<(E::Fr, usize)>], + this_constraint: usize + ) + { + for (index, coeff) in l.0 { + match index { + Index::Input(id) => inputs[id].push((coeff, this_constraint)), + Index::Aux(id) => aux[id].push((coeff, this_constraint)) + } + } + } + + qap_eval(a, &mut self.at_inputs, &mut self.at_aux, self.num_constraints); + qap_eval(b, &mut self.bt_inputs, &mut self.bt_aux, self.num_constraints); + qap_eval(c, &mut self.ct_inputs, &mut self.ct_aux, self.num_constraints); + + self.num_constraints += 1; + } + } + + let mut assembly = KeypairAssembly { + num_inputs: 0, + num_aux: 0, + num_constraints: 0, + at_inputs: vec![], + bt_inputs: vec![], + ct_inputs: vec![], + at_aux: vec![], + bt_aux: vec![], + ct_aux: vec![] + }; + + // Allocate the "one" input variable + assembly.alloc_input(|| Ok(E::Fr::one()))?; + + // Synthesize the circuit. 
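+ // `Circuit::synthesize` returns the circuit's `InputMap`; calling `synthesize` on
+ // that result then allocates the public inputs against the same assembly.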
+ circuit.synthesize(&mut assembly)?.synthesize(&mut assembly)?; + + // Input consistency constraints: x * 0 = 0 + for i in 0..assembly.num_inputs { + assembly.enforce(LinearCombination::zero() + Variable(Index::Input(i)), + LinearCombination::zero(), + LinearCombination::zero()); + } + + // Ensure that all auxillary variables are constrained + for i in 0..assembly.num_aux { + if assembly.at_aux[i].len() == 0 && + assembly.bt_aux[i].len() == 0 && + assembly.ct_aux[i].len() == 0 + { + return Err(Error::UnconstrainedVariable(Variable(Index::Aux(i)))); + } + } + + // Create evaluation domain for the QAP + let powers_of_tau = vec![Scalar::(E::Fr::zero()); assembly.num_constraints]; + let mut powers_of_tau = EvaluationDomain::from_coeffs(powers_of_tau)?; + + // Compute G1 window table + let mut g1_table = vec![]; + let g1_table_size = E::G1::recommended_wnaf_for_num_scalars( + // H query + (powers_of_tau.as_ref().len() - 1) + // IC/L queries + + assembly.num_inputs + assembly.num_aux + // A query + + assembly.num_inputs + assembly.num_aux + // B query + + assembly.num_inputs + assembly.num_aux + ); + wnaf_table(&mut g1_table, g1, g1_table_size); + + // Compute G2 window table + let mut g2_table = vec![]; + let g2_table_size = E::G2::recommended_wnaf_for_num_scalars( + // B query + assembly.num_inputs + assembly.num_aux + ); + wnaf_table(&mut g2_table, g2, g2_table_size); + + let gamma_inverse = gamma.inverse().ok_or(Error::UnexpectedIdentity)?; + let delta_inverse = delta.inverse().ok_or(Error::UnexpectedIdentity)?; + + // Compute the H query + let mut h = vec![E::G1::zero(); powers_of_tau.as_ref().len() - 1]; + { + // Compute the powers of tau + { + let powers_of_tau = powers_of_tau.as_mut(); + multicore::scope(powers_of_tau.len(), |scope, chunk| { + for (i, powers_of_tau) in powers_of_tau.chunks_mut(chunk).enumerate() + { + scope.spawn(move || { + let mut current_tau_power = tau.pow(&[(i*chunk) as u64]); + + for p in powers_of_tau { + p.0 = current_tau_power; + current_tau_power.mul_assign(&tau); + } + }); + } + }); + } + + // coeff = t(x) / delta + let mut coeff = powers_of_tau.z(&tau); + coeff.mul_assign(&delta_inverse); + + // Compute the H query with multiple threads + multicore::scope(h.len(), |scope, chunk| { + for (h, p) in h.chunks_mut(chunk).zip(powers_of_tau.as_ref().chunks(chunk)) + { + let g1_table = &g1_table; + + scope.spawn(move || { + // Create wNAF form storage location for this thread + let mut wnaf = vec![]; + + // Set values of the H query to g1^{(tau^i * t(tau)) / delta} + for (h, p) in h.iter_mut().zip(p.iter()) + { + // Compute final exponent + let mut exp = p.0; + exp.mul_assign(&coeff); + + // Compute wNAF form of exponent + wnaf_form(&mut wnaf, exp.into_repr(), g1_table_size); + + // Exponentiate + *h = wnaf_exp(g1_table, &wnaf); + } + + // Batch normalize + E::G1::batch_normalization(h); + }); + } + }); + } + + // Use inverse FFT to convert powers of tau to Lagrange coefficients + powers_of_tau.ifft(); + let powers_of_tau = powers_of_tau.into_coeffs(); + + let mut a = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux]; + let mut b_g1 = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux]; + let mut b_g2 = vec![E::G2::zero(); assembly.num_inputs + assembly.num_aux]; + let mut ic = vec![E::G1::zero(); assembly.num_inputs]; + let mut l = vec![E::G1::zero(); assembly.num_aux]; + + fn eval( + // wNAF window tables + g1_table: &[E::G1], + g1_table_size: usize, + g2_table: &[E::G2], + g2_table_size: usize, + + // Lagrange coefficients for tau + 
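+ // (produced by the inverse FFT over the evaluation domain above)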
powers_of_tau: &[Scalar], + + // QAP polynomials + at: &[Vec<(E::Fr, usize)>], + bt: &[Vec<(E::Fr, usize)>], + ct: &[Vec<(E::Fr, usize)>], + + // Resulting evaluated QAP polynomials + a: &mut [E::G1], + b_g1: &mut [E::G1], + b_g2: &mut [E::G2], + ext: &mut [E::G1], + + // Inverse coefficient for ext elements + inv: &E::Fr, + + // Trapdoors + alpha: &E::Fr, + beta: &E::Fr + ) + { + // Sanity check + assert_eq!(a.len(), at.len()); + assert_eq!(a.len(), bt.len()); + assert_eq!(a.len(), ct.len()); + assert_eq!(a.len(), b_g1.len()); + assert_eq!(a.len(), b_g2.len()); + assert_eq!(a.len(), ext.len()); + + // Evaluate polynomials in multiple threads + multicore::scope(a.len(), |scope, chunk| { + for ((((((a, b_g1), b_g2), ext), at), bt), ct) in a.chunks_mut(chunk) + .zip(b_g1.chunks_mut(chunk)) + .zip(b_g2.chunks_mut(chunk)) + .zip(ext.chunks_mut(chunk)) + .zip(at.chunks(chunk)) + .zip(bt.chunks(chunk)) + .zip(ct.chunks(chunk)) + { + scope.spawn(move || { + // Create wNAF form storage location for this thread + let mut wnaf = vec![]; + + for ((((((a, b_g1), b_g2), ext), at), bt), ct) in a.iter_mut() + .zip(b_g1.iter_mut()) + .zip(b_g2.iter_mut()) + .zip(ext.iter_mut()) + .zip(at.iter()) + .zip(bt.iter()) + .zip(ct.iter()) + { + fn eval_at_tau( + powers_of_tau: &[Scalar], + p: &[(E::Fr, usize)] + ) -> E::Fr + { + let mut acc = E::Fr::zero(); + + for &(ref coeff, index) in p { + let mut n = powers_of_tau[index].0; + n.mul_assign(coeff); + acc.add_assign(&n); + } + + acc + } + + // Evaluate QAP polynomials at tau + let mut at = eval_at_tau(powers_of_tau, at); + let mut bt = eval_at_tau(powers_of_tau, bt); + let ct = eval_at_tau(powers_of_tau, ct); + + // Compute A query (in G1) + if !at.is_zero() { + wnaf_form(&mut wnaf, at.into_repr(), g1_table_size); + *a = wnaf_exp(&g1_table, &wnaf); + } + + // Compute B query (in G1/G2) + if !bt.is_zero() { + // Normalize the field element once + let bt_repr = bt.into_repr(); + wnaf_form(&mut wnaf, bt_repr, g1_table_size); + *b_g1 = wnaf_exp(&g1_table, &wnaf); + + // G1 window table might use the same window size + // as the G2 window table, so we wouldn't need to + // recompute the wNAF form of the exponent. + if g1_table_size != g2_table_size { + wnaf_form(&mut wnaf, bt_repr, g2_table_size); + } + *b_g2 = wnaf_exp(&g2_table, &wnaf); + } + + at.mul_assign(&beta); + bt.mul_assign(&alpha); + + let mut e = at; + e.add_assign(&bt); + e.add_assign(&ct); + e.mul_assign(inv); + + wnaf_form(&mut wnaf, e.into_repr(), g1_table_size); + *ext = wnaf_exp(&g1_table, &wnaf); + } + + // Batch normalize + E::G1::batch_normalization(a); + E::G1::batch_normalization(b_g1); + E::G2::batch_normalization(b_g2); + E::G1::batch_normalization(ext); + }); + } + }); + } + + // Evaluate for inputs. + eval( + &g1_table, + g1_table_size, + &g2_table, + g2_table_size, + &powers_of_tau, + &assembly.at_inputs, + &assembly.bt_inputs, + &assembly.ct_inputs, + &mut a[0..assembly.num_inputs], + &mut b_g1[0..assembly.num_inputs], + &mut b_g2[0..assembly.num_inputs], + &mut ic, + &gamma_inverse, + &alpha, + &beta + ); + + // Evaluate for auxillary variables. 
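+ // These become the L query; note they are scaled by 1/delta here, whereas the
+ // input elements above were scaled by 1/gamma.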
+ eval( + &g1_table, + g1_table_size, + &g2_table, + g2_table_size, + &powers_of_tau, + &assembly.at_aux, + &assembly.bt_aux, + &assembly.ct_aux, + &mut a[assembly.num_inputs..], + &mut b_g1[assembly.num_inputs..], + &mut b_g2[assembly.num_inputs..], + &mut l, + &delta_inverse, + &alpha, + &beta + ); + + let g1 = g1.into_affine(); + let g2 = g2.into_affine(); + + let vk = VerifyingKey:: { + alpha_g1: g1.mul(alpha).into_affine(), + beta_g1: g1.mul(beta).into_affine(), + beta_g2: g2.mul(beta).into_affine(), + gamma_g2: g2.mul(gamma).into_affine(), + delta_g1: g1.mul(delta).into_affine(), + delta_g2: g2.mul(delta).into_affine(), + ic: ic.into_iter().map(|e| e.into_affine()).collect() + }; + + Ok(Parameters { + vk: vk, + h: Arc::new(h.into_iter().map(|e| e.into_affine()).collect()), + l: Arc::new(l.into_iter().map(|e| e.into_affine()).collect()), + + // Filter points at infinity away from A/B queries + a: Arc::new(a.into_iter().filter(|e| !e.is_zero()).map(|e| e.into_affine()).collect()), + b_g1: Arc::new(b_g1.into_iter().filter(|e| !e.is_zero()).map(|e| e.into_affine()).collect()), + b_g2: Arc::new(b_g2.into_iter().filter(|e| !e.is_zero()).map(|e| e.into_affine()).collect()) + }) +} diff --git a/src/groth16/mod.rs b/src/groth16/mod.rs new file mode 100644 index 0000000..bf344e3 --- /dev/null +++ b/src/groth16/mod.rs @@ -0,0 +1,424 @@ +use pairing::*; +use std::sync::Arc; + +mod generator; +pub use self::generator::*; +mod prover; +pub use self::prover::*; +mod verifier; +pub use self::verifier::*; + +use ::Error; +use std::io::{self, Write, Read}; +use multiexp::{Source, SourceBuilder}; + +pub struct Proof { + a: E::G1Affine, + b: E::G2Affine, + c: E::G1Affine +} + +pub struct PreparedVerifyingKey { + alpha_g1_beta_g2: E::Fqk, + neg_gamma_g2: ::Prepared, + neg_delta_g2: ::Prepared, + ic: Vec +} + +pub struct VerifyingKey { + // alpha in g1 for verifying and for creating A/C elements of + // proof. Never the point at infinity. + alpha_g1: E::G1Affine, + + // beta in g1 and g2 for verifying and for creating B/C elements + // of proof. Never the point at infinity. + beta_g1: E::G1Affine, + beta_g2: E::G2Affine, + + // gamma in g2 for verifying. Never the point at infinity. + gamma_g2: E::G2Affine, + + // delta in g1/g2 for verifying and proving, essentially the magic + // trapdoor that forces the prover to evaluate the C element of the + // proof with only components from the CRS. Never the point at + // infinity. + delta_g1: E::G1Affine, + delta_g2: E::G2Affine, + + // Elements of the form (beta * u_i(tau) + alpha v_i(tau) + w_i(tau)) / gamma + // for all public inputs. Because all public inputs have a "soundness + // of input consistency" constraint, this is the same size as the + // number of inputs, and never contains points at infinity. 
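+ // The verifier accumulates sum_i input_i * ic_i while allocating the public
+ // inputs and uses that term in the pairing check.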
+ ic: Vec +} + +impl Clone for VerifyingKey { + fn clone(&self) -> VerifyingKey { + VerifyingKey { + alpha_g1: self.alpha_g1.clone(), + beta_g1: self.beta_g1.clone(), + beta_g2: self.beta_g2.clone(), + gamma_g2: self.gamma_g2.clone(), + delta_g1: self.delta_g1.clone(), + delta_g2: self.delta_g2.clone(), + ic: self.ic.clone() + } + } +} + +impl PartialEq for VerifyingKey { + fn eq(&self, other: &VerifyingKey) -> bool { + self.alpha_g1 == other.alpha_g1 && + self.beta_g1 == other.beta_g1 && + self.beta_g2 == other.beta_g2 && + self.gamma_g2 == other.gamma_g2 && + self.delta_g1 == other.delta_g1 && + self.delta_g2 == other.delta_g2 && + self.ic == other.ic + } +} + +fn read_nonzero(reader: &mut R) -> Result { + let mut repr = G::Uncompressed::empty(); + reader.read_exact(repr.as_mut())?; + + let affine = repr.into_affine_unchecked(); // TODO + + match affine { + Ok(affine) => { + if affine.is_zero() { + Err(Error::UnexpectedIdentity) + } else { + Ok(affine) + } + }, + Err(e) => Err(io::Error::new(io::ErrorKind::InvalidData, e).into()) + } +} + +impl VerifyingKey { + fn size(num_ic: usize) -> usize { + let mut acc = 0; + acc += ::Uncompressed::size(); // alpha_g1 + acc += ::Uncompressed::size(); // beta_g1 + acc += ::Uncompressed::size(); // delta_g1 + acc += ::Uncompressed::size() * num_ic; // IC + acc += ::Uncompressed::size(); // beta_g2 + acc += ::Uncompressed::size(); // gamma_g2 + acc += ::Uncompressed::size(); // delta_g2 + + acc + } + + pub fn write(&self, writer: &mut W) -> Result<(), io::Error> { + writer.write_all(self.alpha_g1.into_uncompressed().as_ref())?; + writer.write_all(self.beta_g1.into_uncompressed().as_ref())?; + writer.write_all(self.beta_g2.into_uncompressed().as_ref())?; + writer.write_all(self.gamma_g2.into_uncompressed().as_ref())?; + writer.write_all(self.delta_g1.into_uncompressed().as_ref())?; + writer.write_all(self.delta_g2.into_uncompressed().as_ref())?; + for ic in &self.ic { + writer.write_all(ic.into_uncompressed().as_ref())?; + } + + Ok(()) + } + + pub fn read(reader: &mut R, num_ic: usize) -> Result, Error> { + let alpha_g1 = read_nonzero(reader)?; + let beta_g1 = read_nonzero(reader)?; + let beta_g2 = read_nonzero(reader)?; + let gamma_g2 = read_nonzero(reader)?; + let delta_g1 = read_nonzero(reader)?; + let delta_g2 = read_nonzero(reader)?; + + let mut ic = vec![]; + for _ in 0..num_ic { + ic.push(read_nonzero(reader)?); + } + + Ok(VerifyingKey { + alpha_g1: alpha_g1, + beta_g1: beta_g1, + beta_g2: beta_g2, + gamma_g2: gamma_g2, + delta_g1: delta_g1, + delta_g2: delta_g2, + ic: ic + }) + } +} + +pub struct Parameters { + pub vk: VerifyingKey, + + // Elements of the form ((tau^i * t(tau)) / delta) for i between 0 and + // m-2 inclusive. Never contains points at infinity. + h: Arc>, + + // Elements of the form (beta * u_i(tau) + alpha v_i(tau) + w_i(tau)) / delta + // for all auxillary inputs. Variables can never be unconstrained, so this + // never contains points at infinity. + l: Arc>, + + // QAP "A" polynomials evaluated at tau in the Lagrange basis. Never contains + // points at infinity: polynomials that evaluate to zero are omitted from + // the CRS and the prover can deterministically skip their evaluation. + a: Arc>, + + // QAP "B" polynomials evaluated at tau in the Lagrange basis. Needed in + // G1 and G2 for C/B queries, respectively. Never contains points at + // infinity for the same reason as the "A" polynomials. 
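+ // During proving, only the entries marked in the B density trackers are paired
+ // with these bases in the multiexp.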
+ b_g1: Arc>, + b_g2: Arc> +} + +impl Parameters { + pub fn write(&self, writer: &mut W) -> Result<(), io::Error> { + self.vk.write(writer)?; + + for e in &*self.h { + writer.write_all(e.into_uncompressed().as_ref())?; + } + + for e in &*self.l { + writer.write_all(e.into_uncompressed().as_ref())?; + } + + for e in &*self.a { + writer.write_all(e.into_uncompressed().as_ref())?; + } + + for e in &*self.b_g1 { + writer.write_all(e.into_uncompressed().as_ref())?; + } + + for e in &*self.b_g2 { + writer.write_all(e.into_uncompressed().as_ref())?; + } + + Ok(()) + } +} + +pub trait ParameterSource { + type G1Builder: SourceBuilder; + type G2Builder: SourceBuilder; + + fn get_vk(&mut self, num_ic: usize) -> Result, Error>; + fn get_h(&mut self, num_h: usize) -> Result; + fn get_l(&mut self, num_l: usize) -> Result; + fn get_a(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G1Builder, Self::G1Builder), Error>; + fn get_b_g1(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G1Builder, Self::G1Builder), Error>; + fn get_b_g2(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G2Builder, Self::G2Builder), Error>; +} + +impl<'a, E: Engine> ParameterSource for &'a Parameters { + type G1Builder = (Arc>, usize); + type G2Builder = (Arc>, usize); + + fn get_vk(&mut self, num_ic: usize) -> Result, Error> { + assert_eq!(self.vk.ic.len(), num_ic); + + Ok(self.vk.clone()) + } + + fn get_h(&mut self, num_h: usize) -> Result { + assert_eq!(self.h.len(), num_h); + + Ok((self.h.clone(), 0)) + } + + fn get_l(&mut self, num_l: usize) -> Result { + assert_eq!(self.l.len(), num_l); + + Ok((self.l.clone(), 0)) + } + + fn get_a(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G1Builder, Self::G1Builder), Error> { + assert_eq!(self.a.len(), num_inputs + num_aux); + + Ok(((self.a.clone(), 0), (self.a.clone(), num_inputs))) + } + + fn get_b_g1(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G1Builder, Self::G1Builder), Error> { + assert_eq!(self.b_g1.len(), num_inputs + num_aux); + + Ok(((self.b_g1.clone(), 0), (self.b_g1.clone(), num_inputs))) + } + + fn get_b_g2(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G2Builder, Self::G2Builder), Error> { + assert_eq!(self.b_g2.len(), num_inputs + num_aux); + + Ok(((self.b_g2.clone(), 0), (self.b_g2.clone(), num_inputs))) + } +} + +use std::fs::File; +use std::io::{Seek, SeekFrom}; + +pub struct ProverStream { + path: String, + cursor: u64, + fh: Option +} + +impl Clone for ProverStream { + fn clone(&self) -> ProverStream { + ProverStream { + path: self.path.clone(), + cursor: self.cursor, + fh: None + } + } +} + +impl ProverStream { + pub fn new(path: &str) -> Result { + Ok(ProverStream { + path: path.to_string(), + cursor: 0, + fh: None + }) + } + + fn open_if_needed(&mut self) -> Result<(), Error> { + if self.fh.is_none() { + let mut fh = File::open(&self.path)?; + fh.seek(SeekFrom::Start(self.cursor))?; + + self.fh = Some(fh); + } + + Ok(()) + } +} + +impl Source for ProverStream { + fn add_assign_mixed(&mut self, to: &mut ::Projective) -> Result<(), Error> { + self.open_if_needed()?; + + let r: G = read_nonzero(self.fh.as_mut().unwrap())?; + + self.cursor += G::Uncompressed::size() as u64; + + to.add_assign_mixed(&r); + + Ok(()) + } + fn skip(&mut self, amt: usize) -> Result<(), Error> { + self.open_if_needed()?; + + let size_to_skip = amt * G::Uncompressed::size(); + + self.cursor += size_to_skip as u64; + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(size_to_skip as i64))?; + + Ok(()) 
+ } +} + +impl SourceBuilder for ProverStream { + type Source = Self; + + fn new(self) -> Self::Source { + self + } +} + +impl ParameterSource for ProverStream { + type G1Builder = ProverStream; + type G2Builder = ProverStream; + + fn get_vk(&mut self, num_ic: usize) -> Result, Error> { + self.open_if_needed()?; + + let vk = VerifyingKey::read(self.fh.as_mut().unwrap(), num_ic)?; + + self.cursor += VerifyingKey::::size(num_ic) as u64; + + Ok(vk) + } + fn get_h(&mut self, num_h: usize) -> Result { + self.open_if_needed()?; + + let res = self.clone(); + + let amount_to_seek = num_h * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + Ok(res) + } + fn get_l(&mut self, num_l: usize) -> Result { + self.open_if_needed()?; + + let res = self.clone(); + + let amount_to_seek = num_l * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + Ok(res) + } + fn get_a(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G1Builder, Self::G1Builder), Error> { + self.open_if_needed()?; + + let res1 = self.clone(); + + let amount_to_seek = num_inputs * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + let res2 = self.clone(); + + let amount_to_seek = num_aux * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + Ok((res1, res2)) + } + fn get_b_g1(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G1Builder, Self::G1Builder), Error> { + self.open_if_needed()?; + + let res1 = self.clone(); + + let amount_to_seek = num_inputs * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + let res2 = self.clone(); + + let amount_to_seek = num_aux * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + Ok((res1, res2)) + } + fn get_b_g2(&mut self, num_inputs: usize, num_aux: usize) -> Result<(Self::G2Builder, Self::G2Builder), Error> { + self.open_if_needed()?; + + let res1 = self.clone(); + + let amount_to_seek = num_inputs * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + let res2 = self.clone(); + + let amount_to_seek = num_aux * ::Uncompressed::size(); + + self.fh.as_mut().unwrap().seek(SeekFrom::Current(amount_to_seek as i64))?; + self.cursor += amount_to_seek as u64; + + Ok((res1, res2)) + } +} diff --git a/src/groth16/prover.rs b/src/groth16/prover.rs new file mode 100644 index 0000000..ddc7bd7 --- /dev/null +++ b/src/groth16/prover.rs @@ -0,0 +1,205 @@ +use pairing::*; +use domain::{Scalar, EvaluationDomain}; +use ::{ + ConstraintSystem, + PublicConstraintSystem, + Circuit, + Input, + Index, + Error, + Variable, + LinearCombination +}; +use multiexp::*; +use super::{ParameterSource, Proof}; +use rand::Rng; +use std::sync::Arc; +use futures::Future; +use futures_cpupool::CpuPool; + +pub fn create_random_proof>( + circuit: C, + params: P, + rng: &mut R +) -> Result, Error> + where E: Engine, C: Circuit, R: Rng +{ + let r = rng.gen(); + let s = rng.gen(); + + create_proof::(circuit, params, r, s) +} + +pub fn create_proof>( + circuit: C, + mut 
params: P, + r: E::Fr, + s: E::Fr +) -> Result, Error> + where E: Engine, C: Circuit +{ + struct ProvingAssignment { + // Density of queries + a_aux_density: DensityTracker, + b_input_density: DensityTracker, + b_aux_density: DensityTracker, + + // Evaluations of A, B, C polynomials + a: Vec>, + b: Vec>, + c: Vec>, + + // Assignments of variables + input_assignment: Vec, + aux_assignment: Vec + } + + impl PublicConstraintSystem for ProvingAssignment { + fn alloc_input Result>(&mut self, value: F) -> Result { + self.input_assignment.push(value()?); + self.b_input_density.add_element(); + + Ok(Variable(Index::Input(self.input_assignment.len() - 1))) + } + } + + impl ConstraintSystem for ProvingAssignment { + fn alloc Result>(&mut self, value: F) -> Result { + self.aux_assignment.push(value()?); + self.a_aux_density.add_element(); + self.b_aux_density.add_element(); + + Ok(Variable(Index::Aux(self.aux_assignment.len() - 1))) + } + + fn enforce( + &mut self, + a: LinearCombination, + b: LinearCombination, + c: LinearCombination + ) + { + self.a.push(Scalar(a.eval(None, Some(&mut self.a_aux_density), &self.input_assignment, &self.aux_assignment))); + self.b.push(Scalar(b.eval(Some(&mut self.b_input_density), Some(&mut self.b_aux_density), &self.input_assignment, &self.aux_assignment))); + self.c.push(Scalar(c.eval(None, None, &self.input_assignment, &self.aux_assignment))); + } + } + + let mut prover = ProvingAssignment { + a_aux_density: DensityTracker::new(), + b_input_density: DensityTracker::new(), + b_aux_density: DensityTracker::new(), + a: vec![], + b: vec![], + c: vec![], + input_assignment: vec![], + aux_assignment: vec![] + }; + + prover.alloc_input(|| Ok(E::Fr::one()))?; + + circuit.synthesize(&mut prover)?.synthesize(&mut prover)?; + + // Input consistency constraints: x * 0 = 0 + for i in 0..prover.input_assignment.len() { + prover.enforce(LinearCombination::zero() + Variable(Index::Input(i)), + LinearCombination::zero(), + LinearCombination::zero()); + } + + let cpupool = CpuPool::new_num_cpus(); + + let vk = params.get_vk(prover.input_assignment.len())?; + + let h = { + let mut a = EvaluationDomain::from_coeffs(prover.a)?; + let mut b = EvaluationDomain::from_coeffs(prover.b)?; + let mut c = EvaluationDomain::from_coeffs(prover.c)?; + a.ifft(); + a.coset_fft(); + b.ifft(); + b.coset_fft(); + c.ifft(); + c.coset_fft(); + + a.mul_assign(&b); + drop(b); + a.sub_assign(&c); + drop(c); + a.divide_by_z_on_coset(); + a.icoset_fft(); + let mut a = a.into_coeffs(); + let a_len = a.len() - 1; + a.truncate(a_len); + // TODO: parallelize if it's even helpful + let a = Arc::new(a.into_iter().map(|s| s.0.into_repr()).collect::>()); + + multiexp(&cpupool, params.get_h(a.len())?, FullDensity, a) + }; + + // TODO: parallelize if it's even helpful + let input_assignment = Arc::new(prover.input_assignment.into_iter().map(|s| s.into_repr()).collect::>()); + let aux_assignment = Arc::new(prover.aux_assignment.into_iter().map(|s| s.into_repr()).collect::>()); + + let l = multiexp(&cpupool, params.get_l(aux_assignment.len())?, FullDensity, aux_assignment.clone()); + + let a_aux_density_total = prover.a_aux_density.get_total_density(); + + let (a_inputs_source, a_aux_source) = params.get_a(input_assignment.len(), a_aux_density_total)?; + + let a_inputs = multiexp(&cpupool, a_inputs_source, FullDensity, input_assignment.clone()); + let a_aux = multiexp(&cpupool, a_aux_source, Arc::new(prover.a_aux_density), aux_assignment.clone()); + + let b_input_density = Arc::new(prover.b_input_density); + let 
b_input_density_total = b_input_density.get_total_density(); + let b_aux_density = Arc::new(prover.b_aux_density); + let b_aux_density_total = b_aux_density.get_total_density(); + + let (b_g1_inputs_source, b_g1_aux_source) = params.get_b_g1(b_input_density_total, b_aux_density_total)?; + + let b_g1_inputs = multiexp(&cpupool, b_g1_inputs_source, b_input_density.clone(), input_assignment.clone()); + let b_g1_aux = multiexp(&cpupool, b_g1_aux_source, b_aux_density.clone(), aux_assignment.clone()); + + let (b_g2_inputs_source, b_g2_aux_source) = params.get_b_g2(b_input_density_total, b_aux_density_total)?; + + let b_g2_inputs = multiexp(&cpupool, b_g2_inputs_source, b_input_density, input_assignment.clone()); + let b_g2_aux = multiexp(&cpupool, b_g2_aux_source, b_aux_density, aux_assignment); + + drop(input_assignment); + + let mut g_a = vk.delta_g1.mul(r); + g_a.add_assign_mixed(&vk.alpha_g1); + let mut g_b = vk.delta_g2.mul(s); + g_b.add_assign_mixed(&vk.beta_g2); + let mut g_c; + { + let mut rs = r; + rs.mul_assign(&s); + + g_c = vk.delta_g1.mul(rs); + g_c.add_assign(&vk.alpha_g1.mul(s)); + g_c.add_assign(&vk.beta_g1.mul(r)); + } + let mut a_answer = a_inputs.wait()?; + a_answer.add_assign(&a_aux.wait()?); + g_a.add_assign(&a_answer); + a_answer.mul_assign(s); + g_c.add_assign(&a_answer); + + let mut b1_answer = b_g1_inputs.wait()?; + b1_answer.add_assign(&b_g1_aux.wait()?); + let mut b2_answer = b_g2_inputs.wait()?; + b2_answer.add_assign(&b_g2_aux.wait()?); + + g_b.add_assign(&b2_answer); + b1_answer.mul_assign(r); + g_c.add_assign(&b1_answer); + g_c.add_assign(&h.wait()?); + g_c.add_assign(&l.wait()?); + + Ok(Proof { + a: g_a.into_affine(), + b: g_b.into_affine(), + c: g_c.into_affine() + }) +} diff --git a/src/groth16/verifier.rs b/src/groth16/verifier.rs new file mode 100644 index 0000000..6d23504 --- /dev/null +++ b/src/groth16/verifier.rs @@ -0,0 +1,139 @@ +use pairing::*; +use ::{ + Input, + Error, + LinearCombination, + Index, + Variable, + ConstraintSystem, + PublicConstraintSystem +}; +use super::{Proof, VerifyingKey, PreparedVerifyingKey}; + +/// This is the constraint system synthesizer that is made available to +/// callers of the verification function when they wish to perform +/// allocations. In that context, allocation of inputs is not allowed. +pub struct VerifierInput<'a, E: Engine> { + acc: E::G1, + ic: &'a [E::G1Affine], + insufficient_inputs: bool, + num_inputs: usize, + num_aux: usize +} + +impl<'a, E: Engine> ConstraintSystem for VerifierInput<'a, E> { + fn alloc Result>(&mut self, f: F) -> Result { + // Run the function for calculating the allocation but ignore the output, + // since we don't care about the assignment of auxillary variables during + // verification. + let _ = f(); + + let index = self.num_aux; + self.num_aux += 1; + + Ok(Variable(Index::Aux(index))) + } + + fn enforce( + &mut self, + _: LinearCombination, + _: LinearCombination, + _: LinearCombination + ) + { + // Do nothing; we don't care about the constraint system + // in this context. + } +} + +/// This is intended to be a wrapper around VerifierInput that is kept +/// private and used for input allocation. +struct InputAllocator(T); + +impl<'a, 'b, E: Engine> ConstraintSystem for InputAllocator<&'a mut VerifierInput<'b, E>> { + fn alloc Result>(&mut self, value: F) -> Result { + self.0.alloc(value) + } + + fn enforce( + &mut self, + _: LinearCombination, + _: LinearCombination, + _: LinearCombination + ) + { + // Do nothing; we don't care about the constraint system + // in this context. 
+ } +} + +impl<'a, 'b, E: Engine> PublicConstraintSystem for InputAllocator<&'a mut VerifierInput<'b, E>> { + fn alloc_input Result>(&mut self, value: F) -> Result { + if self.0.ic.len() == 0 { + self.0.insufficient_inputs = true; + } else { + self.0.acc.add_assign(&self.0.ic[0].mul(value()?)); + self.0.ic = &self.0.ic[1..]; + } + + let index = self.0.num_inputs; + self.0.num_inputs += 1; + + Ok(Variable(Index::Input(index))) + } +} + +pub fn verify_proof<'a, E, C, F>( + pvk: &'a PreparedVerifyingKey, + proof: &Proof, + circuit: F +) -> Result + where E: Engine, C: Input, F: FnOnce(&mut VerifierInput<'a, E>) -> Result +{ + let mut witness = VerifierInput:: { + acc: pvk.ic[0].into_projective(), + ic: &pvk.ic[1..], + insufficient_inputs: false, + num_inputs: 1, + num_aux: 0 + }; + + circuit(&mut witness)?.synthesize(&mut InputAllocator(&mut witness))?; + + if witness.ic.len() != 0 || witness.insufficient_inputs { + return Err(Error::MalformedVerifyingKey); + } + + // The original verification equation is: + // A * B = alpha * beta + inputs * gamma + C * delta + // ... however, we rearrange it so that it is: + // A * B - inputs * gamma - C * delta = alpha * beta + // or equivalently: + // A * B + inputs * (-gamma) + C * (-delta) = alpha * beta + // which allows us to do a single final exponentiation. + + Ok(E::final_exponentiation( + &E::miller_loop([ + (&proof.a.prepare(), &proof.b.prepare()), + (&witness.acc.into_affine().prepare(), &pvk.neg_gamma_g2), + (&proof.c.prepare(), &pvk.neg_delta_g2) + ].into_iter()) + ).unwrap() == pvk.alpha_g1_beta_g2) +} + +pub fn prepare_verifying_key( + vk: &VerifyingKey +) -> PreparedVerifyingKey +{ + let mut gamma = vk.gamma_g2; + gamma.negate(); + let mut delta = vk.delta_g2; + delta.negate(); + + PreparedVerifyingKey { + alpha_g1_beta_g2: E::pairing(vk.alpha_g1, vk.beta_g2), + neg_gamma_g2: gamma.prepare(), + neg_delta_g2: delta.prepare(), + ic: vk.ic.clone() + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..1aa7187 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,208 @@ +extern crate pairing; +extern crate rand; +extern crate bit_vec; +extern crate futures; +extern crate futures_cpupool; +extern crate num_cpus; +extern crate crossbeam; + +use pairing::{Engine, Field}; +use std::ops::{Add, Sub}; +use std::io; + +pub mod multicore; +pub mod domain; +pub mod groth16; + +pub mod multiexp; +// TODO: remove this from public API? 
+pub use self::multiexp::{DensityTracker, FullDensity, multiexp}; + +#[derive(Debug)] +pub enum Error { + PolynomialDegreeTooLarge, + MalformedVerifyingKey, + AssignmentMissing, + UnexpectedIdentity, + UnconstrainedVariable(Variable), + IoError(io::Error) +} + +impl From for Error { + fn from(e: io::Error) -> Error { + Error::IoError(e) + } +} + +#[derive(Copy, Clone, Debug)] +pub struct Variable(Index); + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +enum Index { + Input(usize), + Aux(usize) +} + +pub struct LinearCombination(Vec<(Index, E::Fr)>); + +impl Clone for LinearCombination { + fn clone(&self) -> LinearCombination { + LinearCombination(self.0.clone()) + } +} + +impl LinearCombination { + pub fn zero() -> LinearCombination { + LinearCombination(vec![]) + } + + pub fn eval( + self, + mut input_density: Option<&mut DensityTracker>, + mut aux_density: Option<&mut DensityTracker>, + input_assignment: &[E::Fr], + aux_assignment: &[E::Fr] + ) -> E::Fr + { + let mut acc = E::Fr::zero(); + + for (index, coeff) in self.0.into_iter() { + let mut tmp; + + match index { + Index::Input(i) => { + tmp = input_assignment[i]; + if let Some(ref mut v) = input_density { + v.inc(i); + } + }, + Index::Aux(i) => { + tmp = aux_assignment[i]; + if let Some(ref mut v) = aux_density { + v.inc(i); + } + } + } + + if coeff == E::Fr::one() { + acc.add_assign(&tmp); + } else { + tmp.mul_assign(&coeff); + acc.add_assign(&tmp); + } + } + + acc + } +} + +impl Add for LinearCombination { + type Output = LinearCombination; + + fn add(self, other: Variable) -> LinearCombination { + self + (E::Fr::one(), other) + } +} + +impl Sub for LinearCombination { + type Output = LinearCombination; + + fn sub(self, other: Variable) -> LinearCombination { + self - (E::Fr::one(), other) + } +} + +impl Add<(E::Fr, Variable)> for LinearCombination { + type Output = LinearCombination; + + fn add(mut self, (coeff, var): (E::Fr, Variable)) -> LinearCombination { + let mut must_insert = true; + + for &mut (ref index, ref mut fr) in &mut self.0 { + if *index == var.0 { + fr.add_assign(&coeff); + must_insert = false; + break; + } + } + + if must_insert { + self.0.push((var.0, coeff)); + } + + self + } +} + +impl Sub<(E::Fr, Variable)> for LinearCombination { + type Output = LinearCombination; + + fn sub(self, (mut coeff, var): (E::Fr, Variable)) -> LinearCombination { + coeff.negate(); + + self + (coeff, var) + } +} + +impl<'a, E: Engine> Add<&'a LinearCombination> for LinearCombination { + type Output = LinearCombination; + + fn add(mut self, other: &'a LinearCombination) -> LinearCombination { + for &(k, v) in other.0.iter() { + self = self + (v, Variable(k)); + } + + self + } +} + +impl<'a, E: Engine> Sub<&'a LinearCombination> for LinearCombination { + type Output = LinearCombination; + + fn sub(mut self, other: &'a LinearCombination) -> LinearCombination { + for &(k, v) in other.0.iter() { + self = self - (v, Variable(k)); + } + + self + } +} + +pub trait Circuit { + type InputMap: Input; + + /// Synthesize the circuit into a rank-1 quadratic constraint system + #[must_use] + fn synthesize>(self, cs: &mut CS) -> Result; +} + +pub trait Input { + /// Synthesize the circuit, except with additional access to public input + /// variables + fn synthesize>(self, cs: &mut CS) -> Result<(), Error>; +} + +pub trait PublicConstraintSystem: ConstraintSystem { + /// Allocate a public input that the verifier knows. The provided function is used to + /// determine the assignment of the variable. 
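+ /// Unlike `alloc`, the variable is indexed as a public input rather than an auxiliary variable.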
+ fn alloc_input Result>(&mut self, f: F) -> Result; +} + +pub trait ConstraintSystem { + /// Return the "one" input variable + fn one() -> Variable { + Variable(Index::Input(0)) + } + + /// Allocate a private variable in the constraint system. The provided function is used to + /// determine the assignment of the variable. + fn alloc Result>(&mut self, f: F) -> Result; + + /// Enforce that `A` * `B` = `C`. + fn enforce( + &mut self, + a: LinearCombination, + b: LinearCombination, + c: LinearCombination + ); +} diff --git a/src/multicore.rs b/src/multicore.rs new file mode 100644 index 0000000..632c5a5 --- /dev/null +++ b/src/multicore.rs @@ -0,0 +1,53 @@ +use crossbeam::{self, Scope, ScopedJoinHandle}; +use num_cpus; + +pub enum MaybeJoinHandle { + MultiThreaded(ScopedJoinHandle), + SingleThreaded(T) +} + +impl MaybeJoinHandle { + pub fn join(self) -> T { + match self { + MaybeJoinHandle::MultiThreaded(scope) => scope.join(), + MaybeJoinHandle::SingleThreaded(t) => t + } + } +} + +#[derive(Clone, Copy)] +pub enum MaybeScope<'a, 'b: 'a> { + MultiThreaded(&'a Scope<'b>), + SingleThreaded +} + +impl<'a, 'b> MaybeScope<'a, 'b> { + pub fn spawn(&self, f: F) -> MaybeJoinHandle + where F: FnOnce() -> T + Send + 'b, T: Send + 'b + { + match self { + &MaybeScope::MultiThreaded(scope) => MaybeJoinHandle::MultiThreaded(scope.spawn(f)), + &MaybeScope::SingleThreaded => MaybeJoinHandle::SingleThreaded(f()) + } + } +} + +pub fn scope<'a, F, R>( + elements: usize, + f: F +) -> R where F: for<'b> FnOnce(MaybeScope<'b, 'a>, usize) -> R +{ + let num_cpus = num_cpus::get(); + + if elements <= num_cpus { + if elements == 0 { + f(MaybeScope::SingleThreaded, 1) + } else { + f(MaybeScope::SingleThreaded, elements) + } + } else { + crossbeam::scope(|scope| { + f(MaybeScope::MultiThreaded(scope), elements / num_cpus) + }) + } +} diff --git a/src/multiexp.rs b/src/multiexp.rs new file mode 100644 index 0000000..a1453f0 --- /dev/null +++ b/src/multiexp.rs @@ -0,0 +1,293 @@ +use pairing::*; +use std::sync::Arc; +use std::io; +use bit_vec::{self, BitVec}; +use std::iter; +use futures::{BoxFuture, Future}; +use futures_cpupool::CpuPool; + +use super::Error; + +/// An object that builds a source of bases. +pub trait SourceBuilder: Send + Sync + 'static + Clone { + type Source: Source; + + fn new(self) -> Self::Source; +} + +/// A source of bases, like an iterator. +pub trait Source { + /// Parses the element from the source. Fails if the point is at infinity. + fn add_assign_mixed(&mut self, to: &mut ::Projective) -> Result<(), Error>; + + /// Skips `amt` elements from the source, avoiding deserialization. + fn skip(&mut self, amt: usize) -> Result<(), Error>; +} + +impl SourceBuilder for (Arc>, usize) { + type Source = (Arc>, usize); + + fn new(self) -> (Arc>, usize) { + (self.0.clone(), self.1) + } +} + +impl Source for (Arc>, usize) { + fn add_assign_mixed(&mut self, to: &mut ::Projective) -> Result<(), Error> { + if self.0.len() <= self.1 { + return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "expected more bases from source").into()); + } + + if self.0[self.1].is_zero() { + return Err(Error::UnexpectedIdentity) + } + + to.add_assign_mixed(&self.0[self.1]); + + self.1 += 1; + + Ok(()) + } + + fn skip(&mut self, amt: usize) -> Result<(), Error> { + if self.0.len() <= self.1 { + return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "expected more bases from source").into()); + } + + self.1 += amt; + + Ok(()) + } +} + +pub trait QueryDensity { + /// Returns whether the base exists. 
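+ /// Iteration yields a `bool` per exponent, in the same order as the exponents passed to `multiexp`.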
+ type Iter: Iterator; + + fn iter(self) -> Self::Iter; + fn get_query_size(self) -> Option; +} + +#[derive(Clone)] +pub struct FullDensity; + +impl AsRef for FullDensity { + fn as_ref(&self) -> &FullDensity { + self + } +} + +impl<'a> QueryDensity for &'a FullDensity { + type Iter = iter::Repeat; + + fn iter(self) -> Self::Iter { + iter::repeat(true) + } + + fn get_query_size(self) -> Option { + None + } +} + +pub struct DensityTracker { + bv: BitVec, + total_density: usize +} + +impl<'a> QueryDensity for &'a DensityTracker { + type Iter = bit_vec::Iter<'a>; + + fn iter(self) -> Self::Iter { + self.bv.iter() + } + + fn get_query_size(self) -> Option { + Some(self.bv.len()) + } +} + +impl DensityTracker { + pub fn new() -> DensityTracker { + DensityTracker { + bv: BitVec::new(), + total_density: 0 + } + } + + pub fn add_element(&mut self) { + self.bv.push(false); + } + + pub fn inc(&mut self, idx: usize) { + if !self.bv.get(idx).unwrap() { + self.bv.set(idx, true); + self.total_density += 1; + } + } + + pub fn get_total_density(&self) -> usize { + self.total_density + } +} + +fn multiexp_inner( + pool: &CpuPool, + bases: S, + density_map: D, + exponents: Arc::Fr as PrimeField>::Repr>>, + mut skip: u32, + c: u32, + handle_trivial: bool +) -> BoxFuture<::Projective, Error> + where for<'a> &'a Q: QueryDensity, + D: Send + Sync + 'static + Clone + AsRef, + G: CurveAffine, + S: SourceBuilder +{ + // Perform this region of the multiexp + let this = { + let bases = bases.clone(); + let exponents = exponents.clone(); + let density_map = density_map.clone(); + + pool.spawn_fn(move || { + // Accumulate the result + let mut acc = G::Projective::zero(); + + // Build a source for the bases + let mut bases = bases.new(); + + // Create space for the buckets + let mut buckets = vec![::Projective::zero(); (1 << c) - 1]; + + let zero = ::Fr::zero().into_repr(); + let one = ::Fr::one().into_repr(); + + // Sort the bases into buckets + for (&exp, density) in exponents.iter().zip(density_map.as_ref().iter()) { + if density { + if exp == zero { + bases.skip(1)?; + } else if exp == one { + if handle_trivial { + bases.add_assign_mixed(&mut acc)?; + } else { + bases.skip(1)?; + } + } else { + let mut exp = exp; + exp.divn(skip); + let exp = exp.as_ref()[0] % (1 << c); + + if exp != 0 { + bases.add_assign_mixed(&mut buckets[(exp - 1) as usize])?; + } else { + bases.skip(1)?; + } + } + } + } + + // Summation by parts + // e.g. 3a + 2b + 1c = a + + // (a) + b + + // ((a) + b) + c + let mut running_sum = G::Projective::zero(); + for exp in buckets.into_iter().rev() { + running_sum.add_assign(&exp); + acc.add_assign(&running_sum); + } + + Ok(acc) + }) + }; + + skip += c; + + if skip >= ::Fr::num_bits() { + // There isn't another region. + this.boxed() + } else { + // There's another region more significant. Calculate and join it with + // this region recursively. + this.join(multiexp_inner(pool, bases, density_map, exponents, skip, c, false)) + .map(move |(this, mut higher)| { + for _ in 0..c { + higher.double(); + } + + higher.add_assign(&this); + + higher + }) + .boxed() + } +} + +/// Perform multi-exponentiation. The caller is responsible for ensuring the +/// query size is the same as the number of exponents. 
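+///
+/// A typical call, mirroring `test_with_bls12` below (`bases` and `exps` here are
+/// placeholder names for an `Arc<Vec<_>>` of affine points and of scalar representations):
+///
+/// ```ignore
+/// let pool = CpuPool::new_num_cpus();
+/// let acc = multiexp(&pool, (bases, 0), FullDensity, exps).wait()?;
+/// ```
+///
+/// Internally the work is split into 12-bit windows, each accumulated with the
+/// bucket method (see `multiexp_inner`).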
+pub fn multiexp( + pool: &CpuPool, + bases: S, + density_map: D, + exponents: Arc::Fr as PrimeField>::Repr>>, +// TODO +// c: u32 +) -> BoxFuture<::Projective, Error> + where for<'a> &'a Q: QueryDensity, + D: Send + Sync + 'static + Clone + AsRef, + G: CurveAffine, + S: SourceBuilder +{ + if let Some(query_size) = density_map.as_ref().get_query_size() { + // If the density map has a known query size, it should not be + // inconsistent with the number of exponents. + + assert!(query_size == exponents.len()); + } + + multiexp_inner(pool, bases, density_map, exponents, 0, 12, true) +} + +#[test] +fn test_with_bls12() { + fn naive_multiexp( + bases: Arc>, + exponents: Arc::Repr>> + ) -> G::Projective + { + assert_eq!(bases.len(), exponents.len()); + + let mut acc = G::Projective::zero(); + + for (base, exp) in bases.iter().zip(exponents.iter()) { + acc.add_assign(&base.mul(*exp)); + } + + acc + } + + use rand::{self, Rand}; + use pairing::bls12_381::Bls12; + + const SAMPLES: usize = 1 << 17; + + let rng = &mut rand::thread_rng(); + let v = Arc::new((0..SAMPLES).map(|_| ::Fr::rand(rng).into_repr()).collect::>()); + let g = Arc::new((0..SAMPLES).map(|_| ::G1::rand(rng).into_affine()).collect::>()); + + let naive = naive_multiexp(g.clone(), v.clone()); + + let pool = CpuPool::new_num_cpus(); + + let fast = multiexp( + &pool, + (g, 0), + FullDensity, + v, + // TODO + //7 + ).wait().unwrap(); + + assert_eq!(naive, fast); +}