diff --git a/halo2_proofs/src/multicore.rs b/halo2_proofs/src/multicore.rs
index 60502f07d2..fa11d7e627 100644
--- a/halo2_proofs/src/multicore.rs
+++ b/halo2_proofs/src/multicore.rs
@@ -16,7 +16,7 @@ pub use maybe_rayon::{
 pub use maybe_rayon::{
     current_num_threads,
     iter::{IndexedParallelIterator, IntoParallelRefIterator},
-    slice::ParallelSliceMut,
+    slice::{ParallelSlice, ParallelSliceMut},
 };
 
 #[cfg(not(feature = "multicore"))]
@@ -30,7 +30,7 @@ pub trait TryFoldAndReduce<T, E> {
     /// disabled.
     /// The `try_fold_and_reduce` function can only be called by a iter with
     /// `Result<T, E>` item type because the `fold_op` must meet the trait
-    /// bounds of both `try_fold` and `try_reduce` from rayon.   
+    /// bounds of both `try_fold` and `try_reduce` from rayon.
     fn try_fold_and_reduce(
         self,
         identity: impl Fn() -> T + Send + Sync,
diff --git a/halo2_proofs/src/plonk/lookup/prover.rs b/halo2_proofs/src/plonk/lookup/prover.rs
index 0e3371790e..7e50ff1c30 100644
--- a/halo2_proofs/src/plonk/lookup/prover.rs
+++ b/halo2_proofs/src/plonk/lookup/prover.rs
@@ -3,6 +3,12 @@ use super::super::{
     ProvingKey,
 };
 use super::Argument;
+use crate::multicore::{self, IntoParallelIterator};
+#[cfg(feature = "multicore")]
+use crate::multicore::{
+    IndexedParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator, ParallelIterator,
+    ParallelSliceMut,
+};
 use crate::plonk::evaluation::evaluate;
 use crate::{
     arithmetic::{eval_polynomial, parallelize, CurveAffine},
@@ -19,13 +25,6 @@ use group::{
     ff::{BatchInvert, Field},
     Curve,
 };
-use maybe_rayon::{
-    iter::{
-        IndexedParallelIterator, IntoParallelIterator, IntoParallelRefIterator,
-        IntoParallelRefMutIterator, ParallelIterator,
-    },
-    prelude::ParallelSliceMut,
-};
 use rand_core::RngCore;
 
 use std::collections::HashMap;
@@ -412,7 +411,7 @@ fn permute_expression_pair<'params, C: CurveAffine, P: Params<'params, C>, R: Rn
 where
     C::Scalar: Hash,
 {
-    let num_threads = maybe_rayon::current_num_threads();
+    let num_threads = multicore::current_num_threads();
     // heuristic on when multi-threading isn't worth it
     // for now it seems like multi-threading is often worth it
     /*if params.n() < (num_threads as u64) << 10 {
@@ -434,6 +433,8 @@ where
     let input_time = start_timer!(|| "permute_par input hashmap (cpu par)");
     // count input_expression unique values using a HashMap, using rayon parallel fold+reduce
     let capacity = usable_rows / num_threads + 1;
+
+    #[cfg(feature = "multicore")]
     let input_uniques: HashMap<C::Scalar, usize> = input_expression
         .par_iter()
         .fold(
@@ -450,11 +451,21 @@ where
             m1
         })
         .unwrap();
+    #[cfg(not(feature = "multicore"))]
+    let input_uniques: HashMap<C::Scalar, usize> =
+        input_expression
+            .iter()
+            .fold(HashMap::with_capacity(capacity), |mut acc, coeff| {
+                *acc.entry(*coeff).or_insert(0) += 1;
+                acc
+            });
     #[cfg(feature = "profile")]
     end_timer!(input_time);
 
     #[cfg(feature = "profile")]
     let timer = start_timer!(|| "permute_par input unique ranges (cpu par)");
+
+    #[cfg(feature = "multicore")]
     let input_unique_ranges = input_uniques
         .par_iter()
         .fold(
@@ -478,6 +489,19 @@ where
             [r1, r2].concat()
         })
         .unwrap();
+    #[cfg(not(feature = "multicore"))]
+    let input_unique_ranges = input_uniques.iter().fold(
+        Vec::with_capacity(capacity),
+        |mut input_ranges, (&coeff, &count)| {
+            if input_ranges.is_empty() {
+                input_ranges.push((coeff, 0..count));
+            } else {
+                let prev_end = input_ranges.last().unwrap().1.end;
+                input_ranges.push((coeff, prev_end..prev_end + count));
+            }
+            input_ranges
+        },
+    );
     #[cfg(feature = "profile")]
     end_timer!(timer);
 
@@ -488,14 +512,19 @@ where
     end_timer!(to_vec_time);
     #[cfg(feature = "profile")]
     let sort_table_time = start_timer!(|| "permute_par sort table");
+    #[cfg(feature = "multicore")]
     sorted_table_coeffs.par_sort();
+    #[cfg(not(feature = "multicore"))]
+    sorted_table_coeffs.sort();
     #[cfg(feature = "profile")]
     end_timer!(sort_table_time);
 
     #[cfg(feature = "profile")]
     let timer = start_timer!(|| "leftover table coeffs (cpu par)");
+
     let leftover_table_coeffs: Vec<C::Scalar> = sorted_table_coeffs
-        .par_iter()
+        .as_slice()
+        .into_par_iter()
         .enumerate()
         .filter_map(|(i, coeff)| {
             ((i != 0 && coeff == &sorted_table_coeffs[i - 1]) || !input_uniques.contains_key(coeff))
@@ -515,7 +544,7 @@ where
                 let leftover_range_end = range.end - i - 1;
                 [(coeff, coeff)].into_par_iter().chain(
                     leftover_table_coeffs[leftover_range_start..leftover_range_end]
-                        .par_iter()
+                        .into_par_iter()
                         .map(move |leftover_table_coeff| (coeff, *leftover_table_coeff)),
                 )
             })
@@ -551,7 +580,10 @@ fn permute_expression_pair_seq<'params, C: CurveAffine, P: Params<'params, C>, R
     permuted_input_expression.truncate(usable_rows);
 
     // Sort input lookup expression values
+    #[cfg(feature = "multicore")]
     permuted_input_expression.par_sort();
+    #[cfg(not(feature = "multicore"))]
+    permuted_input_expression.sort();
 
     // A BTreeMap of each unique element in the table expression and its count
     let mut leftover_table_map: BTreeMap<C::Scalar, u32> = table_expression
diff --git a/halo2_proofs/src/poly.rs b/halo2_proofs/src/poly.rs
index adebbf6755..7cb50aa013 100644
--- a/halo2_proofs/src/poly.rs
+++ b/halo2_proofs/src/poly.rs
@@ -12,11 +12,11 @@ use crate::helpers::SerdePrimeField;
 use crate::plonk::Assigned;
 use crate::SerdeFormat;
 
-use group::ff::{BatchInvert, Field};
-use maybe_rayon::{
-    iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator},
-    prelude::ParallelSlice,
+#[cfg(feature = "multicore")]
+use crate::multicore::{
+    IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator, ParallelSlice,
 };
+use group::ff::{BatchInvert, Field};
 
 /// Generic commitment scheme structures
 pub mod commitment;
@@ -211,7 +211,8 @@ where
         .filter_map(|d| d.as_mut())
         .batch_invert();
 
-    assigned
+    #[cfg(feature = "multicore")]
+    return assigned
         .par_iter()
         .zip(assigned_denominators.par_chunks(n))
         .map(|(poly, inv_denoms)| {
@@ -226,7 +227,25 @@ where
                 _marker: PhantomData,
             }
         })
-        .collect()
+        .collect();
+
+    #[cfg(not(feature = "multicore"))]
+    return assigned
+        .iter()
+        .zip(assigned_denominators.chunks(n))
+        .map(|(poly, inv_denoms)| {
+            debug_assert_eq!(inv_denoms.len(), poly.as_ref().len());
+            Polynomial {
+                values: poly
+                    .as_ref()
+                    .iter()
+                    .zip(inv_denoms.iter())
+                    .map(|(a, inv_den)| a.numerator() * inv_den.unwrap_or(F::ONE))
+                    .collect(),
+                _marker: PhantomData,
+            }
+        })
+        .collect();
 }
 
 impl<F: Field> Polynomial<Assigned<F>, LagrangeCoeff> {