diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 291f461da3..7a91dc00f3 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -54,13 +54,38 @@ function generate_pte_file() {
     pte_file=$(realpath ${model_filename})
     rm -f "${pte_file}"
 
-    SO_LIB=$(buck2 build //kernels/quantized:aot_lib --show-output | grep "buck-out" | cut -d" " -f2)
+    # This works, but blows up in CI because TMPDIR is long (a choice of the buck2 implementation)
+    # and sccache fails to open a unix domain socket under that directory.
+    # SO_LIB=$(buck2 build //kernels/quantized:aot_lib --show-output | grep "buck-out" | cut -d" " -f2)
+    # We use the aot_lib built by build_quantization_aot_lib below instead.
+    SO_LIB=$(find -name libquantized_ops_aot_lib.so)
 
     python3 -m examples.arm.aot_arm_compiler --model_name="${model}" ${delegate} --so_library="$SO_LIB" 1>&2
     [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
 
+function build_quantization_aot_lib()
+{
+    SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
+    CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+
+    cd $et_root_dir
+    mkdir -p cmake-out-aot-lib
+    cmake -DBUCK2=${buck2} \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DEXECUTORCH_BUILD_QUANTIZED=ON \
+        -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
+        -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
+        -DPYTHON_EXECUTABLE=python3 \
+        -Bcmake-out-aot-lib \
+        "${et_root_dir}"
+
+    n=$(nproc)
+    cmake --build cmake-out-aot-lib -j"$((n - 5))" -- quantized_ops_aot_lib
+}
+
 # build ExecuTorch Libraries
 function build_executorch() {
     set -x
@@ -173,6 +198,7 @@ type ${buck2} 2>&1 > /dev/null \
 
 # build executorch libraries
 build_executorch
+build_quantization_aot_lib
 
 # the test models run, and whether to delegate
 test_model=( "softmax" "add" "add3" "mv2" )
diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index c2b611df77..f2157b77a2 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -59,6 +59,7 @@ if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
   # Build a AOT library to register quantized ops into PyTorch. This is a hack.
   set(_quantized_sources
       ${_quantized_kernels__srcs}
+      ${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp
       ${EXECUTORCH_ROOT}/runtime/core/exec_aten/util/tensor_util_aten.cpp)
   gen_custom_ops_aot_lib("quantized_ops_aot_lib" "${_quantized_sources}")
 endif()
diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml
index ca2360b7d8..fdc9199148 100644
--- a/kernels/quantized/quantized.yaml
+++ b/kernels/quantized/quantized.yaml
@@ -34,30 +34,6 @@
     - arg_meta: null
       kernel_name: torch::executor::dequantize_per_channel_out
 
-- func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_byte_out
-
-- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_byte_dtype_out
-
-- func: quantized_decomposed::embedding_4bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_4bit_out
-
-- func: quantized_decomposed::embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_4bit_dtype_out
-
 - func: quantized_decomposed::mixed_mm.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
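
Context for the run.sh change above: the --so_library flag hands the shared object to aot_arm_compiler so the quantized-op schemas are registered with PyTorch before export (presumably via torch.ops.load_library). Below is a minimal standalone sanity check, not part of the patch; the cmake-out-aot-lib search root is an assumption based on the build directory created by build_quantization_aot_lib (the patched script itself uses a bare "find -name").

    # Sketch only: confirm the AOT library was built and can be loaded into PyTorch.
    SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.so)   # assumed output dir
    python3 -c "import torch; torch.ops.load_library('${SO_LIB}'); print('loaded:', '${SO_LIB}')"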