diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 291f461da3..7a91dc00f3 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -54,13 +54,38 @@ function generate_pte_file() {
     pte_file=$(realpath ${model_filename})
     rm -f "${pte_file}"
 
-    SO_LIB=$(buck2 build //kernels/quantized:aot_lib --show-output | grep "buck-out" | cut -d" " -f2)
+    # This works, but blows up in CI because TMPDIR is long (a choice of the buck2 implementation)
+    # and sccache fails to open a unix domain socket under that directory.
+    # SO_LIB=$(buck2 build //kernels/quantized:aot_lib --show-output | grep "buck-out" | cut -d" " -f2)
+    # We use the aot_lib built by build_quantization_aot_lib below instead.
+    SO_LIB=$(find -name libquantized_ops_aot_lib.so)
 
     python3 -m examples.arm.aot_arm_compiler --model_name="${model}" ${delegate} --so_library="$SO_LIB" 1>&2
     [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; }
     echo "${pte_file}"
 }
 
+function build_quantization_aot_lib()
+{
+    SITE_PACKAGES="$(python3 -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
+    CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+
+    cd $et_root_dir
+    mkdir -p cmake-out-aot-lib
+    cmake -DBUCK2=${buck2} \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DEXECUTORCH_BUILD_QUANTIZED=ON \
+        -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
+        -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
+        -DPYTHON_EXECUTABLE=python3 \
+        -Bcmake-out-aot-lib \
+        "${et_root_dir}"
+
+    n=$(nproc)
+    cmake --build cmake-out-aot-lib -j"$((n - 5))" -- quantized_ops_aot_lib
+}
+
 # build ExecuTorch Libraries
 function build_executorch() {
     set -x
@@ -173,6 +198,7 @@ type ${buck2} 2>&1 > /dev/null \
 
 # build executorch libraries
 build_executorch
+build_quantization_aot_lib
 
 # the test models run, and whether to delegate
 test_model=( "softmax" "add" "add3" "mv2" )
diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index c2b611df77..f2157b77a2 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -59,6 +59,7 @@ if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
   # Build a AOT library to register quantized ops into PyTorch. This is a hack.
   set(_quantized_sources
       ${_quantized_kernels__srcs}
+      ${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp
       ${EXECUTORCH_ROOT}/runtime/core/exec_aten/util/tensor_util_aten.cpp)
   gen_custom_ops_aot_lib("quantized_ops_aot_lib" "${_quantized_sources}")
 endif()
diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml
index ca2360b7d8..fdc9199148 100644
--- a/kernels/quantized/quantized.yaml
+++ b/kernels/quantized/quantized.yaml
@@ -34,30 +34,6 @@
     - arg_meta: null
       kernel_name: torch::executor::dequantize_per_channel_out
 
-- func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_byte_out
-
-- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_byte_dtype_out
-
-- func: quantized_decomposed::embedding_4bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_4bit_out
-
-- func: quantized_decomposed::embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::quantized_embedding_4bit_dtype_out
-
 - func: quantized_decomposed::mixed_mm.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
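
Context for the run.sh change above: the --so_library flag hands the shared object to aot_arm_compiler so the quantized-op schemas are registered with PyTorch before export (presumably via torch.ops.load_library). Below is a minimal standalone sanity check, not part of the patch; the cmake-out-aot-lib search root is an assumption based on the build directory created by build_quantization_aot_lib (the patched script itself uses a bare "find -name").

    # Sketch only: confirm the AOT library was built and can be loaded into PyTorch.
    SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.so)   # assumed output dir
    python3 -c "import torch; torch.ops.load_library('${SO_LIB}'); print('loaded:', '${SO_LIB}')"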