Commit 0c01010 — Merge pull request #1 from spider-gazelle/gpu-delegate

feat: add GPU delegate support

stakach authored May 22, 2024
2 parents 6f392a5 + ea8529f commit 0c01010

Showing 14 changed files with 462 additions and 29 deletions.
130 changes: 130 additions & 0 deletions Dockerfile
@@ -0,0 +1,130 @@
# Use an image supported by https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html
FROM ubuntu:22.04 as build

# Install necessary packages
RUN apt-get update && \
apt-get install -y \
build-essential \
cmake \
git \
wget \
unzip \
libtool \
zlib1g-dev \
vim-common \
curl \
unzip \
zlib1g \
python3 \
python3-pip \
python3-dev \
libopenblas-dev \
opencl-headers \
clinfo \
ocl-icd-opencl-dev \
clang \
libclang-dev \
libc++-dev \
linux-headers-generic \
software-properties-common \
libabsl-dev \
libusb-1.0-0-dev \
gnupg2 && \
apt-get clean

# Install Bazelisk for building TensorFlow
ARG TARGETARCH
RUN wget -O /usr/local/bin/bazel https://github.com/bazelbuild/bazelisk/releases/download/v1.10.1/bazelisk-linux-$TARGETARCH && \
chmod +x /usr/local/bin/bazel

ENV TMP=/tmp

# Clone TensorFlow repository
# https://www.tensorflow.org/install/source#gpu (lib compatibility list)
RUN git clone --depth 1 --branch "v2.16.1" https://github.com/tensorflow/tensorflow

# =======================
# build edge TPU delegate
# =======================

WORKDIR /tensorflow
RUN git clone https://github.com/google-coral/libedgetpu
WORKDIR /tensorflow/libedgetpu

# Build the Edge TPU delegate for the target platform (Linux only)
ARG TARGETPLATFORM
RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
make libedgetpu-direct CPU=k8 && \
mkdir -p /usr/local/lib && \
cp /tensorflow/libedgetpu/out/direct/k8/libedgetpu.so.1.0 /usr/local/lib/libedgetpu.so; \
elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
make libedgetpu-direct CPU=aarch64 && \
mkdir -p /usr/local/lib && \
cp /tensorflow/libedgetpu/out/direct/aarch64/libedgetpu.so.1.0 /usr/local/lib/libedgetpu.so; \
else \
echo "Unsupported platform: $TARGETPLATFORM"; \
exit 1; \
fi
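The two platform branches above differ only in the `CPU` value passed to `make`; under that assumption, the mapping can be factored out once. A sketch (the `make`/`cp` steps are commented out because they need the libedgetpu checkout, and the `TARGETPLATFORM` default exists only so the sketch runs standalone):

```shell
# Sketch: derive the libedgetpu CPU name from TARGETPLATFORM once,
# rather than duplicating the build commands per branch.
TARGETPLATFORM="${TARGETPLATFORM:-linux/amd64}"
case "$TARGETPLATFORM" in
  linux/amd64) CPU=k8 ;;
  linux/arm64) CPU=aarch64 ;;
  *) echo "Unsupported platform: $TARGETPLATFORM" >&2; exit 1 ;;
esac
echo "building libedgetpu for CPU=$CPU"
# make libedgetpu-direct CPU="$CPU"
# mkdir -p /usr/local/lib
# cp "out/direct/$CPU/libedgetpu.so.1.0" /usr/local/lib/libedgetpu.so
```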

# ==================================
# Build tensorflow lite GPU delegate
# ==================================

WORKDIR /tensorflow

# Configure the TensorFlow build (excluding Android); each blank line in
# the heredoc accepts the default answer to one of ./configure's prompts
RUN ./configure <<EOF








EOF
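The run of blank lines inside that heredoc answers `./configure`'s interactive prompts with their defaults, one line per question. A standalone demonstration of the same trick, using a made-up prompt function (the real configure script asks its own questions):

```shell
# A hypothetical interactive prompt: blank input selects the default.
ask() {
  printf '%s [%s]: ' "$1" "$2"
  read -r answer || answer=""
  [ -n "$answer" ] || answer=$2
  echo "$1=$answer"
}

# Feeding a blank line via a heredoc accepts the default, just like the
# empty lines piped into ./configure above.
result=$(ask "Enable CUDA support" "n" <<EOF

EOF
)
echo "$result"
```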

# Build TensorFlow Lite GPU delegate (excluding Android, linux only)
RUN bazel build //tensorflow/lite/delegates/gpu:libtensorflowlite_gpu_delegate.so \
--config=opt \
--config=monolithic \
--copt=-g \
--cxxopt=-std=c++17 \
--copt=-DMESA_EGL_NO_X11_HEADERS \
--copt=-DEGL_NO_X11 \
--copt=-DCL_DELEGATE_NO_GL \
--define=with_xla_support=false \
--define=with_flex_support=false \
--define=no_tensorflow_py_deps=true \
--config=noaws \
--config=nogcp \
--config=nohdfs \
--verbose_failures

# Copy the built shared libraries to /usr/local/lib
RUN mkdir -p /usr/local/lib && \
cp bazel-bin/tensorflow/lite/delegates/gpu/libtensorflowlite_gpu_delegate.so /usr/local/lib/

# =================================
# Build tensorflow lite using cmake
# =================================

RUN mkdir tflite_build
WORKDIR /tensorflow/tflite_build
RUN cmake /tensorflow/tensorflow/lite/c -DTFLITE_ENABLE_GPU=ON
# the parallel build can fail part-way (typically out of memory),
# so tolerate the failure and finish with a single-job pass
RUN cmake --build . -j4 || true
RUN echo "---------- resuming build single-threaded ----------"
RUN cmake --build . -j1
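The failure-then-retry sequence above is a common pattern: tolerate a flaky parallel step, then finish deterministically. A minimal sketch of the control flow, with a stand-in function replacing the cmake command:

```shell
# flaky_step stands in for "cmake --build . -j4", which can die part-way
# (e.g. a compiler process killed under high parallelism).
attempts=0
flaky_step() {
  attempts=$((attempts + 1))
  [ "$attempts" -gt 1 ]   # fail on the first call only
}

flaky_step || true   # "|| true" keeps the script (or Docker layer) alive
flaky_step           # the retry completes the remaining work
echo "attempts=$attempts"
```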

# copy the shared lib into place
RUN cp ./libtensorflowlite_c.so /usr/local/lib/

# ======================
# Set up the final stage
# ======================
FROM scratch

# Copy the built libraries from the build stage
COPY --from=build /usr/local/lib/libedgetpu.so /usr/local/lib/libedgetpu.so
COPY --from=build /usr/local/lib/libtensorflowlite_c.so /usr/local/lib/libtensorflowlite_c.so
COPY --from=build /usr/local/lib/libtensorflowlite_gpu_delegate.so /usr/local/lib/libtensorflowlite_gpu_delegate.so
29 changes: 29 additions & 0 deletions README.md
@@ -78,6 +78,35 @@ To update tensorflow lite bindings `./generate_bindings.sh`

### lib installation

#### Dockerfile

The Dockerfile builds TensorFlow Lite libraries compatible with the target platforms.
A pre-built image is available: `docker pull stakach/tensorflowlite:latest`

To build an image run:

```shell
docker buildx build --progress=plain --platform linux/arm64,linux/amd64 -t stakach/tensorflowlite:latest --push .
```

To extract the libraries:

```shell
mkdir -p ./ext
docker pull stakach/tensorflowlite:latest
docker create --name tflite_tmp stakach/tensorflowlite:latest true
docker cp tflite_tmp:/usr/local/lib/libedgetpu.so ./ext/libedgetpu.so
docker cp tflite_tmp:/usr/local/lib/libtensorflowlite_c.so ./ext/libtensorflowlite_c.so
docker cp tflite_tmp:/usr/local/lib/libtensorflowlite_gpu_delegate.so ./ext/libtensorflowlite_gpu_delegate.so
docker rm tflite_tmp
```
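Once extracted into `./ext`, the dynamic loader still has to find the libraries at runtime when they are not in a standard path. One option (paths assumed to match the `docker cp` snippet above; the bindings also embed an `$ORIGIN` rpath, so this is only needed when running from elsewhere):

```shell
# Point the dynamic loader at the extracted libraries for this shell.
# Assumes the docker cp commands above placed them under ./ext.
export LD_LIBRARY_PATH="$PWD/ext${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
echo "$LD_LIBRARY_PATH"
```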

This extraction is performed automatically by this library's post-install script.

#### Old method

Requires [libtensorflow](https://www.tensorflow.org/install/lang_c) to be installed; this is handled automatically by `./build_tensorflowlite.sh`

* there is a [guide to building it](https://www.tensorflow.org/lite/guide/build_cmake)
2 changes: 2 additions & 0 deletions bindings/bindings_generator.cr
@@ -3,6 +3,8 @@
"tensorflow/lite/core/c/c_api_types.h",
"tensorflow/lite/core/c/c_api.h",
"tensorflow/lite/core/c/c_api_experimental.h",
"tensorflow/lite/delegates/gpu/delegate_options.h",
"tensorflow/lite/delegates/gpu/delegate.h",
flags: "
-I/{tensorflow_dir}/tensorflow/
-I/{tensorflow_dir}/tensorflow/bazel-genfiles
5 changes: 3 additions & 2 deletions build_tensorflowlite.sh
@@ -22,13 +22,14 @@ echo "--"

mkdir tflite_build
cd tflite_build
cmake ../tensorflow/tensorflow/lite/c -DTFLITE_ENABLE_GPU=ON
cmake ../tensorflow/tensorflow/lite/c \
-DTFLITE_ENABLE_GPU=ON

echo "--"
echo "building..."
echo "--"

cmake --build . -j2 || true
cmake --build . -j3 || true

FILE=./libtensorflowlite_c.so
if test -f "$FILE"; then
55 changes: 55 additions & 0 deletions install_tensorflowlite.sh
@@ -0,0 +1,55 @@
#!/bin/sh

SHARDS_INSTALL=IS_LIB
IS_LOCAL=./ext/libtensorflowlite_c.so
if test -f "$IS_LOCAL"; then
echo "--"
echo "tensorflow lite library installed, skipping installation"
echo "--"
exit 0
fi

echo "--"
echo "downloading images... (requires docker)"
echo "--"

mkdir -p ./ext
docker pull stakach/tensorflowlite:latest
docker create --name tflite_tmp stakach/tensorflowlite:latest true

echo "--"
echo "copying libraries into place..."
echo "--"

docker cp tflite_tmp:/usr/local/lib/libedgetpu.so ./ext/libedgetpu.so
docker cp tflite_tmp:/usr/local/lib/libtensorflowlite_c.so ./ext/libtensorflowlite_c.so
docker cp tflite_tmp:/usr/local/lib/libtensorflowlite_gpu_delegate.so ./ext/libtensorflowlite_gpu_delegate.so
docker rm tflite_tmp

# we'll put the lib into a few different places so it'll run when using crystal normally

# Temp location crystal runs applications from
mkdir -p ~/.cache/crystal/
cp ./ext/libedgetpu.so ~/.cache/crystal/
cp ./ext/libtensorflowlite_c.so ~/.cache/crystal/
cp ./ext/libtensorflowlite_gpu_delegate.so ~/.cache/crystal/

# other locations you might be running the application from
# check if being installed as a lib
if [ "$1" = "$SHARDS_INSTALL" ]; then
echo "linking into parent directory..."
mkdir -p ../../bin

# use absolute targets: a relative "ln -s" target would resolve from the
# link's directory, not from this project directory
ln -s "$PWD/ext/libedgetpu.so" ../../bin/libedgetpu.so
ln -s "$PWD/ext/libedgetpu.so" ../../libedgetpu.so

ln -s "$PWD/ext/libtensorflowlite_c.so" ../../bin/libtensorflowlite_c.so
ln -s "$PWD/ext/libtensorflowlite_c.so" ../../libtensorflowlite_c.so

ln -s "$PWD/ext/libtensorflowlite_gpu_delegate.so" ../../bin/libtensorflowlite_gpu_delegate.so
ln -s "$PWD/ext/libtensorflowlite_gpu_delegate.so" ../../libtensorflowlite_gpu_delegate.so
else
echo "run manually, assuming library development"
fi
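One subtlety with the symlinks above: a relative `ln -s` target is resolved from the directory containing the link, not from the directory where the command runs. A small standalone demonstration:

```shell
# Demonstrate relative vs absolute symlink targets.
tmp=$(mktemp -d)
mkdir -p "$tmp/project/ext" "$tmp/parent"
echo "library bytes" > "$tmp/project/ext/lib.so"
cd "$tmp/project"

# Relative target: resolves to $tmp/parent/ext/lib.so, which doesn't
# exist, so the link dangles even though ./ext/lib.so exists here.
ln -s ./ext/lib.so "$tmp/parent/lib_rel.so"

# Absolute target: always points at the real file.
ln -s "$PWD/ext/lib.so" "$tmp/parent/lib_abs.so"

ls -l "$tmp/parent"
```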

echo "--"
echo "Done"
2 changes: 1 addition & 1 deletion shard.yml
@@ -11,4 +11,4 @@ authors:
license: MIT

scripts:
postinstall: /bin/sh ./build_tensorflowlite.sh IS_LIB
postinstall: /bin/sh ./install_tensorflowlite.sh IS_LIB
79 changes: 77 additions & 2 deletions spec/tensorflow_lite_spec.cr
@@ -112,16 +112,91 @@ module TensorflowLite
client.labels.as(Array(String)).size.should eq 90
end

describe TensorflowLite do
it "can add a GPU delegate" do
# this will fall back to CPU if no GPU hardware is installed,
# but at least we know the delegate compiles and loads
file_io = File.new(model_path)
file_data = Bytes.new(file_io.size)
file_io.read_fully(file_data)
file_io.close

{Model.new(model_path), Model.new(file_data)}.each do |model|
opts = InterpreterOptions.new
opts.on_error do |error_msg|
puts "error was #{error_msg}"
end
interpreter = Interpreter.new(model, opts)

gpu = DelegateGPU.new
interpreter.modify_graph_with_delegate gpu

xor_test.each do |test|
inputs = test[:input]
expected = test[:result]

# configure inputs
input_tensor = interpreter.input_tensor(0)
input_tensor.raw_data.bytesize.should eq input_tensor.bytesize
input_tensor.size.should eq 2

floats = input_tensor.as_f32
floats[0], floats[1] = inputs

# run through NN
interpreter.invoke!

# check results
output_tensor = interpreter.output_tensor(0)
floats = output_tensor.as_f32
result = (floats[0] + 0.5_f32).to_i

result.should eq expected
end
end
end

it "can add a GPU delegate to the client" do
# this will fall back to CPU if no GPU hardware is installed
file_io = File.new(model_path)
file_data = Bytes.new(file_io.size)
file_io.read_fully(file_data)
file_io.close

{Model.new(model_path), Model.new(file_data)}.each do |model|
client = TensorflowLite::Client.new(model, delegate: DelegateGPU.new)

xor_test.each do |test|
inputs = test[:input]
expected = test[:result]

# configure inputs
floats = client[0].as_f32
floats[0], floats[1] = inputs

# run through NN
client.invoke!

# check results
floats = client.output.as_f32
result = (floats[0] + 0.5_f32).to_i

result.should eq expected
end
end
end
end

it "works with quantized models" do
model_path = Path.new File.join(__DIR__, "./test_data/xor_model_quantized.tflite")
quant_path = Path.new File.join(__DIR__, "./test_data/xor_model_quantized.tflite")
quantized_test = {
{input: {-128_i8, -128_i8}, result: 0},
{input: {127_i8, -128_i8}, result: 1},
{input: {-128_i8, 127_i8}, result: 1},
{input: {127_i8, 127_i8}, result: 0},
}

client = TensorflowLite::Client.new(model_path)
client = TensorflowLite::Client.new(quant_path)

quantized_test.each do |test|
inputs = test[:input]
1 change: 1 addition & 0 deletions src/tensorflow_lite.cr
@@ -29,4 +29,5 @@ require "./tensorflow_lite/interpreter_options"
require "./tensorflow_lite/tensor"
require "./tensorflow_lite/interpreter"
require "./tensorflow_lite/client"
require "./tensorflow_lite/delegate_gpu"
require "./tensorflow_lite/utilities/*"
17 changes: 17 additions & 0 deletions src/tensorflow_lite/bindings_generator.cr
@@ -0,0 +1,17 @@
@[Include(

"tensorflow/lite/builtin_ops.h",
"tensorflow/lite/core/c/c_api_types.h",
"tensorflow/lite/core/c/c_api.h",
"tensorflow/lite/core/c/c_api_experimental.h",
"tensorflow/lite/delegates/gpu/delegate_options.h",
"tensorflow/lite/delegates/gpu/delegate.h",
flags: "
-I/{tensorflow_dir}/tensorflow/
-I/{tensorflow_dir}/tensorflow/bazel-genfiles
-I/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1
",
prefix: %w(TFL_ TfLite kTfLite)
)]
@[Link("tensorflowlite_c", ldflags: "-L#{__DIR__}/../../ext/ -Wl,-rpath='$ORIGIN'")]
lib LibTensorflowLite
end
11 changes: 8 additions & 3 deletions src/tensorflow_lite/client.cr
@@ -47,10 +47,15 @@ class TensorflowLite::Client
if threads
@options.num_threads(threads)
end
if delegate
@options.add_delegate delegate

case delegate
when DelegateGPU
@interpreter = Interpreter.new(@model, @options)
@interpreter.modify_graph_with_delegate delegate
else
@options.add_delegate(delegate) if delegate
@interpreter = Interpreter.new(@model, @options)
end
@interpreter = Interpreter.new(@model, @options)
end

getter model : Model