From 1c4e976b85860794a41279d2b9714a752cd06c75 Mon Sep 17 00:00:00 2001 From: "Steve M. Kim" Date: Fri, 8 May 2015 11:07:32 -0700 Subject: [PATCH 1/4] Fix issue #17: bytes-to-array makes a copy of byte buffer The `ucar.ma2.Array/factory` method sometimes peeks at the underlying `byte[]` storage of a `ByteBuffer` argument instead of relying on the `ByteBuffer` interface. As a result, the `Array` can be constructed using bytes that are not even exposed by the `ByteBuffer`. This commit changes the implementation of the `bytes-to-array` function in the `io.mandoline.impl` namespace to be robust against this bug. If the byte buffer is not perfectly aligned with its underlying byte array, then a copy is created. The copy is backed by a byte array that only contains the bytes that the original byte buffer exposes, starting from its current position and ending at its limit. --- src/io/mandoline/impl.clj | 19 +++++++++++++--- test/io/mandoline/impl_test.clj | 39 +++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/io/mandoline/impl.clj b/src/io/mandoline/impl.clj index a35e06a..01b37f6 100644 --- a/src/io/mandoline/impl.clj +++ b/src/io/mandoline/impl.clj @@ -182,11 +182,24 @@ :index (wrappers-fun (proto/index connection var-name mdata options) res)))) -(defn- ^ucar.ma2.Array bytes-to-array +(defn ^ucar.ma2.Array bytes-to-array "Coerces a byte-buffer to a new Array with given data type and shape." [^ByteBuffer byte-buffer ^DataType dtype shape] - ;; todo do we have unnecessary copying here? - (Array/factory dtype (int-array shape) byte-buffer)) + (locking byte-buffer + (let [p (.position byte-buffer) + r (.remaining byte-buffer) + ; If the ByteBuffer is not perfectly aligned with its + ; underlying byte[] storage, make a copy + bb (if-not (and (= 0 (.arrayOffset byte-buffer) p) + (= (alength (.array byte-buffer)) r)) + (let [dst (byte-array r)] + (.get byte-buffer dst) + (ByteBuffer/wrap dst)) + byte-buffer) + array (Array/factory dtype (int-array shape) bb)] + ; Resume original position + (.position byte-buffer p) + array))) (defn hash->slab [hash chunk-store dtype slice] (assert hash) diff --git a/test/io/mandoline/impl_test.clj b/test/io/mandoline/impl_test.clj index 98fd528..736b903 100644 --- a/test/io/mandoline/impl_test.clj +++ b/test/io/mandoline/impl_test.clj @@ -1,11 +1,15 @@ -(ns io.mandoline.impl-test (:require +(ns io.mandoline.impl-test + (:require [clojure.test :refer :all] [io.mandoline [chunk :as chunk] [impl :as impl] [slab :as slab] [slice :as slice]] - [io.mandoline.impl.protocol :as proto])) + [io.mandoline.impl.protocol :as proto]) + (:import + [java.nio ByteBuffer] + [ucar.ma2 Array DataType])) ;; Verify that when writing a variable, we don't repeatedly write an identical @@ -33,3 +37,34 @@ ;; Even though we were writing thousands of chunks, they were all the same, ;; and hopefully we didn't write that one chunk too many times. (is (< 0 @chunks-written 100))))) + +(deftest test-bytes-to-array + (let [elements (map byte (range 30)) + ; Create a ByteBuffer instance that is backed by a 25-element + ; byte array, with position=3 and limit=19. Diagram: + ; + ; byte[25] ========================= + ; slice ===================== + ; limit =================== + ; position ================ + byte-buffer (-> (byte-array elements) + (ByteBuffer/wrap) + (.position 4) ; advance position by 4 + (.slice) ; create a slice that shares the underlying byte[] + (.limit 19) ; trim to 19 long + (.position 3)) ; advance position by 3 + data-type DataType/BYTE + shape [2 4 2] + array (impl/bytes-to-array byte-buffer data-type shape)] + (is (instance? Array array) + "bytes-to-array returns an instance of ucar.ma2.Array") + (is (= (.getClassType data-type) (.getElementType array)) + "bytes-to-array returns an Array with the expected element type") + (is (= shape (vec (.getShape array))) + "bytes-to-array returns an Array with the expected shape") + (is (= (take (- 19 3) (drop (+ 4 3) elements)) (seq (.copyTo1DJavaArray array))) + "bytes-to-array returns an Array that contains the expected elements") + (is (= 3 (.position byte-buffer)) + "bytes-to-array returns the ByteBuffer with its original position") + (is (= 19 (.limit byte-buffer)) + "bytes-to-array returns the ByteBuffer with its original limit"))) From e318f799573e93b377450514d5280d918283f245 Mon Sep 17 00:00:00 2001 From: "Steve M. Kim" Date: Fri, 8 May 2015 15:07:38 -0700 Subject: [PATCH 2/4] Elaborate comment that explains the fix for issue #17 --- src/io/mandoline/impl.clj | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/io/mandoline/impl.clj b/src/io/mandoline/impl.clj index 01b37f6..f25900d 100644 --- a/src/io/mandoline/impl.clj +++ b/src/io/mandoline/impl.clj @@ -189,7 +189,13 @@ (let [p (.position byte-buffer) r (.remaining byte-buffer) ; If the ByteBuffer is not perfectly aligned with its - ; underlying byte[] storage, make a copy + ; underlying byte[] storage, make a copy. This copy defends + ; against a bug in the ucar.ma2.Array/factory method, which + ; does not honor the position and limit of the ByteBuffer and + ; reads out-of-range bytes in the underlying byte[] when the + ; data type is DataType/BYTE. + ; TODO: Fix the bug in upstream ucar netCDF library so that we + ; don't need this copy overhead. bb (if-not (and (= 0 (.arrayOffset byte-buffer) p) (= (alength (.array byte-buffer)) r)) (let [dst (byte-array r)] From c52226d6dc934aa0e14e0aed469b2f33099ab648 Mon Sep 17 00:00:00 2001 From: "Steve M. Kim" Date: Mon, 11 May 2015 09:48:39 -0700 Subject: [PATCH 3/4] Respond to code review comments for pull request #18 --- src/io/mandoline/impl.clj | 2 +- test/io/mandoline/impl_test.clj | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/io/mandoline/impl.clj b/src/io/mandoline/impl.clj index f25900d..7f2052c 100644 --- a/src/io/mandoline/impl.clj +++ b/src/io/mandoline/impl.clj @@ -197,7 +197,7 @@ ; TODO: Fix the bug in upstream ucar netCDF library so that we ; don't need this copy overhead. bb (if-not (and (= 0 (.arrayOffset byte-buffer) p) - (= (alength (.array byte-buffer)) r)) + (= (.capacity byte-buffer) (.limit byte-buffer) r)) (let [dst (byte-array r)] (.get byte-buffer dst) (ByteBuffer/wrap dst)) diff --git a/test/io/mandoline/impl_test.clj b/test/io/mandoline/impl_test.clj index 736b903..a3eb264 100644 --- a/test/io/mandoline/impl_test.clj +++ b/test/io/mandoline/impl_test.clj @@ -65,6 +65,6 @@ (is (= (take (- 19 3) (drop (+ 4 3) elements)) (seq (.copyTo1DJavaArray array))) "bytes-to-array returns an Array that contains the expected elements") (is (= 3 (.position byte-buffer)) - "bytes-to-array returns the ByteBuffer with its original position") + "bytes-to-array does not change the position of the original ByteBuffer") (is (= 19 (.limit byte-buffer)) - "bytes-to-array returns the ByteBuffer with its original limit"))) + "bytes-to-array does not change the limit of the original ByteBuffer"))) From a80d21b5d1204fbe322493c7efe51daebd188286 Mon Sep 17 00:00:00 2001 From: "Steve M. Kim" Date: Mon, 11 May 2015 09:53:56 -0700 Subject: [PATCH 4/4] Bump version to 0.1.9 --- project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.clj b/project.clj index 71a2ec3..68fc082 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject io.mandoline/mandoline-core "0.1.8" +(defproject io.mandoline/mandoline-core "0.1.9" :description "Mandoline is a distributed store for multi-dimensional arrays" :license {:name "Apache License, version 2.0"