|
41 | 41 | "name": "stderr",
|
42 | 42 | "output_type": "stream",
|
43 | 43 | "text": [
|
44 |
| - "2023-04-05 18:20:59.105985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:\n", |
45 |
| - "2023-04-05 18:20:59.106076: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:\n", |
46 |
| - "2023-04-05 18:20:59.106084: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" |
| 44 | + "2023-11-28 00:39:22.955810: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", |
| 45 | + "2023-11-28 00:39:22.955851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", |
| 46 | + "2023-11-28 00:39:22.956517: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", |
| 47 | + "2023-11-28 00:39:23.532258: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" |
47 | 48 | ]
|
48 | 49 | }
|
49 | 50 | ],
|
|
120 | 121 | " assert len(_X) == len(X_list) * batch_size\n"
|
121 | 122 | ]
|
122 | 123 | },
|
| 124 | + { |
| 125 | + "cell_type": "code", |
| 126 | + "execution_count": null, |
| 127 | + "metadata": {}, |
| 128 | + "outputs": [], |
| 129 | + "source": [ |
| 130 | + "#| hide\n", |
| 131 | + "def test_keras_dataloader(samples=1000, batch_size=12):\n", |
| 132 | + " from keras.trainers.epoch_iterator import EpochIterator\n", |
| 133 | + "\n", |
| 134 | + " feats = jnp.arange(samples).repeat(10).reshape(samples, 10)\n", |
| 135 | + " labels = jnp.arange(samples).reshape(samples, 1)\n", |
| 136 | + " ds = ArrayDataset(feats, labels)\n", |
| 137 | + " # N % batchsize != 0\n", |
| 138 | + " dl = EpochIterator(feats, labels, batch_size=batch_size, shuffle=False)\n", |
| 139 | + " for _ in range(2):\n", |
| 140 | + " X_list, Y_list = [], []\n", |
| 141 | + " for step, batch in dl.enumerate_epoch('np'):\n", |
| 142 | + " x, y = batch[0]\n", |
| 143 | + " X_list.append(x)\n", |
| 144 | + " Y_list.append(y)\n", |
| 145 | + " _X, _Y = map(jnp.concatenate, (X_list, Y_list))\n", |
| 146 | + " assert jnp.array_equal(_X, feats)\n", |
| 147 | + " assert jnp.array_equal(_Y, labels)\n", |
| 148 | + "\n", |
| 149 | + " dl = EpochIterator(feats, labels, batch_size=batch_size, shuffle=False, )\n", |
| 150 | + " for _ in range(2):\n", |
| 151 | + " X_list, Y_list = [], []\n", |
| 152 | + " for step, batch in dl.enumerate_epoch('np'):\n", |
| 153 | + " x, y = batch[0]\n", |
| 154 | + " X_list.append(x)\n", |
| 155 | + " Y_list.append(y)\n", |
| 156 | + " _X, _Y = map(jnp.concatenate, (X_list, Y_list))\n", |
| 157 | + " last_idx = len(X_list) * batch_size\n", |
| 158 | + " jnp.array_equal(_X, feats[: last_idx])\n", |
| 159 | + " jnp.array_equal(_Y, labels[: last_idx])\n", |
| 160 | + "\n", |
| 161 | + "\n", |
| 162 | + " dl_shuffle = EpochIterator(feats, labels, batch_size=batch_size, shuffle=True, )\n", |
| 163 | + " last_X, last_Y = jnp.array([]), jnp.array([])\n", |
| 164 | + " for _ in range(2):\n", |
| 165 | + " X_list, Y_list = [], []\n", |
| 166 | + " for step, batch in dl_shuffle.enumerate_epoch('np'):\n", |
| 167 | + " x, y = batch[0]\n", |
| 168 | + " assert jnp.array_equal(x[:, :1], y)\n", |
| 169 | + " X_list.append(x)\n", |
| 170 | + " Y_list.append(y)\n", |
| 171 | + " _X, _Y = map(jnp.concatenate, (X_list, Y_list))\n", |
| 172 | + " not jnp.array_equal(_X, feats)\n", |
| 173 | + " not jnp.array_equal(_Y, labels)\n", |
| 174 | + " jnp.sum(_X) == jnp.sum(feats), \\\n", |
| 175 | + " f\"jnp.sum(_X)={jnp.sum(_X)}, jnp.sum(feats)={jnp.sum(feats)}\"\n", |
| 176 | + " not jnp.array_equal(_X, last_X)\n", |
| 177 | + " not jnp.array_equal(_Y, last_Y)\n", |
| 178 | + " last_X, last_Y = _X, _Y\n", |
| 179 | + "\n", |
| 180 | + "\n", |
| 181 | + " dl_shuffle = EpochIterator(feats, labels, batch_size=batch_size, shuffle=True, )\n", |
| 182 | + " for _ in range(2):\n", |
| 183 | + " X_list, Y_list = [], []\n", |
| 184 | + " for step, batch in dl_shuffle.enumerate_epoch('np'):\n", |
| 185 | + " x, y = batch[0]\n", |
| 186 | + " assert jnp.array_equal(x[:, :1], y)\n", |
| 187 | + " X_list.append(x)\n", |
| 188 | + " Y_list.append(y)\n", |
| 189 | + " _X, _Y = map(jnp.concatenate, (X_list, Y_list))\n", |
| 190 | + " not jnp.array_equal(_X, feats)\n", |
| 191 | + " not jnp.array_equal(_Y, labels)\n", |
| 192 | + " len(_X) == len(X_list) * batch_size\n" |
| 193 | + ] |
| 194 | + }, |
123 | 195 | {
|
124 | 196 | "cell_type": "code",
|
125 | 197 | "execution_count": null,
|
|
188 | 260 | " self.pose = 0 # record the current position in the dataset\n",
|
189 | 261 | " self._shuffle()\n",
|
190 | 262 | "\n",
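| + "        # cache the batch count once; __next__ checks self.pose against it\n", |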
|
| 263 | + " self.num_batches = len(self)\n", |
| 264 | + "\n", |
191 | 265 | " def _shuffle(self):\n",
|
192 | 266 | " if self.shuffle:\n",
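| + "            # draw a fresh PRNG key so every epoch gets a different permutation\n", |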
|
193 | 267 | "            self.indices = jax.random.permutation(next(self.keys), self.indices)\n",
|
|
196 | 270 | "        self.pose = 0\n",

197 | 271 | "        self._shuffle()\n",

198 | 272 | "        raise StopIteration\n",
|
199 |
| - "\n", |
| 273 | + " \n", |
200 | 274 | "    def __len__(self):\n",

201 | 275 | "        if self.drop_last:\n",

202 | 276 | "            batches = len(self.dataset) // self.batch_size  # get the floor of division\n",
|
|
205 | 279 | "        return batches\n",
|
206 | 280 | "\n",
|
207 | 281 | "    def __next__(self):\n",
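| + "        # emit batch number self.pose, or reset state and end the epoch\n", |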
|
208 |
| - " if self.pose + self.batch_size <= self.data_len:\n", |
209 |
| - " batch_indices = self.indices[self.pose: self.pose + self.batch_size]\n", |
| 282 | + " if self.pose < self.num_batches:\n", |
| 283 | + " batch_indices = self.indices[self.pose * self.batch_size: (self.pose + 1) * self.batch_size]\n", |
210 | 284 | " batch_data = self.dataset[batch_indices]\n",
|
211 |
| - " self.pose += self.batch_size\n", |
212 |
| - " return batch_data\n", |
213 |
| - " elif self.pose < self.data_len and not self.drop_last:\n", |
214 |
| - " batch_indices = self.indices[self.pose:]\n", |
215 |
| - " batch_data = self.dataset[batch_indices]\n", |
216 |
| - " self.pose += self.batch_size\n", |
| 285 | + " self.pose += 1\n", |
217 | 286 | " return batch_data\n",
|
218 | 287 | "        else:\n",

219 | 288 | "            self._stop_iteration()\n",

220 | 289 | "\n",

221 | 290 | "    def __iter__(self):\n",

222 |
| - "        return self"
| 291 | + "        return self\n"
223 | 292 | ]
|
224 | 293 | },
|
225 | 294 | {
|
226 | 295 | "cell_type": "code",
|
227 | 296 | "execution_count": null,
|
228 | 297 | "metadata": {},
|
229 |
| - "outputs": [], |
| 298 | + "outputs": [ |
| 299 | + { |
| 300 | + "name": "stderr", |
| 301 | + "output_type": "stream", |
| 302 | + "text": [ |
| 303 | + "An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.\n" |
| 304 | + ] |
| 305 | + } |
| 306 | + ], |
230 | 307 | "source": [
|
231 | 308 | "#| hide\n",
|
232 | 309 | "test_dataloader(DataLoaderJax, samples=20, batch_size=12)\n",
|
233 | 310 | "test_dataloader(DataLoaderJax, samples=20, batch_size=10)\n",
|
234 |
| - "test_dataloader(DataLoaderJax, samples=11, batch_size=10)" |
| 311 | + "test_dataloader(DataLoaderJax, samples=11, batch_size=10)\n", |
| 312 | + "test_dataloader(DataLoaderJax, samples=40, batch_size=12)" |
| 313 | + ] |
| 314 | + }, |
| 315 | + { |
| 316 | + "cell_type": "code", |
| 317 | + "execution_count": null, |
| 318 | + "metadata": {}, |
| 319 | + "outputs": [], |
| 320 | + "source": [ |
| 321 | + "#| hide\n", |
| 322 | + "test_keras_dataloader(samples=20, batch_size=12)\n", |
| 323 | + "test_keras_dataloader(samples=20, batch_size=10)\n", |
| 324 | + "test_keras_dataloader(samples=11, batch_size=10)\n", |
| 325 | + "test_keras_dataloader(samples=40, batch_size=12)" |
| 326 | + ] |
| 327 | + }, |
| 328 | + { |
| 329 | + "cell_type": "code", |
| 330 | + "execution_count": null, |
| 331 | + "metadata": {}, |
| 332 | + "outputs": [ |
| 333 | + { |
| 334 | + "name": "stdout", |
| 335 | + "output_type": "stream", |
| 336 | + "text": [ |
| 337 | + "1.48 s ± 29.8 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n" |
| 338 | + ] |
| 339 | + } |
| 340 | + ], |
| 341 | + "source": [ |
| 342 | + "%%timeit -n 5 -r 3\n", |
| 343 | + "test_dataloader(DataLoaderJax, samples=1280, batch_size=10)" |
| 344 | + ] |
| 345 | + }, |
| 346 | + { |
| 347 | + "cell_type": "code", |
| 348 | + "execution_count": null, |
| 349 | + "metadata": {}, |
| 350 | + "outputs": [ |
| 351 | + { |
| 352 | + "name": "stdout", |
| 353 | + "output_type": "stream", |
| 354 | + "text": [ |
| 355 | + "301 ms ± 2.4 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)\n" |
| 356 | + ] |
| 357 | + } |
| 358 | + ], |
| 359 | + "source": [ |
| 360 | + "#| hide\n", |
| 361 | + "%%timeit -n 5 -r 3\n", |
| 362 | + "test_keras_dataloader(samples=1280, batch_size=10)" |
235 | 363 | ]
|
236 | 364 | },
|
237 | 365 | {
|
|