From 237fb6d9fca02582c249f0cf87b836f3f7aa89a7 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 4 Aug 2024 13:34:04 -0400 Subject: [PATCH 01/13] DeepNVMe examples --- .../file_access/aio_read_into_gpu_tensor.py | 23 +++++++++++++++++++ .../aio_read_into_pinned_tensor.py | 22 ++++++++++++++++++ deepnvme/file_access/aio_read_into_tensor.py | 21 +++++++++++++++++ .../gds_read_into_pinned_tensor.py | 23 +++++++++++++++++++ deepnvme/file_access/py_read_into_tensor.py | 17 ++++++++++++++ 5 files changed, 106 insertions(+) create mode 100644 deepnvme/file_access/aio_read_into_gpu_tensor.py create mode 100644 deepnvme/file_access/aio_read_into_pinned_tensor.py create mode 100644 deepnvme/file_access/aio_read_into_tensor.py create mode 100644 deepnvme/file_access/gds_read_into_pinned_tensor.py create mode 100644 deepnvme/file_access/py_read_into_tensor.py diff --git a/deepnvme/file_access/aio_read_into_gpu_tensor.py b/deepnvme/file_access/aio_read_into_gpu_tensor.py new file mode 100644 index 000000000..db5b265a4 --- /dev/null +++ b/deepnvme/file_access/aio_read_into_gpu_tensor.py @@ -0,0 +1,23 @@ +import torch +import os +import timeit +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + + +input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") +aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) +aio_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + +def aio_bounce_read(inp_f, h, t): + read_status = h.sync_pread(t, inp_f) + gpu_tensor = t.cuda() + # print(f'{read_status=}') + + +if __name__ == "__main__": + cnt = 3 + file_sz = os.path.getsize(input_file) + aio_bounce_t = timeit.timeit('aio_bounce_read(input_file, aio_handle, aio_buffer)', setup="from __main__ import aio_bounce_read", globals=globals(), number=cnt) + aio_bounce_gbs = (cnt*file_sz)/aio_bounce_t/1e9 + print(f'aio_bounce: {aio_bounce_gbs:5.2f} GB/sec, {aio_bounce_t:5.2f} secs') diff --git 
a/deepnvme/file_access/aio_read_into_pinned_tensor.py b/deepnvme/file_access/aio_read_into_pinned_tensor.py new file mode 100644 index 000000000..4e7a83935 --- /dev/null +++ b/deepnvme/file_access/aio_read_into_pinned_tensor.py @@ -0,0 +1,22 @@ +import torch +import os +import timeit +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + + +input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") +aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) +aio_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + +def aio_read(inp_f, h, t): + read_status = h.sync_pread(t, inp_f) + # print(f'{read_status=}') + + +if __name__ == "__main__": + cnt = 3 + file_sz = os.path.getsize(input_file) + aio_pinned_t = timeit.timeit('aio_read(input_file, aio_handle, aio_buffer)', setup="from __main__ import aio_read", globals=globals(), number=cnt) + aio_pinned_gbs = (cnt*file_sz)/aio_pinned_t/1e9 + print(f'aio_pinned: {aio_pinned_gbs:5.2f} GB/sec, {aio_pinned_t:5.2f} secs') diff --git a/deepnvme/file_access/aio_read_into_tensor.py b/deepnvme/file_access/aio_read_into_tensor.py new file mode 100644 index 000000000..522fd4ce1 --- /dev/null +++ b/deepnvme/file_access/aio_read_into_tensor.py @@ -0,0 +1,21 @@ +import torch +import os +import timeit +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + + +input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") +aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) +aio_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8) + +def aio_read(inp_f, h, t): + read_status = h.sync_pread(t, inp_f) + # print(f'{read_status=}') + +if __name__ == "__main__": + cnt = 3 + file_sz = os.path.getsize(input_file) + aio_t = timeit.timeit('aio_read(input_file, aio_handle, aio_buffer)', setup="from __main__ import aio_read", globals=globals(), number=cnt) + aio_gbs = (cnt*file_sz)/aio_t/1e9 + print(f'aio: 
{aio_gbs:5.2f} GB/sec, {aio_t:5.2f} secs') diff --git a/deepnvme/file_access/gds_read_into_pinned_tensor.py b/deepnvme/file_access/gds_read_into_pinned_tensor.py new file mode 100644 index 000000000..bc0064ddc --- /dev/null +++ b/deepnvme/file_access/gds_read_into_pinned_tensor.py @@ -0,0 +1,23 @@ +import torch +import os +import timeit +import deepspeed +from deepspeed.ops.op_builder import GDSBuilder + + +input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") +gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) +gds_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8, device='cuda', requires_grad=False) + +def gds_read(inp_f, h, t): + read_status = h.sync_pread(t, inp_f) + # print(f'{read_status=}') + +if __name__ == "__main__": + cnt = 3 + file_sz = os.path.getsize(input_file) + gds_handle.new_device_locked_tensor(gds_buffer) + gds_pinned_t = timeit.timeit('gds_read(input_file, gds_handle, gds_buffer)', setup="from __main__ import gds_read", globals=globals(), number=cnt) + gds_handle.free_device_locked_tensor(gds_buffer) + gds_pinned_gbs = (cnt*file_sz)/gds_pinned_t/1e9 + print(f'gds_pinned: {gds_pinned_gbs:5.2f} GB/sec, {gds_pinned_t:5.2f} secs') diff --git a/deepnvme/file_access/py_read_into_tensor.py b/deepnvme/file_access/py_read_into_tensor.py new file mode 100644 index 000000000..d0137f07b --- /dev/null +++ b/deepnvme/file_access/py_read_into_tensor.py @@ -0,0 +1,17 @@ +import torch +import os +import timeit + +input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") + +def py_read(inp_f): + with open(inp_f, 'rb') as f: + t = torch.frombuffer(f.read(), dtype=torch.uint8) + # print(f'{t.size()=}') + +if __name__ == "__main__": + cnt = 3 + file_sz = os.path.getsize(input_file) + py_t = timeit.timeit('py_read(input_file)', setup="from __main__ import py_read", globals=globals(), number=cnt) + py_gbs = (cnt*file_sz)/py_t/1e9 + print(f'py: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') From 
be6df576084b523b94e158c66f7ff22e5a8d7385 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 5 Aug 2024 08:44:22 -0400 Subject: [PATCH 02/13] Update files --- .../file_access/aio_read_into_gpu_tensor.py | 23 ------------------- .../aio_read_into_pinned_tensor.py | 22 ------------------ deepnvme/file_access/aio_read_into_tensor.py | 21 ----------------- .../gds_read_into_pinned_tensor.py | 23 ------------------- deepnvme/file_access/py_read_into_tensor.py | 17 -------------- 5 files changed, 106 deletions(-) delete mode 100644 deepnvme/file_access/aio_read_into_gpu_tensor.py delete mode 100644 deepnvme/file_access/aio_read_into_pinned_tensor.py delete mode 100644 deepnvme/file_access/aio_read_into_tensor.py delete mode 100644 deepnvme/file_access/gds_read_into_pinned_tensor.py delete mode 100644 deepnvme/file_access/py_read_into_tensor.py diff --git a/deepnvme/file_access/aio_read_into_gpu_tensor.py b/deepnvme/file_access/aio_read_into_gpu_tensor.py deleted file mode 100644 index db5b265a4..000000000 --- a/deepnvme/file_access/aio_read_into_gpu_tensor.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -import os -import timeit -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - - -input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") -aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) -aio_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() - -def aio_bounce_read(inp_f, h, t): - read_status = h.sync_pread(t, inp_f) - gpu_tensor = t.cuda() - # print(f'{read_status=}') - - -if __name__ == "__main__": - cnt = 3 - file_sz = os.path.getsize(input_file) - aio_bounce_t = timeit.timeit('aio_bounce_read(input_file, aio_handle, aio_buffer)', setup="from __main__ import aio_bounce_read", globals=globals(), number=cnt) - aio_bounce_gbs = (cnt*file_sz)/aio_bounce_t/1e9 - print(f'aio_bounce: {aio_bounce_gbs:5.2f} GB/sec, {aio_bounce_t:5.2f} secs') diff --git 
a/deepnvme/file_access/aio_read_into_pinned_tensor.py b/deepnvme/file_access/aio_read_into_pinned_tensor.py deleted file mode 100644 index 4e7a83935..000000000 --- a/deepnvme/file_access/aio_read_into_pinned_tensor.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -import os -import timeit -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - - -input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") -aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) -aio_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() - -def aio_read(inp_f, h, t): - read_status = h.sync_pread(t, inp_f) - # print(f'{read_status=}') - - -if __name__ == "__main__": - cnt = 3 - file_sz = os.path.getsize(input_file) - aio_pinned_t = timeit.timeit('aio_read(input_file, aio_handle, aio_buffer)', setup="from __main__ import aio_read", globals=globals(), number=cnt) - aio_pinned_gbs = (cnt*file_sz)/aio_pinned_t/1e9 - print(f'aio_pinned: {aio_pinned_gbs:5.2f} GB/sec, {aio_pinned_t:5.2f} secs') diff --git a/deepnvme/file_access/aio_read_into_tensor.py b/deepnvme/file_access/aio_read_into_tensor.py deleted file mode 100644 index 522fd4ce1..000000000 --- a/deepnvme/file_access/aio_read_into_tensor.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch -import os -import timeit -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - - -input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") -aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) -aio_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8) - -def aio_read(inp_f, h, t): - read_status = h.sync_pread(t, inp_f) - # print(f'{read_status=}') - -if __name__ == "__main__": - cnt = 3 - file_sz = os.path.getsize(input_file) - aio_t = timeit.timeit('aio_read(input_file, aio_handle, aio_buffer)', setup="from __main__ import aio_read", globals=globals(), number=cnt) - aio_gbs = (cnt*file_sz)/aio_t/1e9 - print(f'aio: 
{aio_gbs:5.2f} GB/sec, {aio_t:5.2f} secs') diff --git a/deepnvme/file_access/gds_read_into_pinned_tensor.py b/deepnvme/file_access/gds_read_into_pinned_tensor.py deleted file mode 100644 index bc0064ddc..000000000 --- a/deepnvme/file_access/gds_read_into_pinned_tensor.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -import os -import timeit -import deepspeed -from deepspeed.ops.op_builder import GDSBuilder - - -input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") -gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) -gds_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8, device='cuda', requires_grad=False) - -def gds_read(inp_f, h, t): - read_status = h.sync_pread(t, inp_f) - # print(f'{read_status=}') - -if __name__ == "__main__": - cnt = 3 - file_sz = os.path.getsize(input_file) - gds_handle.new_device_locked_tensor(gds_buffer) - gds_pinned_t = timeit.timeit('gds_read(input_file, gds_handle, gds_buffer)', setup="from __main__ import gds_read", globals=globals(), number=cnt) - gds_handle.free_device_locked_tensor(gds_buffer) - gds_pinned_gbs = (cnt*file_sz)/gds_pinned_t/1e9 - print(f'gds_pinned: {gds_pinned_gbs:5.2f} GB/sec, {gds_pinned_t:5.2f} secs') diff --git a/deepnvme/file_access/py_read_into_tensor.py b/deepnvme/file_access/py_read_into_tensor.py deleted file mode 100644 index d0137f07b..000000000 --- a/deepnvme/file_access/py_read_into_tensor.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch -import os -import timeit - -input_file = os.path.join("/mnt", "nvme03", "aio", "test_1GB.pt") - -def py_read(inp_f): - with open(inp_f, 'rb') as f: - t = torch.frombuffer(f.read(), dtype=torch.uint8) - # print(f'{t.size()=}') - -if __name__ == "__main__": - cnt = 3 - file_sz = os.path.getsize(input_file) - py_t = timeit.timeit('py_read(input_file)', setup="from __main__ import py_read", globals=globals(), number=cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') From 
eb0e5dae21a715acd3ec313f6d05a1e6b94f2c8d Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 5 Aug 2024 08:45:05 -0400 Subject: [PATCH 03/13] Rewrite examples --- .../bounce_read_into_cpu_tensor.py | 26 ++++++++++++ .../bounce_read_into_gpu_tensor.py | 27 ++++++++++++ .../bounce_write_from_cpu_tensor.py | 33 +++++++++++++++ .../bounce_write_from_gpu_tensor.py | 31 ++++++++++++++ .../file_access/gds_read_into_gpu_tensor.py | 27 ++++++++++++ .../file_access/gds_write_from_gpu_tensor.py | 31 ++++++++++++++ .../file_access/py_read_into_cpu_tensor.py | 20 +++++++++ .../file_access/py_read_into_gpu_tensor.py | 20 +++++++++ .../file_access/py_write_from_cpu_tensor.py | 28 +++++++++++++ .../file_access/py_write_from_gpu_tensor.py | 29 +++++++++++++ deepnvme/file_access/utils.py | 41 +++++++++++++++++++ 11 files changed, 313 insertions(+) create mode 100644 deepnvme/file_access/bounce_read_into_cpu_tensor.py create mode 100644 deepnvme/file_access/bounce_read_into_gpu_tensor.py create mode 100644 deepnvme/file_access/bounce_write_from_cpu_tensor.py create mode 100644 deepnvme/file_access/bounce_write_from_gpu_tensor.py create mode 100644 deepnvme/file_access/gds_read_into_gpu_tensor.py create mode 100644 deepnvme/file_access/gds_write_from_gpu_tensor.py create mode 100644 deepnvme/file_access/py_read_into_cpu_tensor.py create mode 100644 deepnvme/file_access/py_read_into_gpu_tensor.py create mode 100644 deepnvme/file_access/py_write_from_cpu_tensor.py create mode 100644 deepnvme/file_access/py_write_from_gpu_tensor.py create mode 100644 deepnvme/file_access/utils.py diff --git a/deepnvme/file_access/bounce_read_into_cpu_tensor.py b/deepnvme/file_access/bounce_read_into_cpu_tensor.py new file mode 100644 index 000000000..40ad9fdb5 --- /dev/null +++ b/deepnvme/file_access/bounce_read_into_cpu_tensor.py @@ -0,0 +1,26 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments +import deepspeed +from deepspeed.ops.op_builder import 
AsyncIOBuilder + +def aio_bounce_buffer_read(inp_f, h, bounce_buffer): + read_status = h.sync_pread(bounce_buffer, inp_f) + t = bounce_buffer.cpu() + +def main(): + cnt = 3 + args = parse_read_arguments() + + input_file = args.input_file + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + + t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) + bb_t = t.timeit(cnt) + bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 + print(f'bb read into cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/bounce_read_into_gpu_tensor.py b/deepnvme/file_access/bounce_read_into_gpu_tensor.py new file mode 100644 index 000000000..6306fa414 --- /dev/null +++ b/deepnvme/file_access/bounce_read_into_gpu_tensor.py @@ -0,0 +1,27 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_read(inp_f, h, bounce_buffer): + read_status = h.sync_pread(bounce_buffer, inp_f) + t = bounce_buffer.cuda() + + +def main(): + cnt = 3 + args = parse_read_arguments() + + input_file = args.input_file + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + + t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) + bb_t = t.timeit(cnt) + bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 + print(f'bb read into gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/bounce_write_from_cpu_tensor.py b/deepnvme/file_access/bounce_write_from_cpu_tensor.py new file mode 100644 index 000000000..c83832ada --- /dev/null +++ 
b/deepnvme/file_access/bounce_write_from_cpu_tensor.py @@ -0,0 +1,33 @@ +import torch +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): + bounce_buffer.copy_(t) + h.sync_pwrite(bounce_buffer, out_f) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + + + t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, bounce_buffer)) + + bb_t = t.timeit(cnt) + bb_gbs = (cnt*file_sz)/bb_t/1e9 + print(f'bb write from cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + pathlib.Path(output_file).unlink(missing_ok=True) + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/bounce_write_from_gpu_tensor.py b/deepnvme/file_access/bounce_write_from_gpu_tensor.py new file mode 100644 index 000000000..954e1b6f9 --- /dev/null +++ b/deepnvme/file_access/bounce_write_from_gpu_tensor.py @@ -0,0 +1,31 @@ +import torch +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): + bounce_buffer.copy_(t) + h.sync_pwrite(bounce_buffer, out_f) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + 
app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + + t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, bounce_buffer)) + + bb_t = t.timeit(cnt) + bb_gbs = (cnt*file_sz)/bb_t/1e9 + print(f'bb write from gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/gds_read_into_gpu_tensor.py b/deepnvme/file_access/gds_read_into_gpu_tensor.py new file mode 100644 index 000000000..46f72e632 --- /dev/null +++ b/deepnvme/file_access/gds_read_into_gpu_tensor.py @@ -0,0 +1,27 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments +import deepspeed +from deepspeed.ops.op_builder import GDSBuilder + +def gds_read(inp_f, h, gpu_buffer): + read_status = h.sync_pread(gpu_buffer, inp_f) + t = gpu_buffer.cuda() + +def main(): + cnt = 3 + args = parse_read_arguments() + + input_file = args.input_file + file_sz = os.path.getsize(input_file) + gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) + gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(gds_read, input_file, gds_handle, gds_buffer)) + gds_t = t.timeit(cnt) + gds_gbs = (cnt*os.path.getsize(input_file))/gds_t/1e9 + print(f'gds read into gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/gds_write_from_gpu_tensor.py b/deepnvme/file_access/gds_write_from_gpu_tensor.py new file mode 100644 index 000000000..aef755371 --- /dev/null +++ b/deepnvme/file_access/gds_write_from_gpu_tensor.py @@ -0,0 +1,31 @@ +import torch +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments 
+import deepspeed +from deepspeed.ops.op_builder import GDSBuilder + +def gds_write(out_f, t, h, gpu_buffer): + gpu_buffer.copy_(t) + h.sync_pwrite(gpu_buffer, out_f) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) + gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(gds_write, output_file, app_tensor, gds_handle, gds_buffer)) + + gds_t = t.timeit(cnt) + gds_gbs = (cnt*file_sz)/gds_t/1e9 + print(f'gds write from gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_read_into_cpu_tensor.py b/deepnvme/file_access/py_read_into_cpu_tensor.py new file mode 100644 index 000000000..cb5ff0f98 --- /dev/null +++ b/deepnvme/file_access/py_read_into_cpu_tensor.py @@ -0,0 +1,20 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments + +def py_read(inp_f): + with open(inp_f, 'rb') as f: + t = torch.frombuffer(f.read(), dtype=torch.uint8) + +def main(): + cnt = 3 + args = parse_read_arguments() + input_file = args.input_file + t = timeit.Timer(functools.partial(py_read, input_file)) + py_t = t.timeit(cnt) + py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 + print(f'py read into cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_read_into_gpu_tensor.py b/deepnvme/file_access/py_read_into_gpu_tensor.py new file mode 100644 index 000000000..77eea7f5a --- /dev/null +++ b/deepnvme/file_access/py_read_into_gpu_tensor.py @@ -0,0 +1,20 @@ +import torch +import os +import timeit, functools 
+from utils import parse_read_arguments + +def py_read(inp_f): + with open(inp_f, 'rb') as f: + t = torch.frombuffer(f.read(), dtype=torch.uint8).cuda() + +def main(): + cnt = 3 + args = parse_read_arguments() + input_file = args.input_file + t = timeit.Timer(functools.partial(py_read, input_file)) + py_t = t.timeit(cnt) + py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 + print(f'py read into gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_write_from_cpu_tensor.py b/deepnvme/file_access/py_write_from_cpu_tensor.py new file mode 100644 index 000000000..a2d88ae35 --- /dev/null +++ b/deepnvme/file_access/py_write_from_cpu_tensor.py @@ -0,0 +1,28 @@ +import torch +import numpy +import os +import timeit, functools +from utils import parse_write_arguments +import pathlib + +def py_write(out_f, t): + with open(out_f, 'wb') as f: + f.write(t.numpy(force=True)) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + + t = timeit.Timer(functools.partial(py_write, output_file, cpu_tensor)) + + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/py_t/1e9 + print(f'py write from cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + pathlib.Path(output_file).unlink(missing_ok=True) + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_write_from_gpu_tensor.py b/deepnvme/file_access/py_write_from_gpu_tensor.py new file mode 100644 index 000000000..c31f5b7af --- /dev/null +++ b/deepnvme/file_access/py_write_from_gpu_tensor.py @@ -0,0 +1,29 @@ +import torch +import numpy +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments + +def py_write(out_f, t): + with open(out_f, 'wb') as f: + 
f.write(t.numpy(force=True)) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(py_write, output_file, gpu_tensor)) + + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/py_t/1e9 + print(f'py write from gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + pathlib.Path(output_file).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/utils.py b/deepnvme/file_access/utils.py new file mode 100644 index 000000000..4520ef695 --- /dev/null +++ b/deepnvme/file_access/utils.py @@ -0,0 +1,41 @@ +import os +import argparse + + +def parse_read_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_file', + default=None, + type=str, + required=True, + help='File to read.') + args = parser.parse_args() + print(f'args = {args}') + if not os.path.isfile(args.input_file): + print(f'Invalid input file path: {args.input_file}') + quit() + + return args + + + +def parse_write_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--output_folder', + default=None, + type=str, + required=True, + help='Output folder for file write.') + parser.add_argument('--mb_size', + type=int, + default=None, + required=True, + help='Size of tensor to save in MB.') + + args = parser.parse_args() + print(f'args = {args}') + if not os.path.isdir(args.output_folder): + print(f'Invalid output folder path: {args.output_folder}') + quit() + + return args From 75cd318e4723c14a907280b4ce687ddcbabe23de Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 5 Aug 2024 09:12:25 -0400 Subject: [PATCH 04/13] Add README and rename files --- .../bounce_read_into_cpu_tensor.py | 26 --------------- .../bounce_read_into_gpu_tensor.py 
| 27 --------------- .../bounce_write_from_cpu_tensor.py | 33 ------------------- .../bounce_write_from_gpu_tensor.py | 31 ----------------- .../file_access/gds_read_into_gpu_tensor.py | 27 --------------- .../file_access/gds_write_from_gpu_tensor.py | 31 ----------------- .../file_access/py_read_into_cpu_tensor.py | 20 ----------- .../file_access/py_read_into_gpu_tensor.py | 20 ----------- .../file_access/py_write_from_cpu_tensor.py | 28 ---------------- .../file_access/py_write_from_gpu_tensor.py | 29 ---------------- 10 files changed, 272 deletions(-) delete mode 100644 deepnvme/file_access/bounce_read_into_cpu_tensor.py delete mode 100644 deepnvme/file_access/bounce_read_into_gpu_tensor.py delete mode 100644 deepnvme/file_access/bounce_write_from_cpu_tensor.py delete mode 100644 deepnvme/file_access/bounce_write_from_gpu_tensor.py delete mode 100644 deepnvme/file_access/gds_read_into_gpu_tensor.py delete mode 100644 deepnvme/file_access/gds_write_from_gpu_tensor.py delete mode 100644 deepnvme/file_access/py_read_into_cpu_tensor.py delete mode 100644 deepnvme/file_access/py_read_into_gpu_tensor.py delete mode 100644 deepnvme/file_access/py_write_from_cpu_tensor.py delete mode 100644 deepnvme/file_access/py_write_from_gpu_tensor.py diff --git a/deepnvme/file_access/bounce_read_into_cpu_tensor.py b/deepnvme/file_access/bounce_read_into_cpu_tensor.py deleted file mode 100644 index 40ad9fdb5..000000000 --- a/deepnvme/file_access/bounce_read_into_cpu_tensor.py +++ /dev/null @@ -1,26 +0,0 @@ -import torch -import os -import timeit, functools -from utils import parse_read_arguments -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - -def aio_bounce_buffer_read(inp_f, h, bounce_buffer): - read_status = h.sync_pread(bounce_buffer, inp_f) - t = bounce_buffer.cpu() - -def main(): - cnt = 3 - args = parse_read_arguments() - - input_file = args.input_file - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) - bounce_buffer = 
torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() - - t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) - bb_t = t.timeit(cnt) - bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 - print(f'bb read into cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/bounce_read_into_gpu_tensor.py b/deepnvme/file_access/bounce_read_into_gpu_tensor.py deleted file mode 100644 index 6306fa414..000000000 --- a/deepnvme/file_access/bounce_read_into_gpu_tensor.py +++ /dev/null @@ -1,27 +0,0 @@ -import torch -import os -import timeit, functools -from utils import parse_read_arguments -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - -def aio_bounce_buffer_read(inp_f, h, bounce_buffer): - read_status = h.sync_pread(bounce_buffer, inp_f) - t = bounce_buffer.cuda() - - -def main(): - cnt = 3 - args = parse_read_arguments() - - input_file = args.input_file - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) - bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() - - t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) - bb_t = t.timeit(cnt) - bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 - print(f'bb read into gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/bounce_write_from_cpu_tensor.py b/deepnvme/file_access/bounce_write_from_cpu_tensor.py deleted file mode 100644 index c83832ada..000000000 --- a/deepnvme/file_access/bounce_write_from_cpu_tensor.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - -def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): - bounce_buffer.copy_(t) - 
h.sync_pwrite(bounce_buffer, out_f) - -def main(): - cnt = 3 - args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') - pathlib.Path(output_file).unlink(missing_ok=True) - file_sz = args.mb_size*(1024**2) - app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) - - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) - bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() - - - t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, bounce_buffer)) - - bb_t = t.timeit(cnt) - bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bb write from cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') - pathlib.Path(output_file).unlink(missing_ok=True) - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/bounce_write_from_gpu_tensor.py b/deepnvme/file_access/bounce_write_from_gpu_tensor.py deleted file mode 100644 index 954e1b6f9..000000000 --- a/deepnvme/file_access/bounce_write_from_gpu_tensor.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder - -def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): - bounce_buffer.copy_(t) - h.sync_pwrite(bounce_buffer, out_f) - -def main(): - cnt = 3 - args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') - pathlib.Path(output_file).unlink(missing_ok=True) - file_sz = args.mb_size*(1024**2) - app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) - bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() - - t = timeit.Timer(functools.partial(aio_bounce_buffer_write, 
output_file, app_tensor, aio_handle, bounce_buffer)) - - bb_t = t.timeit(cnt) - bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bb write from gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/gds_read_into_gpu_tensor.py b/deepnvme/file_access/gds_read_into_gpu_tensor.py deleted file mode 100644 index 46f72e632..000000000 --- a/deepnvme/file_access/gds_read_into_gpu_tensor.py +++ /dev/null @@ -1,27 +0,0 @@ -import torch -import os -import timeit, functools -from utils import parse_read_arguments -import deepspeed -from deepspeed.ops.op_builder import GDSBuilder - -def gds_read(inp_f, h, gpu_buffer): - read_status = h.sync_pread(gpu_buffer, inp_f) - t = gpu_buffer.cuda() - -def main(): - cnt = 3 - args = parse_read_arguments() - - input_file = args.input_file - file_sz = os.path.getsize(input_file) - gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) - gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - - t = timeit.Timer(functools.partial(gds_read, input_file, gds_handle, gds_buffer)) - gds_t = t.timeit(cnt) - gds_gbs = (cnt*os.path.getsize(input_file))/gds_t/1e9 - print(f'gds read into gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/gds_write_from_gpu_tensor.py b/deepnvme/file_access/gds_write_from_gpu_tensor.py deleted file mode 100644 index aef755371..000000000 --- a/deepnvme/file_access/gds_write_from_gpu_tensor.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments -import deepspeed -from deepspeed.ops.op_builder import GDSBuilder - -def gds_write(out_f, t, h, gpu_buffer): - gpu_buffer.copy_(t) - h.sync_pwrite(gpu_buffer, out_f) - -def main(): - cnt = 3 - args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') - 
pathlib.Path(output_file).unlink(missing_ok=True) - file_sz = args.mb_size*(1024**2) - app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - - gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) - gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - - t = timeit.Timer(functools.partial(gds_write, output_file, app_tensor, gds_handle, gds_buffer)) - - gds_t = t.timeit(cnt) - gds_gbs = (cnt*file_sz)/gds_t/1e9 - print(f'gds write from gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/py_read_into_cpu_tensor.py b/deepnvme/file_access/py_read_into_cpu_tensor.py deleted file mode 100644 index cb5ff0f98..000000000 --- a/deepnvme/file_access/py_read_into_cpu_tensor.py +++ /dev/null @@ -1,20 +0,0 @@ -import torch -import os -import timeit, functools -from utils import parse_read_arguments - -def py_read(inp_f): - with open(inp_f, 'rb') as f: - t = torch.frombuffer(f.read(), dtype=torch.uint8) - -def main(): - cnt = 3 - args = parse_read_arguments() - input_file = args.input_file - t = timeit.Timer(functools.partial(py_read, input_file)) - py_t = t.timeit(cnt) - py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 - print(f'py read into cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/py_read_into_gpu_tensor.py b/deepnvme/file_access/py_read_into_gpu_tensor.py deleted file mode 100644 index 77eea7f5a..000000000 --- a/deepnvme/file_access/py_read_into_gpu_tensor.py +++ /dev/null @@ -1,20 +0,0 @@ -import torch -import os -import timeit, functools -from utils import parse_read_arguments - -def py_read(inp_f): - with open(inp_f, 'rb') as f: - t = torch.frombuffer(f.read(), dtype=torch.uint8).cuda() - -def main(): - cnt = 3 - args = parse_read_arguments() - input_file = args.input_file - t = timeit.Timer(functools.partial(py_read, input_file)) - 
py_t = t.timeit(cnt) - py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 - print(f'py read into gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/py_write_from_cpu_tensor.py b/deepnvme/file_access/py_write_from_cpu_tensor.py deleted file mode 100644 index a2d88ae35..000000000 --- a/deepnvme/file_access/py_write_from_cpu_tensor.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch -import numpy -import os -import timeit, functools -from utils import parse_write_arguments -import pathlib - -def py_write(out_f, t): - with open(out_f, 'wb') as f: - f.write(t.numpy(force=True)) - -def main(): - cnt = 3 - args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') - pathlib.Path(output_file).unlink(missing_ok=True) - file_sz = args.mb_size*(1024**2) - cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) - - t = timeit.Timer(functools.partial(py_write, output_file, cpu_tensor)) - - py_t = t.timeit(cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py write from cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') - pathlib.Path(output_file).unlink(missing_ok=True) - -if __name__ == "__main__": - main() diff --git a/deepnvme/file_access/py_write_from_gpu_tensor.py b/deepnvme/file_access/py_write_from_gpu_tensor.py deleted file mode 100644 index c31f5b7af..000000000 --- a/deepnvme/file_access/py_write_from_gpu_tensor.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -import numpy -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments - -def py_write(out_f, t): - with open(out_f, 'wb') as f: - f.write(t.numpy(force=True)) - -def main(): - cnt = 3 - args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') - pathlib.Path(output_file).unlink(missing_ok=True) - file_sz = args.mb_size*(1024**2) - gpu_tensor = torch.empty(file_sz, 
dtype=torch.uint8, device='cuda', requires_grad=False) - - t = timeit.Timer(functools.partial(py_write, output_file, gpu_tensor)) - - py_t = t.timeit(cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py write from gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') - pathlib.Path(output_file).unlink(missing_ok=True) - - -if __name__ == "__main__": - main() From f0877bf8f8a7fa7c55a1151829bbdb10149d2244 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 5 Aug 2024 09:12:44 -0400 Subject: [PATCH 05/13] Rename files --- deepnvme/file_access/README.md | 14 ++++++++ .../bounce_buffer_load_cpu_tensor.py | 26 +++++++++++++++ .../bounce_buffer_load_gpu_tensor.py | 27 +++++++++++++++ .../bounce_buffer_store_cpu_tensor.py | 33 +++++++++++++++++++ .../bounce_buffer_store_gpu_tensor.py | 31 +++++++++++++++++ deepnvme/file_access/gds_load_gpu_tensor.py | 27 +++++++++++++++ deepnvme/file_access/gds_store_gpu_tensor.py | 31 +++++++++++++++++ deepnvme/file_access/py_load_cpu_tensor.py | 20 +++++++++++ deepnvme/file_access/py_load_gpu_tensor.py | 20 +++++++++++ deepnvme/file_access/py_store_cpu_tensor.py | 28 ++++++++++++++++ deepnvme/file_access/py_store_gpu_tensor.py | 29 ++++++++++++++++ 11 files changed, 286 insertions(+) create mode 100644 deepnvme/file_access/README.md create mode 100644 deepnvme/file_access/bounce_buffer_load_cpu_tensor.py create mode 100644 deepnvme/file_access/bounce_buffer_load_gpu_tensor.py create mode 100644 deepnvme/file_access/bounce_buffer_store_cpu_tensor.py create mode 100644 deepnvme/file_access/bounce_buffer_store_gpu_tensor.py create mode 100644 deepnvme/file_access/gds_load_gpu_tensor.py create mode 100644 deepnvme/file_access/gds_store_gpu_tensor.py create mode 100644 deepnvme/file_access/py_load_cpu_tensor.py create mode 100644 deepnvme/file_access/py_load_gpu_tensor.py create mode 100644 deepnvme/file_access/py_store_cpu_tensor.py create mode 100644 deepnvme/file_access/py_store_gpu_tensor.py diff --git a/deepnvme/file_access/README.md 
b/deepnvme/file_access/README.md new file mode 100644 index 000000000..aed9b21ae --- /dev/null +++ b/deepnvme/file_access/README.md @@ -0,0 +1,14 @@ +# Using DeepNVMe to implement simple file operations CPU/GPU tensors + +This folder contains examples illustrating how to use DeepNVMe to implement simple file operations for moving data between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. + +The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations. + + +File Operation | Python | DeepNVMe (CPU bounce buffer) | DeepNVMe (GDS) +|---|---|---|---| +Load CPU tensor from file | py_load_cpu_tensor.py | bounce_buffer_load_cpu_tensor.py | - | +Load GPU tensor from file | py_load_gpu_tensor.py | bounce_buffer_load_gpu_tensor.py | gds_load_gpu_tensor.py | +Store CPU tensor to file | py_store_cpu_tensor.py | bounce_buffer_store_cpu_tensor.py | - | +Store GPU tensor to file | py_store_gpu_tensor.py | bounce_buffer_store_gpu_tensor.py | gds_store_gpu_tensor.py | + diff --git a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py new file mode 100644 index 000000000..40ad9fdb5 --- /dev/null +++ b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py @@ -0,0 +1,26 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_read(inp_f, h, bounce_buffer): + read_status = h.sync_pread(bounce_buffer, inp_f) + t = bounce_buffer.cpu() + +def main(): + cnt = 3 + args = parse_read_arguments() + + input_file = args.input_file + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + bounce_buffer = torch.empty(os.path.getsize(input_file), 
dtype=torch.uint8).pin_memory() + + t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) + bb_t = t.timeit(cnt) + bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 + print(f'bb read into cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py new file mode 100644 index 000000000..6306fa414 --- /dev/null +++ b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py @@ -0,0 +1,27 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_read(inp_f, h, bounce_buffer): + read_status = h.sync_pread(bounce_buffer, inp_f) + t = bounce_buffer.cuda() + + +def main(): + cnt = 3 + args = parse_read_arguments() + + input_file = args.input_file + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + + t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) + bb_t = t.timeit(cnt) + bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 + print(f'bb read into gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py new file mode 100644 index 000000000..c83832ada --- /dev/null +++ b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py @@ -0,0 +1,33 @@ +import torch +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): + bounce_buffer.copy_(t) + h.sync_pwrite(bounce_buffer, out_f) + 
+def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + + + t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, bounce_buffer)) + + bb_t = t.timeit(cnt) + bb_gbs = (cnt*file_sz)/bb_t/1e9 + print(f'bb write from cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + pathlib.Path(output_file).unlink(missing_ok=True) + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py new file mode 100644 index 000000000..954e1b6f9 --- /dev/null +++ b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py @@ -0,0 +1,31 @@ +import torch +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder + +def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): + bounce_buffer.copy_(t) + h.sync_pwrite(bounce_buffer, out_f) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + + t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, 
bounce_buffer)) + + bb_t = t.timeit(cnt) + bb_gbs = (cnt*file_sz)/bb_t/1e9 + print(f'bb write from gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py new file mode 100644 index 000000000..46f72e632 --- /dev/null +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -0,0 +1,27 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments +import deepspeed +from deepspeed.ops.op_builder import GDSBuilder + +def gds_read(inp_f, h, gpu_buffer): + read_status = h.sync_pread(gpu_buffer, inp_f) + t = gpu_buffer.cuda() + +def main(): + cnt = 3 + args = parse_read_arguments() + + input_file = args.input_file + file_sz = os.path.getsize(input_file) + gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) + gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(gds_read, input_file, gds_handle, gds_buffer)) + gds_t = t.timeit(cnt) + gds_gbs = (cnt*os.path.getsize(input_file))/gds_t/1e9 + print(f'gds read into gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py b/deepnvme/file_access/gds_store_gpu_tensor.py new file mode 100644 index 000000000..aef755371 --- /dev/null +++ b/deepnvme/file_access/gds_store_gpu_tensor.py @@ -0,0 +1,31 @@ +import torch +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments +import deepspeed +from deepspeed.ops.op_builder import GDSBuilder + +def gds_write(out_f, t, h, gpu_buffer): + gpu_buffer.copy_(t) + h.sync_pwrite(gpu_buffer, out_f) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + 
app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) + gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(gds_write, output_file, app_tensor, gds_handle, gds_buffer)) + + gds_t = t.timeit(cnt) + gds_gbs = (cnt*file_sz)/gds_t/1e9 + print(f'gds write from gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_load_cpu_tensor.py b/deepnvme/file_access/py_load_cpu_tensor.py new file mode 100644 index 000000000..cb5ff0f98 --- /dev/null +++ b/deepnvme/file_access/py_load_cpu_tensor.py @@ -0,0 +1,20 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments + +def py_read(inp_f): + with open(inp_f, 'rb') as f: + t = torch.frombuffer(f.read(), dtype=torch.uint8) + +def main(): + cnt = 3 + args = parse_read_arguments() + input_file = args.input_file + t = timeit.Timer(functools.partial(py_read, input_file)) + py_t = t.timeit(cnt) + py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 + print(f'py read into cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_load_gpu_tensor.py b/deepnvme/file_access/py_load_gpu_tensor.py new file mode 100644 index 000000000..77eea7f5a --- /dev/null +++ b/deepnvme/file_access/py_load_gpu_tensor.py @@ -0,0 +1,20 @@ +import torch +import os +import timeit, functools +from utils import parse_read_arguments + +def py_read(inp_f): + with open(inp_f, 'rb') as f: + t = torch.frombuffer(f.read(), dtype=torch.uint8).cuda() + +def main(): + cnt = 3 + args = parse_read_arguments() + input_file = args.input_file + t = timeit.Timer(functools.partial(py_read, input_file)) + py_t = t.timeit(cnt) + py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 + print(f'py read into gpu: {py_gbs:5.2f} GB/sec, 
{py_t:5.2f} secs') + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_store_cpu_tensor.py b/deepnvme/file_access/py_store_cpu_tensor.py new file mode 100644 index 000000000..a2d88ae35 --- /dev/null +++ b/deepnvme/file_access/py_store_cpu_tensor.py @@ -0,0 +1,28 @@ +import torch +import numpy +import os +import timeit, functools +from utils import parse_write_arguments +import pathlib + +def py_write(out_f, t): + with open(out_f, 'wb') as f: + f.write(t.numpy(force=True)) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + + t = timeit.Timer(functools.partial(py_write, output_file, cpu_tensor)) + + py_t = t.timeit(cnt) + py_gbs = (cnt*file_sz)/py_t/1e9 + print(f'py write from cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + pathlib.Path(output_file).unlink(missing_ok=True) + +if __name__ == "__main__": + main() diff --git a/deepnvme/file_access/py_store_gpu_tensor.py b/deepnvme/file_access/py_store_gpu_tensor.py new file mode 100644 index 000000000..c31f5b7af --- /dev/null +++ b/deepnvme/file_access/py_store_gpu_tensor.py @@ -0,0 +1,29 @@ +import torch +import numpy +import os +import timeit, functools +import pathlib +from utils import parse_write_arguments + +def py_write(out_f, t): + with open(out_f, 'wb') as f: + f.write(t.numpy(force=True)) + +def main(): + cnt = 3 + args = parse_write_arguments() + output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + pathlib.Path(output_file).unlink(missing_ok=True) + file_sz = args.mb_size*(1024**2) + gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + + t = timeit.Timer(functools.partial(py_write, output_file, gpu_tensor)) + + py_t = t.timeit(cnt) + py_gbs = 
(cnt*file_sz)/py_t/1e9 + print(f'py write from gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + pathlib.Path(output_file).unlink(missing_ok=True) + + +if __name__ == "__main__": + main() From e0fe1a262845e2989d7bbb068cf8ce149b21fe9a Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 5 Aug 2024 09:36:31 -0400 Subject: [PATCH 06/13] Fix naming --- deepnvme/file_access/README.md | 2 +- deepnvme/file_access/bounce_buffer_load_cpu_tensor.py | 6 +++--- deepnvme/file_access/bounce_buffer_load_gpu_tensor.py | 6 +++--- deepnvme/file_access/bounce_buffer_store_cpu_tensor.py | 6 +++--- deepnvme/file_access/bounce_buffer_store_gpu_tensor.py | 6 +++--- deepnvme/file_access/gds_load_gpu_tensor.py | 6 +++--- deepnvme/file_access/gds_store_gpu_tensor.py | 6 +++--- deepnvme/file_access/py_load_cpu_tensor.py | 6 +++--- deepnvme/file_access/py_load_gpu_tensor.py | 6 +++--- deepnvme/file_access/py_store_cpu_tensor.py | 6 +++--- deepnvme/file_access/py_store_gpu_tensor.py | 6 +++--- 11 files changed, 31 insertions(+), 31 deletions(-) diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md index aed9b21ae..895748eac 100644 --- a/deepnvme/file_access/README.md +++ b/deepnvme/file_access/README.md @@ -1,6 +1,6 @@ # Using DeepNVMe to implement simple file operations CPU/GPU tensors -This folder contains examples illustrating how to use DeepNVMe to implement simple file operations for moving data between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. +This folder contains examples illustrating how to use DeepNVMe to implement simple file operations that involve moving raw data bytes between persistent storage and CPU/GPU tensors. 
For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations. diff --git a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py index 40ad9fdb5..58c488f83 100644 --- a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py @@ -5,7 +5,7 @@ import deepspeed from deepspeed.ops.op_builder import AsyncIOBuilder -def aio_bounce_buffer_read(inp_f, h, bounce_buffer): +def file_read(inp_f, h, bounce_buffer): read_status = h.sync_pread(bounce_buffer, inp_f) t = bounce_buffer.cpu() @@ -17,10 +17,10 @@ def main(): aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() - t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) + t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) bb_t = t.timeit(cnt) bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 - print(f'bb read into cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + print(f'bbuf load_cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py index 6306fa414..81ed45025 100644 --- a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py @@ -5,7 +5,7 @@ import deepspeed from deepspeed.ops.op_builder import AsyncIOBuilder -def aio_bounce_buffer_read(inp_f, h, bounce_buffer): +def file_read(inp_f, h, bounce_buffer): read_status = h.sync_pread(bounce_buffer, inp_f) t = bounce_buffer.cuda() @@ 
-18,10 +18,10 @@ def main(): aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() - t = timeit.Timer(functools.partial(aio_bounce_buffer_read, input_file, aio_handle, bounce_buffer)) + t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) bb_t = t.timeit(cnt) bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 - print(f'bb read into gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + print(f'bbuf load_gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py index c83832ada..b08e65149 100644 --- a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py @@ -6,7 +6,7 @@ import deepspeed from deepspeed.ops.op_builder import AsyncIOBuilder -def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): +def file_write(out_f, t, h, bounce_buffer): bounce_buffer.copy_(t) h.sync_pwrite(bounce_buffer, out_f) @@ -22,11 +22,11 @@ def main(): bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() - t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, bounce_buffer)) + t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) bb_t = t.timeit(cnt) bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bb write from cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + print(f'bbuf store_cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') pathlib.Path(output_file).unlink(missing_ok=True) if __name__ == "__main__": diff --git a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py index 954e1b6f9..77e28f8e1 100644 --- a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py +++ 
b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py @@ -6,7 +6,7 @@ import deepspeed from deepspeed.ops.op_builder import AsyncIOBuilder -def aio_bounce_buffer_write(out_f, t, h, bounce_buffer): +def file_write(out_f, t, h, bounce_buffer): bounce_buffer.copy_(t) h.sync_pwrite(bounce_buffer, out_f) @@ -21,11 +21,11 @@ def main(): aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() - t = timeit.Timer(functools.partial(aio_bounce_buffer_write, output_file, app_tensor, aio_handle, bounce_buffer)) + t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) bb_t = t.timeit(cnt) bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bb write from gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + print(f'bbuf store_gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py index 46f72e632..6acc20b9b 100644 --- a/deepnvme/file_access/gds_load_gpu_tensor.py +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -5,7 +5,7 @@ import deepspeed from deepspeed.ops.op_builder import GDSBuilder -def gds_read(inp_f, h, gpu_buffer): +def file_read(inp_f, h, gpu_buffer): read_status = h.sync_pread(gpu_buffer, inp_f) t = gpu_buffer.cuda() @@ -18,10 +18,10 @@ def main(): gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - t = timeit.Timer(functools.partial(gds_read, input_file, gds_handle, gds_buffer)) + t = timeit.Timer(functools.partial(file_read, input_file, gds_handle, gds_buffer)) gds_t = t.timeit(cnt) gds_gbs = (cnt*os.path.getsize(input_file))/gds_t/1e9 - print(f'gds read into gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + print(f'gds load_gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') if __name__ == 
"__main__": main() diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py b/deepnvme/file_access/gds_store_gpu_tensor.py index aef755371..aad4bd410 100644 --- a/deepnvme/file_access/gds_store_gpu_tensor.py +++ b/deepnvme/file_access/gds_store_gpu_tensor.py @@ -6,7 +6,7 @@ import deepspeed from deepspeed.ops.op_builder import GDSBuilder -def gds_write(out_f, t, h, gpu_buffer): +def file_write(out_f, t, h, gpu_buffer): gpu_buffer.copy_(t) h.sync_pwrite(gpu_buffer, out_f) @@ -21,11 +21,11 @@ def main(): gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - t = timeit.Timer(functools.partial(gds_write, output_file, app_tensor, gds_handle, gds_buffer)) + t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, gds_handle, gds_buffer)) gds_t = t.timeit(cnt) gds_gbs = (cnt*file_sz)/gds_t/1e9 - print(f'gds write from gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + print(f'gds store_gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/py_load_cpu_tensor.py b/deepnvme/file_access/py_load_cpu_tensor.py index cb5ff0f98..85ebc1da7 100644 --- a/deepnvme/file_access/py_load_cpu_tensor.py +++ b/deepnvme/file_access/py_load_cpu_tensor.py @@ -3,7 +3,7 @@ import timeit, functools from utils import parse_read_arguments -def py_read(inp_f): +def file_read(inp_f): with open(inp_f, 'rb') as f: t = torch.frombuffer(f.read(), dtype=torch.uint8) @@ -11,10 +11,10 @@ def main(): cnt = 3 args = parse_read_arguments() input_file = args.input_file - t = timeit.Timer(functools.partial(py_read, input_file)) + t = timeit.Timer(functools.partial(file_read, input_file)) py_t = t.timeit(cnt) py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 - print(f'py read into cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + print(f'py load_cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') if __name__ == "__main__": main() diff 
--git a/deepnvme/file_access/py_load_gpu_tensor.py b/deepnvme/file_access/py_load_gpu_tensor.py index 77eea7f5a..fd61fce8f 100644 --- a/deepnvme/file_access/py_load_gpu_tensor.py +++ b/deepnvme/file_access/py_load_gpu_tensor.py @@ -3,7 +3,7 @@ import timeit, functools from utils import parse_read_arguments -def py_read(inp_f): +def file_read(inp_f): with open(inp_f, 'rb') as f: t = torch.frombuffer(f.read(), dtype=torch.uint8).cuda() @@ -11,10 +11,10 @@ def main(): cnt = 3 args = parse_read_arguments() input_file = args.input_file - t = timeit.Timer(functools.partial(py_read, input_file)) + t = timeit.Timer(functools.partial(file_read, input_file)) py_t = t.timeit(cnt) py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 - print(f'py read into gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + print(f'py load_gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/py_store_cpu_tensor.py b/deepnvme/file_access/py_store_cpu_tensor.py index a2d88ae35..bfe28f2d2 100644 --- a/deepnvme/file_access/py_store_cpu_tensor.py +++ b/deepnvme/file_access/py_store_cpu_tensor.py @@ -5,7 +5,7 @@ from utils import parse_write_arguments import pathlib -def py_write(out_f, t): +def file_write(out_f, t): with open(out_f, 'wb') as f: f.write(t.numpy(force=True)) @@ -17,11 +17,11 @@ def main(): file_sz = args.mb_size*(1024**2) cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) - t = timeit.Timer(functools.partial(py_write, output_file, cpu_tensor)) + t = timeit.Timer(functools.partial(file_write, output_file, cpu_tensor)) py_t = t.timeit(cnt) py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py write from cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + print(f'py store_cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') pathlib.Path(output_file).unlink(missing_ok=True) if __name__ == "__main__": diff --git a/deepnvme/file_access/py_store_gpu_tensor.py b/deepnvme/file_access/py_store_gpu_tensor.py index 
c31f5b7af..71730a934 100644 --- a/deepnvme/file_access/py_store_gpu_tensor.py +++ b/deepnvme/file_access/py_store_gpu_tensor.py @@ -5,7 +5,7 @@ import pathlib from utils import parse_write_arguments -def py_write(out_f, t): +def file_write(out_f, t): with open(out_f, 'wb') as f: f.write(t.numpy(force=True)) @@ -17,11 +17,11 @@ def main(): file_sz = args.mb_size*(1024**2) gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - t = timeit.Timer(functools.partial(py_write, output_file, gpu_tensor)) + t = timeit.Timer(functools.partial(file_write, output_file, gpu_tensor)) py_t = t.timeit(cnt) py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py write from gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + print(f'py store_gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') pathlib.Path(output_file).unlink(missing_ok=True) From 577626ff4ec37d2db4a857a3d94d657b4d12b04b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 6 Aug 2024 11:59:44 -0400 Subject: [PATCH 07/13] Fix typo --- deepnvme/file_access/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md index 895748eac..7f55513c6 100644 --- a/deepnvme/file_access/README.md +++ b/deepnvme/file_access/README.md @@ -1,4 +1,4 @@ -# Using DeepNVMe to implement simple file operations CPU/GPU tensors +# Using DeepNVMe to implement simple file reads and writes of CPU/GPU tensors This folder contains examples illustrating how to use DeepNVMe to implement simple file operations that involve moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. 
From 5f9398f1b14ec7d28e4a7dc8c21f7e17e8afdde7 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 9 Aug 2024 16:33:16 -0400 Subject: [PATCH 08/13] Add bash scripts; simplify code --- .../bounce_buffer_load_cpu_tensor.py | 27 +++++++++------- .../bounce_buffer_load_gpu_tensor.py | 29 ++++++++++------- .../bounce_buffer_store_cpu_tensor.py | 23 ++++++++----- .../bounce_buffer_store_gpu_tensor.py | 27 ++++++++++------ deepnvme/file_access/gds_load_gpu_tensor.py | 20 +++++++----- deepnvme/file_access/gds_store_gpu_tensor.py | 23 ++++++++----- deepnvme/file_access/py_load_cpu_tensor.py | 12 ++++--- deepnvme/file_access/py_load_gpu_tensor.py | 14 ++++---- deepnvme/file_access/py_store_cpu_tensor.py | 12 +++---- deepnvme/file_access/py_store_gpu_tensor.py | 10 +++--- deepnvme/file_access/run_load_tensor.sh | 26 +++++++++++++++ deepnvme/file_access/run_store_tensor.sh | 26 +++++++++++++++ deepnvme/file_access/utils.py | 32 +++++++++++++------ 13 files changed, 192 insertions(+), 89 deletions(-) create mode 100644 deepnvme/file_access/run_load_tensor.sh create mode 100644 deepnvme/file_access/run_store_tensor.sh diff --git a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py index 58c488f83..008ff6ebb 100644 --- a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py @@ -1,26 +1,31 @@ import torch -import os -import timeit, functools -from utils import parse_read_arguments -import deepspeed +import os, timeit, functools from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_read_arguments def file_read(inp_f, h, bounce_buffer): - read_status = h.sync_pread(bounce_buffer, inp_f) - t = bounce_buffer.cpu() + h.sync_pread(bounce_buffer, inp_f) + return bounce_buffer.cpu() def main(): - cnt = 3 args = parse_read_arguments() - input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + aio_handle = 
AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) - bb_t = t.timeit(cnt) - bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 - print(f'bbuf load_cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/aio_t/1e9 + print(f'bbuf load_cpu: {file_sz/(1024**3)}GB, {aio_gbs:5.2f} GB/sec, {aio_t:5.2f} secs') + + if args.validate: + from py_load_cpu_tensor import file_read as py_file_read + aio_tensor = file_read(input_file, aio_handle, bounce_buffer) + py_tensor = py_file_read(input_file) + print(f'Validation success = {aio_tensor.equal(py_tensor)}') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py index 81ed45025..d49f1b06e 100644 --- a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py @@ -1,27 +1,32 @@ import torch -import os -import timeit, functools -from utils import parse_read_arguments -import deepspeed +import os, timeit, functools from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_read_arguments def file_read(inp_f, h, bounce_buffer): - read_status = h.sync_pread(bounce_buffer, inp_f) - t = bounce_buffer.cuda() + h.sync_pread(bounce_buffer, inp_f) + return bounce_buffer.cuda() def main(): - cnt = 3 args = parse_read_arguments() - input_file = args.input_file - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + file_sz = os.path.getsize(input_file) + cnt = args.loop + + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) 
- bb_t = t.timeit(cnt) - bb_gbs = (cnt*os.path.getsize(input_file))/bb_t/1e9 - print(f'bbuf load_gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/aio_t/1e9 + print(f'bbuf load_gpu: {file_sz/(1024**3)}GB, {aio_gbs:5.2f} GB/sec, {aio_t:5.2f} secs') + + if args.validate: + from py_load_cpu_tensor import file_read as py_file_read + aio_tensor = file_read(input_file, aio_handle, bounce_buffer).cpu() + py_tensor = py_file_read(input_file) + print(f'Validation success = {aio_tensor.equal(py_tensor)}') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py index b08e65149..e4b374980 100644 --- a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py @@ -1,19 +1,16 @@ import torch -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments -import deepspeed +import os, timeit, functools, pathlib from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_write_arguments def file_write(out_f, t, h, bounce_buffer): bounce_buffer.copy_(t) h.sync_pwrite(bounce_buffer, out_f) def main(): - cnt = 3 args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) @@ -26,8 +23,18 @@ def main(): bb_t = t.timeit(cnt) bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bbuf store_cpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + print(f'bbuf store_cpu: {file_sz/(1024**3)}GB, {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + + if args.validate: + import tempfile, filecmp + from py_store_cpu_tensor import file_write as py_file_write 
+ py_ref_file = os.path.join(tempfile.gettempdir(), os.path.basename(output_file)) + py_file_write(py_ref_file, app_tensor) + filecmp.clear_cache() + print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + pathlib.Path(output_file).unlink(missing_ok=True) + if __name__ == "__main__": main() diff --git a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py index 77e28f8e1..036326a5e 100644 --- a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py @@ -1,31 +1,40 @@ import torch -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments -import deepspeed +import os, timeit, functools, pathlib from deepspeed.ops.op_builder import AsyncIOBuilder +from utils import parse_write_arguments def file_write(out_f, t, h, bounce_buffer): bounce_buffer.copy_(t) h.sync_pwrite(bounce_buffer, out_f) def main(): - cnt = 3 args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) bb_t = t.timeit(cnt) bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bbuf store_gpu: {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + print(f'bbuf store_gpu: {file_sz/(1024**3)}GB, {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + + if args.validate: + import tempfile, 
filecmp + from py_store_cpu_tensor import file_write as py_file_write + py_ref_file = os.path.join(tempfile.gettempdir(), os.path.basename(output_file)) + py_file_write(py_ref_file, app_tensor) + filecmp.clear_cache() + print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + + pathlib.Path(output_file).unlink(missing_ok=True) + if __name__ == "__main__": main() diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py index 6acc20b9b..a4db1a804 100644 --- a/deepnvme/file_access/gds_load_gpu_tensor.py +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -1,27 +1,31 @@ import torch -import os -import timeit, functools +import os, timeit, functools from utils import parse_read_arguments -import deepspeed from deepspeed.ops.op_builder import GDSBuilder def file_read(inp_f, h, gpu_buffer): - read_status = h.sync_pread(gpu_buffer, inp_f) - t = gpu_buffer.cuda() + h.sync_pread(gpu_buffer, inp_f) + return gpu_buffer.cuda() def main(): - cnt = 3 args = parse_read_arguments() - input_file = args.input_file file_sz = os.path.getsize(input_file) + cnt = args.loop + gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) t = timeit.Timer(functools.partial(file_read, input_file, gds_handle, gds_buffer)) gds_t = t.timeit(cnt) gds_gbs = (cnt*os.path.getsize(input_file))/gds_t/1e9 - print(f'gds load_gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + print(f'gds load_gpu: {file_sz/(1024**3)}GB, {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + + if args.validate: + from py_load_cpu_tensor import file_read as py_file_read + aio_tensor = file_read(input_file, gds_handle, gds_buffer).cpu() + py_tensor = py_file_read(input_file) + print(f'Validation success = {aio_tensor.equal(py_tensor)}') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py 
b/deepnvme/file_access/gds_store_gpu_tensor.py index aad4bd410..daa85bf40 100644 --- a/deepnvme/file_access/gds_store_gpu_tensor.py +++ b/deepnvme/file_access/gds_store_gpu_tensor.py @@ -1,19 +1,16 @@ import torch -import os -import timeit, functools -import pathlib -from utils import parse_write_arguments -import deepspeed +import os, timeit, functools, pathlib from deepspeed.ops.op_builder import GDSBuilder +from utils import parse_write_arguments def file_write(out_f, t, h, gpu_buffer): gpu_buffer.copy_(t) h.sync_pwrite(gpu_buffer, out_f) def main(): - cnt = 3 args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) @@ -25,7 +22,17 @@ def main(): gds_t = t.timeit(cnt) gds_gbs = (cnt*file_sz)/gds_t/1e9 - print(f'gds store_gpu: {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + print(f'gds store_gpu: {file_sz/(1024**3)}GB, {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + + if args.validate: + import tempfile, filecmp + from py_store_cpu_tensor import file_write as py_file_write + py_ref_file = os.path.join(tempfile.gettempdir(), os.path.basename(output_file)) + py_file_write(py_ref_file, app_tensor) + filecmp.clear_cache() + print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + + pathlib.Path(output_file).unlink(missing_ok=True) if __name__ == "__main__": main() diff --git a/deepnvme/file_access/py_load_cpu_tensor.py b/deepnvme/file_access/py_load_cpu_tensor.py index 85ebc1da7..b5fc91cf3 100644 --- a/deepnvme/file_access/py_load_cpu_tensor.py +++ b/deepnvme/file_access/py_load_cpu_tensor.py @@ -1,20 +1,22 @@ import torch -import os -import timeit, functools +import os, timeit, functools from utils import 
parse_read_arguments def file_read(inp_f): with open(inp_f, 'rb') as f: t = torch.frombuffer(f.read(), dtype=torch.uint8) + return t def main(): - cnt = 3 args = parse_read_arguments() input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + t = timeit.Timer(functools.partial(file_read, input_file)) py_t = t.timeit(cnt) - py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 - print(f'py load_cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + py_gbs = (cnt*file_sz)/py_t/1e9 + print(f'py load_cpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/py_load_gpu_tensor.py b/deepnvme/file_access/py_load_gpu_tensor.py index fd61fce8f..48e00ae3d 100644 --- a/deepnvme/file_access/py_load_gpu_tensor.py +++ b/deepnvme/file_access/py_load_gpu_tensor.py @@ -1,20 +1,22 @@ import torch -import os -import timeit, functools +import os, timeit, functools from utils import parse_read_arguments def file_read(inp_f): with open(inp_f, 'rb') as f: - t = torch.frombuffer(f.read(), dtype=torch.uint8).cuda() + t = torch.frombuffer(f.read(), dtype=torch.uint8) + return t.cuda() def main(): - cnt = 3 args = parse_read_arguments() input_file = args.input_file + file_sz = os.path.getsize(input_file) + cnt = args.loop + t = timeit.Timer(functools.partial(file_read, input_file)) py_t = t.timeit(cnt) - py_gbs = (cnt*os.path.getsize(input_file))/py_t/1e9 - print(f'py load_gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + py_gbs = (cnt*file_sz)/py_t/1e9 + print(f'py load_gpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/py_store_cpu_tensor.py b/deepnvme/file_access/py_store_cpu_tensor.py index bfe28f2d2..09152e19c 100644 --- a/deepnvme/file_access/py_store_cpu_tensor.py +++ b/deepnvme/file_access/py_store_cpu_tensor.py @@ -1,18 +1,16 @@ import torch -import numpy -import os -import timeit, functools -from 
utils import parse_write_arguments +import os, timeit, functools import pathlib +from utils import parse_write_arguments def file_write(out_f, t): with open(out_f, 'wb') as f: f.write(t.numpy(force=True)) def main(): - cnt = 3 args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) cpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) @@ -21,7 +19,7 @@ def main(): py_t = t.timeit(cnt) py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py store_cpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + print(f'py store_cpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') pathlib.Path(output_file).unlink(missing_ok=True) if __name__ == "__main__": diff --git a/deepnvme/file_access/py_store_gpu_tensor.py b/deepnvme/file_access/py_store_gpu_tensor.py index 71730a934..57303af77 100644 --- a/deepnvme/file_access/py_store_gpu_tensor.py +++ b/deepnvme/file_access/py_store_gpu_tensor.py @@ -1,7 +1,5 @@ import torch -import numpy -import os -import timeit, functools +import os, timeit, functools import pathlib from utils import parse_write_arguments @@ -10,9 +8,9 @@ def file_write(out_f, t): f.write(t.numpy(force=True)) def main(): - cnt = 3 args = parse_write_arguments() - output_file = os.path.join(args.output_folder, f'test_ouput_{args.mb_size}MB.pt') + cnt = args.loop + output_file = os.path.join(args.nvme_folder, f'test_ouput_{args.mb_size}MB.pt') pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) gpu_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) @@ -21,7 +19,7 @@ def main(): py_t = t.timeit(cnt) py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py store_gpu: {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + print(f'py store_gpu: 
{file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') pathlib.Path(output_file).unlink(missing_ok=True) diff --git a/deepnvme/file_access/run_load_tensor.sh b/deepnvme/file_access/run_load_tensor.sh new file mode 100644 index 000000000..ac3440782 --- /dev/null +++ b/deepnvme/file_access/run_load_tensor.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 <input file>" + exit 1 +fi + +input_file=$1 +if ! [[ -f "$input_file" ]]; then + echo "Error: $input_file does not exist" + exit 1 +fi + + +echo "Running load tensor examples using $input_file" +for f in bounce_buffer_load_cpu_tensor.py bounce_buffer_load_gpu_tensor.py \ + gds_load_gpu_tensor.py \ + py_load_cpu_tensor.py py_load_gpu_tensor.py; do + cmd="python $f --input_file $input_file" + sync + echo $cmd + eval $cmd + sleep 2 +done + + diff --git a/deepnvme/file_access/run_store_tensor.sh b/deepnvme/file_access/run_store_tensor.sh new file mode 100644 index 000000000..220d69247 --- /dev/null +++ b/deepnvme/file_access/run_store_tensor.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 <output folder>" + exit 1 +fi + +output_folder=$1 +if !
[[ -d "$output_folder" ]]; then + echo "Error: $output_folder does not exist" + exit 1 +fi + + +echo "Running store tensor examples using $output_folder" +for f in bounce_buffer_store_cpu_tensor.py bounce_buffer_store_gpu_tensor.py \ + gds_store_gpu_tensor.py \ + py_store_cpu_tensor.py py_store_gpu_tensor.py; do + cmd="python $f --nvme_folder $output_folder" + sync + echo $cmd + eval $cmd + sleep 2 +done + + diff --git a/deepnvme/file_access/utils.py b/deepnvme/file_access/utils.py index 4520ef695..7bab9d98c 100644 --- a/deepnvme/file_access/utils.py +++ b/deepnvme/file_access/utils.py @@ -1,14 +1,21 @@ import os import argparse - def parse_read_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--input_file', default=None, type=str, required=True, - help='File to read.') + help='File to read, must be on NVMe device.') + parser.add_argument('--loop', + type=int, + default=3, + help='The number of times to repeat the operation (default 3).') + parser.add_argument('--validate', + action="store_true", + help="Run validation step that compares tensor value against Python file read") + args = parser.parse_args() print(f'args = {args}') if not os.path.isfile(args.input_file): @@ -21,21 +28,28 @@ def parse_read_arguments(): def parse_write_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--output_folder', + parser.add_argument('--nvme_folder', default=None, type=str, required=True, - help='Output folder for file write.') + help='NVMe folder for file write.') parser.add_argument('--mb_size', type=int, - default=None, - required=True, - help='Size of tensor to save in MB.') - + default=1024, + help='Size of tensor to save in MB (default 1024).') + parser.add_argument('--loop', + type=int, + default=3, + help='The number of times to repeat the operation (default 3).') + parser.add_argument('--validate', + action="store_true", + help="Run validation step that compares tensor value against Python file read") + args = parser.parse_args() 
print(f'args = {args}') - if not os.path.isdir(args.output_folder): + if not os.path.isdir(args.nvme_folder): print(f'Invalid output folder path: {args.output_folder}') quit() return args + From ec5dfca86a186b5da78ddf409c755225b48ba2ea Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 9 Aug 2024 19:05:55 -0400 Subject: [PATCH 09/13] README doc --- deepnvme/file_access/README.md | 78 ++++++++++++++++++- .../bounce_buffer_load_cpu_tensor.py | 2 +- .../bounce_buffer_load_gpu_tensor.py | 2 +- .../bounce_buffer_store_cpu_tensor.py | 2 +- .../bounce_buffer_store_gpu_tensor.py | 2 +- deepnvme/file_access/utils.py | 4 +- 6 files changed, 83 insertions(+), 7 deletions(-) diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md index 7f55513c6..9d3771a52 100644 --- a/deepnvme/file_access/README.md +++ b/deepnvme/file_access/README.md @@ -1,6 +1,6 @@ # Using DeepNVMe to implement simple file reads and writes of CPU/GPU tensors -This folder contains examples illustrating how to use DeepNVMe to implement simple file operations that involve moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. +The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations. 
@@ -12,3 +12,79 @@ Load GPU tensor from file | py_load_gpu_tensor.py | bounce_buffer_load_gpu_tenso Store CPU tensor to file | py_store_cpu_tensor.py | bounce_buffer_store_cpu_tensor.py | - | Store GPU tensor to file | py_store_gpu_tensor.py | bounce_buffer_store_gpu_tensor.py | gds_store_gpu_tensor.py | +The Python implemenations are the scripts with `py_` prefix. while the DeepNVMe implemenetations are those with`bounce_buffer_` and `gds_`prefixes. + + +## Tensor Load Examples +The tensor load example scripts share a common command-line interface, which is illustrated below using `py_read_load_cpu_tensor.py`. +```bash +$ python py_load_cpu_tensor.py --help +usage: py_load_cpu_tensor.py [-h] --input_file INPUT_FILE [--loop LOOP] [--validate] + +options: + -h, --help show this help message and exit + --input_file INPUT_FILE + File on NVMe device that will read as input. + --loop LOOP The number of times to repeat the operation (default 3). + --validate Run validation step that compares tensor value against Python file read +``` +Before running these example scripts ensure that the input file exists on an NVMe device. The `--validate` option is relevant to only the DeepNVme implementations. This option provides minimal correctness checking by comparing against a tensor loaded using Python. We also provide a bash script `run_load_tensor.sh`, which runs all the example tensor load scripts. + + +## Tensor Store Examples +The tensor store examples share a command-line interface, which is illustrated below uisng `py_store_cpu_tensor.py` +```bash +$ python py_store_cpu_tensor.py --help +usage: py_store_cpu_tensor.py [-h] --nvme_folder NVME_FOLDER [--mb_size MB_SIZE] [--loop LOOP] [--validate] + +options: + -h, --help show this help message and exit + --nvme_folder NVME_FOLDER + NVMe folder for file write. + --mb_size MB_SIZE Size of tensor to save in MB (default 1024). + --loop LOOP The number of times to repeat the operation (default 3). 
+ --validate Run validation step that compares tensor value against Python file read + +``` +Before running these examples ensure that the output folder exists on an NVMe device and that you have write permissions. The `--validate` option is relevant to only the DeepNVMe implementations. This option provides minimal correcness checkping by comparing the output file against that created using Python. We also provide a bash script `run_store_tensor.sh`, which runs all the example tensor store scripts. + + +## Performance Advisory +Although this folder is primarily meant to help with integrating DeepNVMe into your Deep Learning applications, the example scripts also print out performance numbers of read and write throughputs. So, we expect you will observe some performance advantage of DeepNVMe compared to Python. However, do note that it is very likely that better performance can be realized by tuning DeepNVMe for your system. Such tuning efforts will generate more optimal values for configuring the DeepNVMe handles. + +For reference, DeepNVMe configuration using hardcoded constants for bounce buffer implementations is as follows: + +```python + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) +``` + +The corresponding DeepNVMe configuration for GDS implementations is as follows: + +```python + gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) +``` + +Despite the above caveat, it seems that some performance numbers would be useful here. So, below are the results obtained for 1GB data transfers using the unmodified scripts, i,e. with untuned DeepNVMe configurations. The experiments were conducted on a Lambda RTX A6000 workstation with a single [CS3040 NVMe 2TB SDD](https://www.pny.com/CS3040-M2-NVMe-SSD?sku=M280CS3040-2TB-RB) that has peak sequential read and write throughputs of 5600 MB/s and 4300 MB/s respectively. The software stack included Ubuntu 22.04.4 LTS, Pytorch 2.4, and CUDA 12.1. 
+ +The performance results of the tensor load examples are presented in the table below and show ~2.5X speedup for DeepNVMe. + +Tensor load script | GB/sec (1GB file read)| +|---|---| +py_load_cpu_tensor.py | 1.9 | +py_load_gpu_tensor.py | 1.6 | +bounce_buffer_load_cpu_tensor | 4.9 | +bounce_buffer_load_gpu_tensor | 4.1 | + + +The performance results of the tensor store examples are presented in the table below and show 4.6X--5.8X speedup for DeepNVMe. + +Tensor store script | GB/sec (1GB file write)| +|---|---| +py_store_cpu_tensor.py | 0.8 | +py_store_gpu_tensor.py | 0.6 | +bounce_buffer_store_cpu_tensor | 3.7 | +bounce_buffer_store_gpu_tensor | 3.5 | + + +# Conclusion +We hope these example scripts help you to easily and quicly integrate DeepNVMe into your applications. diff --git a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py index 008ff6ebb..cd7c18b01 100644 --- a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py @@ -13,7 +13,7 @@ def main(): file_sz = os.path.getsize(input_file) cnt = args.loop - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) diff --git a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py index d49f1b06e..208ef8ed9 100644 --- a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py @@ -14,7 +14,7 @@ def main(): file_sz = os.path.getsize(input_file) cnt = args.loop - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) 
bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) diff --git a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py index e4b374980..28ac4848d 100644 --- a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py @@ -15,7 +15,7 @@ def main(): file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() diff --git a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py index 036326a5e..2cd441c04 100644 --- a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py +++ b/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py @@ -15,7 +15,7 @@ def main(): file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 2) + aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() diff --git a/deepnvme/file_access/utils.py b/deepnvme/file_access/utils.py index 7bab9d98c..702f87337 100644 --- a/deepnvme/file_access/utils.py +++ b/deepnvme/file_access/utils.py @@ -7,7 +7,7 @@ def parse_read_arguments(): default=None, type=str, required=True, - help='File to read, must be on NVMe device.') + help='File on NVMe device that will read as input.') parser.add_argument('--loop', type=int, default=3, @@ -32,7 +32,7 @@ def 
parse_write_arguments(): default=None, type=str, required=True, - help='NVMe folder for file write.') + help='NVMe folder that will used for file write.') parser.add_argument('--mb_size', type=int, default=1024, From fd6086f7146dd0b8294f329d16d4ded59a18c62a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 11 Aug 2024 20:11:00 +0000 Subject: [PATCH 10/13] Update READMe and scripts; Fix GDS examples --- deepnvme/file_access/README.md | 78 +++++++++++------- ...d_cpu_tensor.py => aio_load_cpu_tensor.py} | 6 +- ...d_gpu_tensor.py => aio_load_gpu_tensor.py} | 6 +- ..._cpu_tensor.py => aio_store_cpu_tensor.py} | 8 +- ..._gpu_tensor.py => aio_store_gpu_tensor.py} | 8 +- deepnvme/file_access/gds_load_gpu_tensor.py | 10 ++- deepnvme/file_access/gds_store_gpu_tensor.py | 9 +- .../file_access/media/deepnvme_ops_report.png | Bin 0 -> 8964 bytes deepnvme/file_access/py_load_cpu_tensor.py | 6 +- deepnvme/file_access/py_load_gpu_tensor.py | 6 +- deepnvme/file_access/py_store_cpu_tensor.py | 6 +- deepnvme/file_access/py_store_gpu_tensor.py | 6 +- deepnvme/file_access/run_load_tensor.sh | 2 +- deepnvme/file_access/run_store_tensor.sh | 2 +- deepnvme/file_access/utils.py | 2 + 15 files changed, 89 insertions(+), 66 deletions(-) rename deepnvme/file_access/{bounce_buffer_load_cpu_tensor.py => aio_load_cpu_tensor.py} (83%) rename deepnvme/file_access/{bounce_buffer_load_gpu_tensor.py => aio_load_gpu_tensor.py} (83%) rename deepnvme/file_access/{bounce_buffer_store_cpu_tensor.py => aio_store_cpu_tensor.py} (86%) rename deepnvme/file_access/{bounce_buffer_store_gpu_tensor.py => aio_store_gpu_tensor.py} (86%) create mode 100644 deepnvme/file_access/media/deepnvme_ops_report.png diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md index 9d3771a52..d9452829f 100644 --- a/deepnvme/file_access/README.md +++ b/deepnvme/file_access/README.md @@ -1,18 +1,30 @@ -# Using DeepNVMe to implement simple file reads and writes of CPU/GPU tensors +# Using DeepNVMe for simple file 
reads and writes involving CPU/GPU tensors -The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer and NVIDIA GPUDirect Storage (GDS) as appropriate. +The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer (aio) and NVIDIA GPUDirect Storage (GDS) as appropriate. The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations. -File Operation | Python | DeepNVMe (CPU bounce buffer) | DeepNVMe (GDS) +File Operation | Python | DeepNVMe (aio) | DeepNVMe (GDS) |---|---|---|---| -Load CPU tensor from file | py_load_cpu_tensor.py | bounce_buffer_load_cpu_tensor.py | - | -Load GPU tensor from file | py_load_gpu_tensor.py | bounce_buffer_load_gpu_tensor.py | gds_load_gpu_tensor.py | -Store CPU tensor to file | py_store_cpu_tensor.py | bounce_buffer_store_cpu_tensor.py | - | -Store GPU tensor to file | py_store_gpu_tensor.py | bounce_buffer_store_gpu_tensor.py | gds_store_gpu_tensor.py | +Load CPU tensor from file | py_load_cpu_tensor.py | aio_load_cpu_tensor.py | - | +Load GPU tensor from file | py_load_gpu_tensor.py | aio_load_gpu_tensor.py | gds_load_gpu_tensor.py | +Store CPU tensor to file | py_store_cpu_tensor.py | aio_store_cpu_tensor.py | - | +Store GPU tensor to file | py_store_gpu_tensor.py | aio_store_gpu_tensor.py | gds_store_gpu_tensor.py | + +The Python implementations are the scripts with `py_` prefix. 
while the DeepNVMe implementations are those with`aio_` and `gds_`prefixes. + +## Requirements +Ensure your environment is properly configured to run these examples. First, you need to install DeepSpeed version >= 0.15.0. Next, ensure that the DeepNVMe operators are available in the DeepSpeed installation. The `async_io` operator is required for any DeepNVMe functionality, while the `gds` operator is required only for GDS functionality. You can confirm availability of each operator by inspecting the output of `ds_report` to check that compatible status is [OKAY]. Below is a snippet of `ds_report` output showing availability of both `async_io` and `gds` operators. + +
+ +
+ +
+ ds_report output showing availability of DeepNVMe operators (async_io and gds) in a DeepSpeed installation. +
-The Python implemenations are the scripts with `py_` prefix. while the DeepNVMe implemenetations are those with`bounce_buffer_` and `gds_`prefixes. ## Tensor Load Examples @@ -28,11 +40,11 @@ options: --loop LOOP The number of times to repeat the operation (default 3). --validate Run validation step that compares tensor value against Python file read ``` -Before running these example scripts ensure that the input file exists on an NVMe device. The `--validate` option is relevant to only the DeepNVme implementations. This option provides minimal correctness checking by comparing against a tensor loaded using Python. We also provide a bash script `run_load_tensor.sh`, which runs all the example tensor load scripts. +Before running these example scripts ensure that the input file exists on an NVMe device. The `--validate` option is relevant only to the DeepNVme implementations. This option provides minimal correctness checking by comparing against a tensor loaded using Python. We also provide a bash script `run_load_tensor.sh`, which runs all the example tensor load scripts. ## Tensor Store Examples -The tensor store examples share a command-line interface, which is illustrated below uisng `py_store_cpu_tensor.py` +The tensor store examples share a command-line interface, which is illustrated below using `py_store_cpu_tensor.py` ```bash $ python py_store_cpu_tensor.py --help usage: py_store_cpu_tensor.py [-h] --nvme_folder NVME_FOLDER [--mb_size MB_SIZE] [--loop LOOP] [--validate] @@ -46,45 +58,51 @@ options: --validate Run validation step that compares tensor value against Python file read ``` -Before running these examples ensure that the output folder exists on an NVMe device and that you have write permissions. The `--validate` option is relevant to only the DeepNVMe implementations. This option provides minimal correcness checkping by comparing the output file against that created using Python. 
We also provide a bash script `run_store_tensor.sh`, which runs all the example tensor store scripts. +Before running these examples ensure that the output folder exists on an NVMe device and that you have write permission. The `--validate` option is relevant only to the DeepNVMe implementations. This option provides minimal correctness checking by comparing the output file against that created using Python. We also provide a bash script `run_store_tensor.sh`, which runs all the example tensor store scripts. ## Performance Advisory -Although this folder is primarily meant to help with integrating DeepNVMe into your Deep Learning applications, the example scripts also print out performance numbers of read and write throughputs. So, we expect you will observe some performance advantage of DeepNVMe compared to Python. However, do note that it is very likely that better performance can be realized by tuning DeepNVMe for your system. Such tuning efforts will generate more optimal values for configuring the DeepNVMe handles. +Although this folder is primarily meant to help with integrating DeepNVMe into your Deep Learning applications, the example scripts also print out performance numbers of read and write throughput. So, we expect you will observe some performance advantage of DeepNVMe compared to Python. However, do note that it is likely that better performance can be realized by tuning DeepNVMe for your environment. Such tuning efforts will ideally generate more optimal values for configuring DeepNVMe. 
-For reference, DeepNVMe configuration using hardcoded constants for bounce buffer implementations is as follows: +For reference, DeepNVMe configuration using hard-coded constants for `aio_` implementations is as follows: ```python aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) ``` -The corresponding DeepNVMe configuration for GDS implementations is as follows: +The corresponding DeepNVMe configuration for `gds_` implementations is as follows: ```python gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) ``` -Despite the above caveat, it seems that some performance numbers would be useful here. So, below are the results obtained for 1GB data transfers using the unmodified scripts, i,e. with untuned DeepNVMe configurations. The experiments were conducted on a Lambda RTX A6000 workstation with a single [CS3040 NVMe 2TB SDD](https://www.pny.com/CS3040-M2-NVMe-SSD?sku=M280CS3040-2TB-RB) that has peak sequential read and write throughputs of 5600 MB/s and 4300 MB/s respectively. The software stack included Ubuntu 22.04.4 LTS, Pytorch 2.4, and CUDA 12.1. +Despite the above caveat, it seems that some performance numbers would be useful here to help set the right expectations. The experiments were conducted on an Azure [NC80adis_H100_v5](https://learn.microsoft.com/en-us/azure/virtual-machines/ncads-h100-v5) series virtual machine (VM). This VM includes two 3.5TB local NVMe devices (labelled Microsoft NVMe Direct Disk v2) that we combined into a single RAID-0 volume. The software environment included Ubuntu 22.04.4 LTS, Linux kernel 6.5.0-26-generic, Pytorch 2.4, and CUDA 12.4. We ran experiments of 1GB data transfers using the unmodified scripts, i.e., without DeepNVMe tuning, and present the throughput results in the tables below. In summary, we observed that DeepNVMe significantly accelerates I/O operations compared to Python. DeepNVMe is 8-16X faster for loading tensor data, and 11X-119X faster for writing tensor data. 
+ +Load 1GB CPU tensor (1GB file read) | GB/sec | Speedup over Python | +|---|---|---| +py_load_cpu_tensor.py | 1.5 | - | +aio_load_cpu_tensor.py | 12.3 | 8X | + +Load 1GB GPU tensor (1GB file read) | GB/sec | Speedup over Python | +|---|---|---| +py_load_gpu_tensor.py | 0.7| - | +aio_load_gpu_tensor.py | 9.9 | 14X | +gds_load_gpu_tensor.py | 11.1 | 16X | -The performance results of the tensor load examples are presented in the table below and show ~2.5X speedup for DeepNVMe. -Tensor load script | GB/sec (1GB file read)| -|---|---| -py_load_cpu_tensor.py | 1.9 | -py_load_gpu_tensor.py | 1.6 | -bounce_buffer_load_cpu_tensor | 4.9 | -bounce_buffer_load_gpu_tensor | 4.1 | +Store 1GB CPU tensor (1GB file write) | GB/sec | Speedup over Python | +|---|---|---| +py_store_cpu_tensor.py | 0.7 | - | +aio_store_cpu_tensor.py | 8.1 | 11X | -The performance results of the tensor store examples are presented in the table below and show 4.6X--5.8X speedup for DeepNVMe. +Store 1GB GPU tensor (1GB file write) | GB/sec | Speedup over Python | +|---|---|---| +py_store_gpu_tensor.py | 0.5 | - | +aio_store_gpu_tensor.py | 8.3 | 18X | +gds_store_gpu_tensor.py | 8.6 | 19X | -Tensor store script | GB/sec (1GB file write)| -|---|---| -py_store_cpu_tensor.py | 0.8 | -py_store_gpu_tensor.py | 0.6 | -bounce_buffer_store_cpu_tensor | 3.7 | -bounce_buffer_store_gpu_tensor | 3.5 | # Conclusion -We hope these example scripts help you to easily and quicly integrate DeepNVMe into your applications. +We hope you find this document and example scripts useful for integrating DeepNVMe into your applications. 
diff --git a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py b/deepnvme/file_access/aio_load_cpu_tensor.py similarity index 83% rename from deepnvme/file_access/bounce_buffer_load_cpu_tensor.py rename to deepnvme/file_access/aio_load_cpu_tensor.py index cd7c18b01..3fdc624ed 100644 --- a/deepnvme/file_access/bounce_buffer_load_cpu_tensor.py +++ b/deepnvme/file_access/aio_load_cpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools from deepspeed.ops.op_builder import AsyncIOBuilder -from utils import parse_read_arguments +from utils import parse_read_arguments, GIGA_UNIT def file_read(inp_f, h, bounce_buffer): h.sync_pread(bounce_buffer, inp_f) @@ -18,8 +18,8 @@ def main(): t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) aio_t = t.timeit(cnt) - aio_gbs = (cnt*file_sz)/aio_t/1e9 - print(f'bbuf load_cpu: {file_sz/(1024**3)}GB, {aio_gbs:5.2f} GB/sec, {aio_t:5.2f} secs') + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio load_cpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') if args.validate: from py_load_cpu_tensor import file_read as py_file_read diff --git a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py b/deepnvme/file_access/aio_load_gpu_tensor.py similarity index 83% rename from deepnvme/file_access/bounce_buffer_load_gpu_tensor.py rename to deepnvme/file_access/aio_load_gpu_tensor.py index 208ef8ed9..adf67c4be 100644 --- a/deepnvme/file_access/bounce_buffer_load_gpu_tensor.py +++ b/deepnvme/file_access/aio_load_gpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools from deepspeed.ops.op_builder import AsyncIOBuilder -from utils import parse_read_arguments +from utils import parse_read_arguments, GIGA_UNIT def file_read(inp_f, h, bounce_buffer): h.sync_pread(bounce_buffer, inp_f) @@ -19,8 +19,8 @@ def main(): t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) aio_t = t.timeit(cnt) - aio_gbs = (cnt*file_sz)/aio_t/1e9 - 
print(f'bbuf load_gpu: {file_sz/(1024**3)}GB, {aio_gbs:5.2f} GB/sec, {aio_t:5.2f} secs') + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio load_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') if args.validate: from py_load_cpu_tensor import file_read as py_file_read diff --git a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py b/deepnvme/file_access/aio_store_cpu_tensor.py similarity index 86% rename from deepnvme/file_access/bounce_buffer_store_cpu_tensor.py rename to deepnvme/file_access/aio_store_cpu_tensor.py index 28ac4848d..90d21e7c4 100644 --- a/deepnvme/file_access/bounce_buffer_store_cpu_tensor.py +++ b/deepnvme/file_access/aio_store_cpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools, pathlib from deepspeed.ops.op_builder import AsyncIOBuilder -from utils import parse_write_arguments +from utils import parse_write_arguments, GIGA_UNIT def file_write(out_f, t, h, bounce_buffer): bounce_buffer.copy_(t) @@ -21,9 +21,9 @@ def main(): t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) - bb_t = t.timeit(cnt) - bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bbuf store_cpu: {file_sz/(1024**3)}GB, {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio store_cpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') if args.validate: import tempfile, filecmp diff --git a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py b/deepnvme/file_access/aio_store_gpu_tensor.py similarity index 86% rename from deepnvme/file_access/bounce_buffer_store_gpu_tensor.py rename to deepnvme/file_access/aio_store_gpu_tensor.py index 2cd441c04..d67ec4ff4 100644 --- a/deepnvme/file_access/bounce_buffer_store_gpu_tensor.py +++ b/deepnvme/file_access/aio_store_gpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools, pathlib from deepspeed.ops.op_builder import AsyncIOBuilder -from utils import 
parse_write_arguments +from utils import parse_write_arguments, GIGA_UNIT def file_write(out_f, t, h, bounce_buffer): bounce_buffer.copy_(t) @@ -21,9 +21,9 @@ def main(): t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) - bb_t = t.timeit(cnt) - bb_gbs = (cnt*file_sz)/bb_t/1e9 - print(f'bbuf store_gpu: {file_sz/(1024**3)}GB, {bb_gbs:5.2f} GB/sec, {bb_t:5.2f} secs') + aio_t = t.timeit(cnt) + aio_gbs = (cnt*file_sz)/GIGA_UNIT/aio_t + print(f'aio store_gpu: {file_sz/GIGA_UNIT} GB, {aio_t/cnt} secs, {aio_gbs:5.2f} GB/sec') if args.validate: import tempfile, filecmp diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py index a4db1a804..1fd440b75 100644 --- a/deepnvme/file_access/gds_load_gpu_tensor.py +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -1,6 +1,6 @@ import torch import os, timeit, functools -from utils import parse_read_arguments +from utils import parse_read_arguments, GIGA_UNIT from deepspeed.ops.op_builder import GDSBuilder def file_read(inp_f, h, gpu_buffer): @@ -14,12 +14,12 @@ def main(): cnt = args.loop gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) - gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + gds_buffer = gds_handle.new_pinned_device_tensor(file_sz, torch.empty(0, dtype=torch.uint8, device='cuda', requires_grad=False)) t = timeit.Timer(functools.partial(file_read, input_file, gds_handle, gds_buffer)) gds_t = t.timeit(cnt) - gds_gbs = (cnt*os.path.getsize(input_file))/gds_t/1e9 - print(f'gds load_gpu: {file_sz/(1024**3)}GB, {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + gds_gbs = (cnt*file_sz)/GIGA_UNIT/gds_t + print(f'gds load_gpu: {file_sz/GIGA_UNIT} GB, {gds_t/cnt} secs, {gds_gbs:5.2f} GB/sec') if args.validate: from py_load_cpu_tensor import file_read as py_file_read @@ -27,5 +27,7 @@ def main(): py_tensor = py_file_read(input_file) print(f'Validation success = 
{aio_tensor.equal(py_tensor)}') + gds_handle.free_pinned_device_tensor(gds_buffer) + if __name__ == "__main__": main() diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py b/deepnvme/file_access/gds_store_gpu_tensor.py index daa85bf40..e369e2237 100644 --- a/deepnvme/file_access/gds_store_gpu_tensor.py +++ b/deepnvme/file_access/gds_store_gpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools, pathlib from deepspeed.ops.op_builder import GDSBuilder -from utils import parse_write_arguments +from utils import parse_write_arguments, GIGA_UNIT def file_write(out_f, t, h, gpu_buffer): gpu_buffer.copy_(t) @@ -16,13 +16,13 @@ def main(): app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) - gds_buffer = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) + gds_buffer = gds_handle.new_pinned_device_tensor(file_sz, torch.empty(0, dtype=torch.uint8, device='cuda', requires_grad=False)) t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, gds_handle, gds_buffer)) gds_t = t.timeit(cnt) - gds_gbs = (cnt*file_sz)/gds_t/1e9 - print(f'gds store_gpu: {file_sz/(1024**3)}GB, {gds_gbs:5.2f} GB/sec, {gds_t:5.2f} secs') + gds_gbs = (cnt*file_sz)/GIGA_UNIT/gds_t + print(f'gds store_gpu: {file_sz/GIGA_UNIT} GB, {gds_t/cnt} secs, {gds_gbs:5.2f} GB/sec') if args.validate: import tempfile, filecmp @@ -32,6 +32,7 @@ def main(): filecmp.clear_cache() print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + gds_handle.free_pinned_device_tensor(gds_buffer) pathlib.Path(output_file).unlink(missing_ok=True) if __name__ == "__main__": diff --git a/deepnvme/file_access/media/deepnvme_ops_report.png b/deepnvme/file_access/media/deepnvme_ops_report.png new file mode 100644 index 0000000000000000000000000000000000000000..c05e9b863b77c6810be396bc382ca285d4e719b3 GIT binary patch literal 8964 
zcmaKS1yoesyFLg=DIqYZNS6pRgMKnA)SJRbVxTt*WiGplr)UAAdPfN z$p4_<_y66y?z)$?4$KVa?04^XzwtcJp758-GIt3c5@29p+?A7+RKvi)gaNOc@NNUY zEl*B&fIpbdYBJ&&h`~o2z#D9HF(ok!jH>s9mqs|idwd63ZD$M&5cAE2DW}G?hk?P6 zA}1;K%3XgugD{3-BBOig#r@C+k}n59IgP!^BoReaMr5=19ZXH=6TWv-C*S98BcTwx zg}+ns)KH6F%(#u9O9CBs%Y#&IKwN{3EU*$r_h4@C2@la5Mscg$7`lNwoSZdlf^*j? znKKO+t_xnx8(wP{uU%X|IwOaK8Vc8%uZ4XE(ae-(J2Q@+N}&RW@}UetByn%$w|VJ< zV#2b9?}^KHzO=xnXEJ_V8$@#NDI(P9bl$51xfnt4PNi1SgV+GlYmM;L(#xWHSSwXaA4U$YJ7m;r9u^Z1Ssv8Ko>LnKp$>xZE5D z-uqHDFHXLhxFUiLwC{IEQ(JG+waL0EeZO3MQ4m+r8Y`O3EG4I;6!q|hdu2u+b;Vm* zCX~&w@Gpa;h7(%%CgsL8u-+Co^KR> zgGlPdyIe$f4I2miIwPLgsf6P^z&*>am8{)3l-1OZ|FIVV{gM(2f zy4HD!$Rg>`8>(HMcX;9D6RLPU_H9AI;M1T>L2K*+FD6M-&Yj}|c9qbDg~!yE!LEbp z{8qd3r~=HWs3?~0ytmiCt!5h?SNn}WD$+81fBy45^p&Qj`5KHB_m|s(Ps8S1bAvWv zlCD#DDvxEH_E4sf4lf^H+E9m_bTi5Hk8fa|`XcR`H&ZgPnSCGF)CZ3Dx~c5H@z3oe zZ`-4eHR{ZVXw*Y+(0nmEvUKR?JpYBdw7b4&UWC_k39ue=ytbmcdUW|Cco@-oxdOFn z55l+F94{`Zt9z~II%QTcpd#_@Cuf$w+Vak!dtm?k$=JwlT-yhK@8 zQX*fj(=eE)Sq?KB9c&@qX!urZ8E<)6q?)TJKz6`BHa>2;N&b@0SFZa#tv!S$-GX(u z<-%_5yeF1%HIj%{cy`?X#NVtxF~$3r850w=KKi>q7VdZ$DWf8Guq-87jf6|D*=WBR zp&H`4bWPOb#!n?%Sz|H6aQL>#enFTqXr|tlFpqxY&pdlin?TDia<;{eIH0#@RY=6aL5)+-v zlDiMMByH!A6L~e54G%+!X*K$jpu3mBpDUya_NceTn(OTs?CkVN+D*1?3Te-d#TJy1 z2K0%{tsD#AxgEBDJcA#ueno_V(9;Q;`j&+p0s7575<-mu{i2b*>>0^3G%#3>0 zn!1|;=&}h#TjI+CEy8Fwj+lFzlu!8x2&SX$a4Q{uNY{YcFQY!m&d5ZzL>P61Jjn0f zCLdKEh6V~Fa0o%kJ`Tn;I1Q8*JOUp`rGCz%W^@(Tx%;uFgAJZAJf<>++rq|Bgo{u0_FOwvq;-U+!&HDC)l(MEZ? 
znA5WCJvl@IGY;Wl5WKuzyym>~OBmgVP?1@}rwe z#Aza4@R>h0g^Od0s9!6Q;N}W-BL(VR<(A+7TDgQmggu+ebHu;y&guB%a<#Z2Q~}#U z0dFVT&!;`sioI%kl*f$)kJFvU^Yiy*>3|}rHtwBQrdYl^CNqZ!p6HFVwcUKbz3uu0 z_B=X~oZ7kdcxW@n|7zeQTjc8X*Q>#y0oVHF<`&}|G*ehu-JGLVas9-k>#TcSu7O-Q zOzU;NReyi^ov%YG8EQk;u~%O=#$2PRB(vOQoM@@{jrH=kiYFtTsR)q_F~3oupXP9s zKDm@6dqo%v_PZv#aV6yAiy%r&^b2)#y}O3?1|v9nL0 zj}Bg6Js@=PtW@EXRK>IXw>&dQ7W-Tfl?WDyj6Cr1{gtRv+JQ@}K2&xQB8Qb1OZ-Ne zw)Es`$0$COcl6e#sNRr`MXhsK{K=Jk25eR+b(>eLcwS^=Wd3mSq2){8z=XBD(|8TZ zCr8M!=|bbgjO^m{ili0S_U`4WUCB=K+`EMa^`lve(7SANIQ@09^QB#md7uSHlxJ}h zjIQB$yL5HqxX|UHDSO238C-%qc=#C)PjXRWjN&`wYVPKen7~g*&ZQFK;%4)A>RXxf z2*`b&t}@!Q)iotye3gwA7)NWK)GmKM4AC%&vTHa{XM0dY>2ru#1?m$i9uus6DR3fE zELDGGtWbox`D7w?tJpMIA(>?jH$4vn%|SP1pHasQw7SBRg=dA#2a9Y4 zZvC#)AKtD!S(ZC4(x(x26c;MW{o!@2cJcdS^*(fz06Z@q-k`da(E2U$v@Q^6I3;z~ z=v1dET*Z&VbIXElrS2<AKIgKH+LiuFQnY@c};SD#hf>xYC#RrE*`tTBQO@%c&)|JlhJU)-|Qt~h{ z$(18J=x~9S4*8rs5j1nObIJGI-Fqr%Hj)!e`HSfV^Xo}0Hdbn|MvEUJs81>}CeyQh zaQwp-UgMRLf);&_S+vOOnfJ%P1xlp&u%aad@T_a;d{IE!bZtO zE`mV=xiHS$`=A&Ni!T6-Vd&id@ujq{Po6QR{h#(FKr1S@{A;Iqw`6;!p_I!0(?}lB z7Z9#Hvz7-ql1W*`#d5DVDyhThps?DWpUpSD>#}|>JT-3+o1)O$;D$NGg6LXo*Fze+BJ5&``tN~}?sP2Pm1Tov-?s@fq{LGb0CSn21`)=Q`3)wFX( z9MjK!m?s)Lk0y*n==E(bSFATrD&Q`(kF15WZ|gcBGOCM8Hh+@@V}FsXIC;t5_TXy( zi28koXoFZOXfSPCQK`5k7L48}p$R!g_D1NAWYEY6hq?abl6#9wB=Nv~w3qRH=k1E3z5iOeU3Mqa=+j|Ki*DE{OZSNb`D)*j2u`8$GW^3~`s9P`Pgz;${@Z zbKuM@x5hD3U72NHx9tdyIe7^wi&Z}T-uL2hP409Yar5bhg<8o^dBIq7&QwF>By#IL zHFu;s{7{LByJN5atdG3{F5NkGYvjn&44c79d~|;S`oa)-YtL_MyMee=rzMUHgPTdj z`vAi0zK2sFh)LWlEVL@t9As$2j>O;FE4|86D{Qf*>{RT!60jOqt??j18@nhlg?}Od zBPpe$9_9<5RJ~7$#U8{p3hKn6fy;el@6@32GgD~UVrS_)vh@j{L(MssSto1EGUZe2 zFDqG)`YeHbu76);Fyi3>y`DdZIf!HlU>6ZYNVl%gBKnH)j=oL{h(2fxy*lFDD_e^B4yAiyACfLy2ECWTERFO2IE>#@Xm;H~{H z#1bVp1EQkIOz&_`%0xff0I|ilhwSL}ez$3I%K*P1kABvg$>4upC@j?n_696`r zR?hLOyu9G2$3R3?2FIJBv%Qk2l7_|Kkv!;2vpLbH);<|Pi)gPr`LE6M{ ztJAhY(yur+>x`K0JeU+`wwjCI%R`J3l4z!$NC+%W69l-ZNselyU43$+Hn@)LXklq_ zGysQCu_>(PV@(nze9>6@nOT$QDZ;_b2$)*hFn0%?w>uuoRe_kO$P>LrSzW-V!ho;d 
zX$2+5+?fB5lkcbYk^E5$Bx1|rpoNLYr z-O;nt6(mV(0%zFX3 zga7z8vhC&{A~{9sJ(K+5W*hxZ{yTb$eeL3>5I#%BX>qq*d7(jkB+f9!tx94Dgnp(T zn_kXNvFqi3@%jeoj$T;xJ{HL{0_I@+f1LnRf6&w=Ln&=*TketYbs&XXiH6PYpHP$@ zAplR~59e0_N)3iQ?l9!j)YVUiSYXr&Lzef57v?-^vczAf=55JMbf&cp! zh%Y|Vx7)~d-96}-A{yVo-i9leh{9s7s-$(MwnZ`iaqK98?d*=8@8_R5A2AP8U#$+L z3NrDZ;E z1UC{c{HDxo%c-;atna^X&-1zFpu1Gh?&sQiPi{_>3SI=Mih%G0uxVxvUfRrm^04|= zX~bgR8Ai&IK^(W7-dTP0Uo(JaabC(31WX->Q=D82B{SKfV8Hi4gd6_V#Vnpk{8hN5 zfZDSZxF&Ev^M^(5E#hv|bGywCF!2K*wElct1+*UIpVrgSKK{*^%F&OOcx`}()z^&a z`X;N~5!_^@!|kgMXoTKe!Qs7otT$FOw;(s^;GFN=B`z5;=onxd2q6`+QK0A0WCMrmZL{zSO#;v_w4oH<>*@AskNZ z+b)knV#J0qrg6ShQVY5kJvHf0{-gkD>QtoY_B{DHRnpWng;%oR*7K^6mPCHs?R3s9 zEyY<2>nRksh-xP8gq>`i3j`F|Ka0{87xX@J_`y$?gS46AVwAndwlsNNQd)LH1CxCT zM~V z9sdE`vk^@AgGit-bm0!UP-gnn-gAk->v?36SJBvh0O>fdL+f+TOd&0Cgdl0YdKJ5L zLB(VYf+pQ}$a>>$TH=12v+d$Nm}H7W+jNUh^V8qb%i(nc*c}fW)*J73pBIW;755HJ zDzVS}xa>KiuEu2H`&Ywx_kq+$egXQ~T;IdY6ZcMQwavb>?vs;hT_AKDt98Q;by-tt zNx>lo{BzAfdt2~(6X-i0%P5oT`4E!ka)`vxLya=>gPrWkSR+K|^A%cQ^ zd}pp%X`yl~!{+MzxZtXYGP35Pqk=;xZh4Q~?VpYc>GVLv-!Tv2_rn`AE8DxN%X_-G zf&aiGz!#i<<=f0$q8s#tD_xtn-qV2f>IQDPBLWwxMwS*y-movS@z+P*RebgR19~Yi zS`F|9^&OS$U;eb1Vt%}36m4yYTDtLo`SoH}I!p(c+ zFlZrIGMt2ULk;iAtFc0r-50lKw6G-8tM3j=Eh?JaemMFZG_%J+5(B-+)kR{ayW}|! 
zv@_xKDs112^|0#Lq~!W?Z62obePG*un%docExArwq{*-&(6> z4m+bGzKD9r+n0y(O9M$Nc$40+tw}Mz&k~oYM>Ttl7zZ2PI90lz+^YoK_YBP!;IE<$ zne^8mQ_#LZ>$$s9Ghv4PoN^ctAHUNtwJ+(wK-z-my*ayQyfX#?{k;pMjbDC-nWccC zcjsQ0$Npw%#X~I6hOZx4q8}ICRSKmZJXSk*hb~&%*YrnnFvL$R@OINXc#Z=$S+&dS z&E9`15YUMsk!3+3;@)9aq+#1FvMIBOrI*n=wlHXU>^=be=j*t~`Y?Sw zv1ZxLkna3%Di~?n(L-$3E(0 zDLSOUR-thsjkEu|fVQ5r;Shgug`A*&;^42ireucq=Pckq;p3?k5T-Rl9hzXJV{7|G z>89oH7Gb^k-Y$y)r|ePxRV3DMOBhOKL44A2?}V+Dwn#;qHKzS$RMiuFFaxhE1LlWJ z`#LsDAe;b{X?ik9UDpI@XCEpy?WO^?K6A9cPRmy=GBeu$ ze-Uorrkmfc^FULhk>KpxMWwG~5QUT48+kw2YBy$^)$F%cPV_$iL;#YD>-=c$jM!%z z1bk3a6#2N)uw7L6a}S;+R(;no;R2?ur10dr>*soB={pQ&{Lc^&vU|ig6R1{Un;4m< zSTvq<9PU$DNpCKz=7wrplIr8JPcjTrza0%gNfBH~% zlDv{AldEnddd62;-;HrD(<|s2&(iZ1l2WgX9>o)zo@oi7p#zuyRrzl&I9}3%r)AlS z6<)?IOUM+jvlUm<$ntMkR2|^JnO{MwgO0|@Tnh?xWj=4)!`cZ*!OoY()4pY=XjL%s zfy1NG-~a`Sz(zL@%utA;M}lUjEcwQoH}jyrnuw$j?e|bjk%#&SgXD8|>-#;jI{vko zb1Y)By|Rmqj%-u?yAe5oC zb!ka?Iqz?5WhMa-eaELw&iPs~|DM(UV?x!6FfKMCAQHhWo^_!x{6Py)?QGFFKAfu7 zYVp$)AjaqySw44EoCL}P zlAaz3H^u2zk*dtsRWaSj32-dabDSh{Y9R35bNT2P7`xfKJ=%I$K*1+x8A|zoGNQdG z3o|PYX`D`83W|rP^AS0-S?McGI9Qb*_3_q zX=BITzwLnDZ%1z=a7q`xyBS?L3_^NQBFXZoi|1&3ZDRra==E*;yi5v^Ce*^j*@olSs(DbLh zQ;RbO$D9tTJq(qc9tKX3`egS3@&dp|#3R`VuOwQT_cV14vMK;#8@U1N*+KOHWxN!< z7*0Gt_l%cbj)KR836cE|a%%ljE*#lftv2VO{6eJY?^AxY9aCry;;!LD;JhT|1>A-1 z1}-+f`uQ(fltX%;On5)~OPvZ{0UP zcxubkIs)gnLIgL=_Kh<$u<?xNgv>dBWjpX`pIQHf-rV} zpilmIdmc5x`1-Y#l1tsWx1VF9WiaT6=2H)h$m9XM!Rl9qV3AXZc-#$*U}f9UBWakS z9{G8yoc=O&dM3(`XkzIX8#pt$3}e5uGdQ%?0WsR%v@z>Ek=f!i@eFlYtwMXuh%(gr zc3Rt*Q*jDGU4zdXZ&dzRWVD9>lnj!sASN878*||E*=B^>?C5D-GbVuk*FU-(MbaBWGlk22eF2Xh02+Q+bi{Re-D{uR zd?_V6I}I1lU-io7?LcL+gQcX`b2Yh<#$f{#3@qok>!}}^OZiox-A5^Mo%l#tq1IhE zi`P08^feq%%eUs=8t7#wSw~ZI$3NT|{&l)>fa59hPU|yP-Sw#IPX(Bt8;*D7>c3NT z_${|KPZ4C35v(=I{cmxJBR@qfSkhlnf@8v@_vr@Y>Y{BbMoD*QUB;2Rk!vbP!mX1z z;U@DV7X>>X+5dRXDGu=45Q?2l&WYVv`{4hN#B_?d3lk-cB0`UBP@lWLO#n?aP~YZS zD5;euX$;~VGEILF5{q3aHPFqP7*lh@d~%0pn=z!R2S%16RIzR$F;^|HK5Mq;c>YRtn_63ic19r>_)5^Ss`JbdKg$(`v=pbMXz2P;ak& 
z>wq)e8YMjXE^HPmj`5N5_zLSPpJ$(A$~lK$g^-NqwgKT%S~8CDWw>HD{du-VXtGoy zKJXQTKe|tfZp{({+ErHrR??E&b@Ic^%r%;8%!PI`AdT3Yw!XJBu(F3o4=!ey_E+2> z5Af0?Nu8dV(0os;*$Rh0Fg&+wB_Vrs`FP`cxyf0j!jg}LsJKkKMjCZIsUz(Wc*_V; zTB=Sge3s?01*UhbIYtu~p$*>3%sur<$jBW}YITo-U811HFD_=|&4w)m^q1U0^V})+>-j`?4pFbbH+`Pt|64hv{gi*Ty Q-(10vlTwyMh#Lm{KU9c+tpET3 literal 0 HcmV?d00001 diff --git a/deepnvme/file_access/py_load_cpu_tensor.py b/deepnvme/file_access/py_load_cpu_tensor.py index b5fc91cf3..9270c31b4 100644 --- a/deepnvme/file_access/py_load_cpu_tensor.py +++ b/deepnvme/file_access/py_load_cpu_tensor.py @@ -1,6 +1,6 @@ import torch import os, timeit, functools -from utils import parse_read_arguments +from utils import parse_read_arguments, GIGA_UNIT def file_read(inp_f): with open(inp_f, 'rb') as f: @@ -15,8 +15,8 @@ def main(): t = timeit.Timer(functools.partial(file_read, input_file)) py_t = t.timeit(cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py load_cpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py load_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') if __name__ == "__main__": main() diff --git a/deepnvme/file_access/py_load_gpu_tensor.py b/deepnvme/file_access/py_load_gpu_tensor.py index 48e00ae3d..57eca16f6 100644 --- a/deepnvme/file_access/py_load_gpu_tensor.py +++ b/deepnvme/file_access/py_load_gpu_tensor.py @@ -1,6 +1,6 @@ import torch import os, timeit, functools -from utils import parse_read_arguments +from utils import parse_read_arguments, GIGA_UNIT def file_read(inp_f): with open(inp_f, 'rb') as f: @@ -15,8 +15,8 @@ def main(): t = timeit.Timer(functools.partial(file_read, input_file)) py_t = t.timeit(cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py load_gpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py load_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') if __name__ == "__main__": main() diff 
--git a/deepnvme/file_access/py_store_cpu_tensor.py b/deepnvme/file_access/py_store_cpu_tensor.py index 09152e19c..f6a0d6c88 100644 --- a/deepnvme/file_access/py_store_cpu_tensor.py +++ b/deepnvme/file_access/py_store_cpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools import pathlib -from utils import parse_write_arguments +from utils import parse_write_arguments, GIGA_UNIT def file_write(out_f, t): with open(out_f, 'wb') as f: @@ -18,8 +18,8 @@ def main(): t = timeit.Timer(functools.partial(file_write, output_file, cpu_tensor)) py_t = t.timeit(cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py store_cpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py store_cpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') pathlib.Path(output_file).unlink(missing_ok=True) if __name__ == "__main__": diff --git a/deepnvme/file_access/py_store_gpu_tensor.py b/deepnvme/file_access/py_store_gpu_tensor.py index 57303af77..b0f6fa387 100644 --- a/deepnvme/file_access/py_store_gpu_tensor.py +++ b/deepnvme/file_access/py_store_gpu_tensor.py @@ -1,7 +1,7 @@ import torch import os, timeit, functools import pathlib -from utils import parse_write_arguments +from utils import parse_write_arguments, GIGA_UNIT def file_write(out_f, t): with open(out_f, 'wb') as f: @@ -18,8 +18,8 @@ def main(): t = timeit.Timer(functools.partial(file_write, output_file, gpu_tensor)) py_t = t.timeit(cnt) - py_gbs = (cnt*file_sz)/py_t/1e9 - print(f'py store_gpu: {file_sz/(1024**3)}GB, {py_gbs:5.2f} GB/sec, {py_t:5.2f} secs') + py_gbs = (cnt*file_sz)/GIGA_UNIT/py_t + print(f'py store_gpu: {file_sz/GIGA_UNIT} GB, {py_t/cnt} secs, {py_gbs:5.2f} GB/sec') pathlib.Path(output_file).unlink(missing_ok=True) diff --git a/deepnvme/file_access/run_load_tensor.sh b/deepnvme/file_access/run_load_tensor.sh index ac3440782..e410c98b9 100644 --- a/deepnvme/file_access/run_load_tensor.sh +++ 
b/deepnvme/file_access/run_load_tensor.sh @@ -13,7 +13,7 @@ fi echo "Running load tensor examples using $input_file" -for f in bounce_buffer_load_cpu_tensor.py bounce_buffer_load_gpu_tensor.py \ +for f in aio_load_cpu_tensor.py aio_load_gpu_tensor.py \ gds_load_gpu_tensor.py \ py_load_cpu_tensor.py py_load_gpu_tensor.py; do cmd="python $f --input_file $input_file" diff --git a/deepnvme/file_access/run_store_tensor.sh b/deepnvme/file_access/run_store_tensor.sh index 220d69247..a10b3c219 100644 --- a/deepnvme/file_access/run_store_tensor.sh +++ b/deepnvme/file_access/run_store_tensor.sh @@ -13,7 +13,7 @@ fi echo "Running store tensor examples using $output_folder" -for f in bounce_buffer_store_cpu_tensor.py bounce_buffer_store_gpu_tensor.py \ +for f in aio_store_cpu_tensor.py aio_store_gpu_tensor.py \ gds_store_gpu_tensor.py \ py_store_cpu_tensor.py py_store_gpu_tensor.py; do cmd="python $f --nvme_folder $output_folder" diff --git a/deepnvme/file_access/utils.py b/deepnvme/file_access/utils.py index 702f87337..e83168349 100644 --- a/deepnvme/file_access/utils.py +++ b/deepnvme/file_access/utils.py @@ -1,6 +1,8 @@ import os import argparse +GIGA_UNIT = 1024**3 + def parse_read_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--input_file', From 31b62e87d6f73d4fbf89c02ad22fb22add58dc7b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 12 Aug 2024 22:00:27 -0400 Subject: [PATCH 11/13] Address renaming feedback --- deepnvme/file_access/aio_load_cpu_tensor.py | 4 ++-- deepnvme/file_access/aio_load_gpu_tensor.py | 4 ++-- deepnvme/file_access/aio_store_cpu_tensor.py | 6 +++--- deepnvme/file_access/aio_store_gpu_tensor.py | 6 +++--- deepnvme/file_access/gds_load_gpu_tensor.py | 4 ++-- deepnvme/file_access/gds_store_gpu_tensor.py | 6 +++--- deepnvme/file_access/py_load_cpu_tensor.py | 4 ++-- deepnvme/file_access/py_load_gpu_tensor.py | 4 ++-- deepnvme/file_access/py_store_cpu_tensor.py | 4 ++-- deepnvme/file_access/py_store_gpu_tensor.py | 4 ++-- 
10 files changed, 23 insertions(+), 23 deletions(-) diff --git a/deepnvme/file_access/aio_load_cpu_tensor.py b/deepnvme/file_access/aio_load_cpu_tensor.py index 3fdc624ed..c75d1ba7c 100644 --- a/deepnvme/file_access/aio_load_cpu_tensor.py +++ b/deepnvme/file_access/aio_load_cpu_tensor.py @@ -3,8 +3,8 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_read_arguments, GIGA_UNIT -def file_read(inp_f, h, bounce_buffer): - h.sync_pread(bounce_buffer, inp_f) +def file_read(inp_f, handle, bounce_buffer): + handle.sync_pread(bounce_buffer, inp_f) return bounce_buffer.cpu() def main(): diff --git a/deepnvme/file_access/aio_load_gpu_tensor.py b/deepnvme/file_access/aio_load_gpu_tensor.py index adf67c4be..5720676a6 100644 --- a/deepnvme/file_access/aio_load_gpu_tensor.py +++ b/deepnvme/file_access/aio_load_gpu_tensor.py @@ -3,8 +3,8 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_read_arguments, GIGA_UNIT -def file_read(inp_f, h, bounce_buffer): - h.sync_pread(bounce_buffer, inp_f) +def file_read(inp_f, handle, bounce_buffer): + handle.sync_pread(bounce_buffer, inp_f) return bounce_buffer.cuda() diff --git a/deepnvme/file_access/aio_store_cpu_tensor.py b/deepnvme/file_access/aio_store_cpu_tensor.py index 90d21e7c4..6f257e5a2 100644 --- a/deepnvme/file_access/aio_store_cpu_tensor.py +++ b/deepnvme/file_access/aio_store_cpu_tensor.py @@ -3,9 +3,9 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_write_arguments, GIGA_UNIT -def file_write(out_f, t, h, bounce_buffer): - bounce_buffer.copy_(t) - h.sync_pwrite(bounce_buffer, out_f) +def file_write(out_f, tensor, handle, bounce_buffer): + bounce_buffer.copy_(tensor) + handle.sync_pwrite(bounce_buffer, out_f) def main(): args = parse_write_arguments() diff --git a/deepnvme/file_access/aio_store_gpu_tensor.py b/deepnvme/file_access/aio_store_gpu_tensor.py index d67ec4ff4..d00ee17c3 100644 --- a/deepnvme/file_access/aio_store_gpu_tensor.py +++ 
b/deepnvme/file_access/aio_store_gpu_tensor.py @@ -3,9 +3,9 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_write_arguments, GIGA_UNIT -def file_write(out_f, t, h, bounce_buffer): - bounce_buffer.copy_(t) - h.sync_pwrite(bounce_buffer, out_f) +def file_write(out_f, tensor, handle, bounce_buffer): + bounce_buffer.copy_(tensor) + handle.sync_pwrite(bounce_buffer, out_f) def main(): args = parse_write_arguments() diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py index 1fd440b75..a9220517f 100644 --- a/deepnvme/file_access/gds_load_gpu_tensor.py +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -3,8 +3,8 @@ from utils import parse_read_arguments, GIGA_UNIT from deepspeed.ops.op_builder import GDSBuilder -def file_read(inp_f, h, gpu_buffer): - h.sync_pread(gpu_buffer, inp_f) +def file_read(inp_f, handle, gpu_buffer): + handle.sync_pread(gpu_buffer, inp_f) return gpu_buffer.cuda() def main(): diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py b/deepnvme/file_access/gds_store_gpu_tensor.py index e369e2237..0b1b9a542 100644 --- a/deepnvme/file_access/gds_store_gpu_tensor.py +++ b/deepnvme/file_access/gds_store_gpu_tensor.py @@ -3,9 +3,9 @@ from deepspeed.ops.op_builder import GDSBuilder from utils import parse_write_arguments, GIGA_UNIT -def file_write(out_f, t, h, gpu_buffer): - gpu_buffer.copy_(t) - h.sync_pwrite(gpu_buffer, out_f) +def file_write(out_f, tensor, handle, gpu_buffer): + gpu_buffer.copy_(tensor) + handle.sync_pwrite(gpu_buffer, out_f) def main(): args = parse_write_arguments() diff --git a/deepnvme/file_access/py_load_cpu_tensor.py b/deepnvme/file_access/py_load_cpu_tensor.py index 9270c31b4..0650848f0 100644 --- a/deepnvme/file_access/py_load_cpu_tensor.py +++ b/deepnvme/file_access/py_load_cpu_tensor.py @@ -4,8 +4,8 @@ def file_read(inp_f): with open(inp_f, 'rb') as f: - t = torch.frombuffer(f.read(), dtype=torch.uint8) - return t + tensor = 
torch.frombuffer(f.read(), dtype=torch.uint8) + return tensor def main(): args = parse_read_arguments() diff --git a/deepnvme/file_access/py_load_gpu_tensor.py b/deepnvme/file_access/py_load_gpu_tensor.py index 57eca16f6..976967dca 100644 --- a/deepnvme/file_access/py_load_gpu_tensor.py +++ b/deepnvme/file_access/py_load_gpu_tensor.py @@ -4,8 +4,8 @@ def file_read(inp_f): with open(inp_f, 'rb') as f: - t = torch.frombuffer(f.read(), dtype=torch.uint8) - return t.cuda() + tensor = torch.frombuffer(f.read(), dtype=torch.uint8) + return tensor.cuda() def main(): args = parse_read_arguments() diff --git a/deepnvme/file_access/py_store_cpu_tensor.py b/deepnvme/file_access/py_store_cpu_tensor.py index f6a0d6c88..50e477186 100644 --- a/deepnvme/file_access/py_store_cpu_tensor.py +++ b/deepnvme/file_access/py_store_cpu_tensor.py @@ -3,9 +3,9 @@ import pathlib from utils import parse_write_arguments, GIGA_UNIT -def file_write(out_f, t): +def file_write(out_f, tensor): with open(out_f, 'wb') as f: - f.write(t.numpy(force=True)) + f.write(tensor.numpy(force=True)) def main(): args = parse_write_arguments() diff --git a/deepnvme/file_access/py_store_gpu_tensor.py b/deepnvme/file_access/py_store_gpu_tensor.py index b0f6fa387..a64209a12 100644 --- a/deepnvme/file_access/py_store_gpu_tensor.py +++ b/deepnvme/file_access/py_store_gpu_tensor.py @@ -3,9 +3,9 @@ import pathlib from utils import parse_write_arguments, GIGA_UNIT -def file_write(out_f, t): +def file_write(out_f, tensor): with open(out_f, 'wb') as f: - f.write(t.numpy(force=True)) + f.write(tensor.numpy(force=True)) def main(): args = parse_write_arguments() From 9b68819f3b383d34403d81db9858e8ddf20d782e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 14 Aug 2024 09:13:08 -0400 Subject: [PATCH 12/13] Add operator setup instructions --- deepnvme/file_access/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/deepnvme/file_access/README.md b/deepnvme/file_access/README.md 
index d9452829f..1183908d8 100644 --- a/deepnvme/file_access/README.md +++ b/deepnvme/file_access/README.md @@ -1,6 +1,6 @@ # Using DeepNVMe for simple file reads and writes involving CPU/GPU tensors -The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer (aio) and NVIDIA GPUDirect Storage (GDS) as appropriate. +The purpose of this folder is to provide example codes that illustrate how to use DeepNVMe for simple file operations of moving raw data bytes between persistent storage and CPU/GPU tensors. For each file operation, we provide an implementation using Python I/O functionality, and a DeepNVMe implementation using CPU bounce buffer (aio) and NVIDIA Magnum IOTM GPUDirect® Storage (GDS) as appropriate. The following table is a mapping of file operations to the corresponding Python and DeepNVMe implementations. @@ -20,12 +20,20 @@ Ensure your environment is properly configured to run these examples. First, you
-
ds_report output showing availability of DeepNVMe operators (async_io and gds) in a DeepSpeed installation.
+If the `async_io` operator is unavailable, you will need to install the appropriate `libaio` library binaries for your Linux flavor. For example, Ubuntu users will need to run `apt install libaio-dev`. In general, you should carefully inspect `ds_report` output for helpful tips such as the following: + +```bash +[WARNING] async_io requires the dev libaio .so object and headers but these were not found. +[WARNING] async_io: please install the libaio-dev package with apt +[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. +``` + +To enable the `gds` operator, you will need to install NVIDIA GDS by consulting the appropriate guide for [bare-metal systems](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html) or Azure VMs (coming soon). ## Tensor Load Examples The tensor load example scripts share a common command-line interface, which is illustrated below using `py_read_load_cpu_tensor.py`. 
From a595e39763fe4d1d9c1be8904112229c17df13ea Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 16 Aug 2024 10:10:57 -0400 Subject: [PATCH 13/13] Handle init with default args --- deepnvme/file_access/aio_load_cpu_tensor.py | 2 +- deepnvme/file_access/aio_load_gpu_tensor.py | 2 +- deepnvme/file_access/aio_store_cpu_tensor.py | 2 +- deepnvme/file_access/aio_store_gpu_tensor.py | 2 +- deepnvme/file_access/gds_load_gpu_tensor.py | 2 +- deepnvme/file_access/gds_store_gpu_tensor.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deepnvme/file_access/aio_load_cpu_tensor.py b/deepnvme/file_access/aio_load_cpu_tensor.py index c75d1ba7c..27a1e61c5 100644 --- a/deepnvme/file_access/aio_load_cpu_tensor.py +++ b/deepnvme/file_access/aio_load_cpu_tensor.py @@ -13,7 +13,7 @@ def main(): file_sz = os.path.getsize(input_file) cnt = args.loop - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + aio_handle = AsyncIOBuilder().load().aio_handle() bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) diff --git a/deepnvme/file_access/aio_load_gpu_tensor.py b/deepnvme/file_access/aio_load_gpu_tensor.py index 5720676a6..aeecc6e5d 100644 --- a/deepnvme/file_access/aio_load_gpu_tensor.py +++ b/deepnvme/file_access/aio_load_gpu_tensor.py @@ -14,7 +14,7 @@ def main(): file_sz = os.path.getsize(input_file) cnt = args.loop - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + aio_handle = AsyncIOBuilder().load().aio_handle() bounce_buffer = torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) diff --git a/deepnvme/file_access/aio_store_cpu_tensor.py b/deepnvme/file_access/aio_store_cpu_tensor.py index 6f257e5a2..20c03792b 100644 --- a/deepnvme/file_access/aio_store_cpu_tensor.py +++ 
b/deepnvme/file_access/aio_store_cpu_tensor.py @@ -15,7 +15,7 @@ def main(): file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + aio_handle = AsyncIOBuilder().load().aio_handle() bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() diff --git a/deepnvme/file_access/aio_store_gpu_tensor.py b/deepnvme/file_access/aio_store_gpu_tensor.py index d00ee17c3..71a4aa7bb 100644 --- a/deepnvme/file_access/aio_store_gpu_tensor.py +++ b/deepnvme/file_access/aio_store_gpu_tensor.py @@ -15,7 +15,7 @@ def main(): file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - aio_handle = AsyncIOBuilder().load().aio_handle(1024**2, 128, True, True, 1) + aio_handle = AsyncIOBuilder().load().aio_handle() bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() diff --git a/deepnvme/file_access/gds_load_gpu_tensor.py b/deepnvme/file_access/gds_load_gpu_tensor.py index a9220517f..dd6273707 100644 --- a/deepnvme/file_access/gds_load_gpu_tensor.py +++ b/deepnvme/file_access/gds_load_gpu_tensor.py @@ -13,7 +13,7 @@ def main(): file_sz = os.path.getsize(input_file) cnt = args.loop - gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) + gds_handle = GDSBuilder().load().gds_handle() gds_buffer = gds_handle.new_pinned_device_tensor(file_sz, torch.empty(0, dtype=torch.uint8, device='cuda', requires_grad=False)) t = timeit.Timer(functools.partial(file_read, input_file, gds_handle, gds_buffer)) diff --git a/deepnvme/file_access/gds_store_gpu_tensor.py b/deepnvme/file_access/gds_store_gpu_tensor.py index 0b1b9a542..06ba508ba 100644 --- a/deepnvme/file_access/gds_store_gpu_tensor.py +++ b/deepnvme/file_access/gds_store_gpu_tensor.py @@ -15,7 +15,7 @@ def main(): file_sz = args.mb_size*(1024**2) 
app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cuda', requires_grad=False) - gds_handle = GDSBuilder().load().gds_handle(1024**2, 128, True, True, 1) + gds_handle = GDSBuilder().load().gds_handle() gds_buffer = gds_handle.new_pinned_device_tensor(file_sz, torch.empty(0, dtype=torch.uint8, device='cuda', requires_grad=False)) t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, gds_handle, gds_buffer))