From e5d85fd3fe670e3a095474a52d6b60b333c0096c Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 09:10:31 +0000 Subject: [PATCH 1/5] Minimal change to avoid a numpy deprecation failure. Without this all tests fail at python 3.10 and numpy 1.24.4. With it, they all pass. --- pyfive/dataobjects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 181f3cf..886b48f 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -197,7 +197,7 @@ def _attr_value(self, dtype, buf, count, offset): """ Retrieve an HDF5 attribute value from a buffer. """ if isinstance(dtype, tuple): dtype_class = dtype[0] - value = np.empty(count, dtype=np.object) + value = np.empty(count, dtype=object) for i in range(count): if dtype_class == 'VLEN_STRING': _, _, character_set = dtype From 473fa361d464b6c6066757897e93b26d243d35fd Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 09:17:04 +0000 Subject: [PATCH 2/5] Temporary class to explore the use of the chunk index, and eventually, hopefully, address both the needs of pyactivestorage (which needs access to the b-tree chunk index) and https://github.com/jjhelmus/pyfive/issues/6 --- pyfive/as_dataobjects.py | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 pyfive/as_dataobjects.py diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py new file mode 100644 index 0000000..b811fb3 --- /dev/null +++ b/pyfive/as_dataobjects.py @@ -0,0 +1,67 @@ +from .dataobjects import DataObjects, DATA_STORAGE_MSG_TYPE +from .datatype_msg import DatatypeMessage +import numpy as np +from .btree import BTreeV1RawDataChunks + +class ADataObjects(DataObjects): + """ + Subclass of DataObjets which access the chunk addresses for a given slice of data + """ + def __init__(self,*args,**kwargs): + """ + Initialise via super class + """ + super().__init__(*args,**kwargs) + + # not yet sure we need our own 
copy + self._as_chunk_index=[] + + def get_offset_addresses(self, args=None): + """ + Get the offset addresses for the data requested + """ + + # offset and size from data storage message + msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] + msg_offset = msg['offset_to_message'] + version, dims, layout_class, property_offset = ( + self._get_data_message_properties(msg_offset)) + + if layout_class == 0: # compact storage + raise NotImplementedError("Compact storage") + elif layout_class == 1: # contiguous storage + return NotImplementedError("Contiguous storage") + if layout_class == 2: # chunked storage + return self._as_get_chunk_addresses(args) + + + def _as_get_chunk_addresses(self, args): + """ + Get the offset addresses associated with all the chunks + known to the b-tree of this object + """ + self._get_chunk_params() + + if self._as_chunk_index == []: + chunk_btree = BTreeV1RawDataChunks( + self.fh, self._chunk_address, self._chunk_dims) + + count = np.prod(self.shape) + itemsize = np.dtype(self.dtype).itemsize + chunk_buffer_size = count * itemsize + + for node in chunk_btree.all_nodes[0]: + for node_key, addr in zip(node['keys'], node['addresses']): + size = chunk_buffer_size + if self.filter_pipeline: + size = node_key['chunk_size'] + start = node_key['chunk_offset'][:-1] + region = [slice(i, i+j) for i, j in zip(start, self.shape)] + self._as_chunk_index.append([region, start, size]) + + if args is not None: + return NotImplementedError + return self._as_chunk_index + + + From 3ee8f2553892e846b18d2ebe9525bd075e9f88d4 Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 09:19:09 +0000 Subject: [PATCH 3/5] Temporary file for use while constructing functionality. Will turn into a test at some point. 
--- bnl/playing.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 bnl/playing.py diff --git a/bnl/playing.py b/bnl/playing.py new file mode 100644 index 0000000..ebd6b01 --- /dev/null +++ b/bnl/playing.py @@ -0,0 +1,27 @@ +import pyfive +from pathlib import Path +from pyfive.as_dataobjects import ADataObjects + +MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' +MYFILE = '../tests/chunked.hdf5' +MYPATH = Path(__file__).parent + +#f = h5py.File(MYPATH/MYFILE,'r') +f2 = pyfive.File(MYPATH/MYFILE) +path = 'dataset1' +link_target = f2._links[path] +dsref = ADataObjects(f2.file._fh, link_target) +chunk_index = dsref.get_offset_addresses() +print(chunk_index) + + +#v='tos' +#tos =f2[v] +#v='dataset1' +#print(tos) +#x = tos[2,:] +#print(x) +#print(tos.shape) + + + From dd13033f29979c1e672d096c1426cb321ae7833d Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 10:45:20 +0000 Subject: [PATCH 4/5] Fixing the master (moved play to branch) --- bnl/playing.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 bnl/playing.py diff --git a/bnl/playing.py b/bnl/playing.py deleted file mode 100644 index ebd6b01..0000000 --- a/bnl/playing.py +++ /dev/null @@ -1,27 +0,0 @@ -import pyfive -from pathlib import Path -from pyfive.as_dataobjects import ADataObjects - -MYFILE = 'da193o_25_day__grid_T_198807-198807.nc' -MYFILE = '../tests/chunked.hdf5' -MYPATH = Path(__file__).parent - -#f = h5py.File(MYPATH/MYFILE,'r') -f2 = pyfive.File(MYPATH/MYFILE) -path = 'dataset1' -link_target = f2._links[path] -dsref = ADataObjects(f2.file._fh, link_target) -chunk_index = dsref.get_offset_addresses() -print(chunk_index) - - -#v='tos' -#tos =f2[v] -#v='dataset1' -#print(tos) -#x = tos[2,:] -#print(x) -#print(tos.shape) - - - From 719a2c8ac7c030cd419f2e2ec80a05ec5e2c5d3f Mon Sep 17 00:00:00 2001 From: Bryan Lawrence Date: Thu, 22 Feb 2024 10:51:23 +0000 Subject: [PATCH 5/5] This shouldn't be on the main right now 
(but it's still on the play branch). --- pyfive/as_dataobjects.py | 67 ---------------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 pyfive/as_dataobjects.py diff --git a/pyfive/as_dataobjects.py b/pyfive/as_dataobjects.py deleted file mode 100644 index b811fb3..0000000 --- a/pyfive/as_dataobjects.py +++ /dev/null @@ -1,67 +0,0 @@ -from .dataobjects import DataObjects, DATA_STORAGE_MSG_TYPE -from .datatype_msg import DatatypeMessage -import numpy as np -from .btree import BTreeV1RawDataChunks - -class ADataObjects(DataObjects): - """ - Subclass of DataObjets which access the chunk addresses for a given slice of data - """ - def __init__(self,*args,**kwargs): - """ - Initialise via super class - """ - super().__init__(*args,**kwargs) - - # not yet sure we need our own copy - self._as_chunk_index=[] - - def get_offset_addresses(self, args=None): - """ - Get the offset addresses for the data requested - """ - - # offset and size from data storage message - msg = self.find_msg_type(DATA_STORAGE_MSG_TYPE)[0] - msg_offset = msg['offset_to_message'] - version, dims, layout_class, property_offset = ( - self._get_data_message_properties(msg_offset)) - - if layout_class == 0: # compact storage - raise NotImplementedError("Compact storage") - elif layout_class == 1: # contiguous storage - return NotImplementedError("Contiguous storage") - if layout_class == 2: # chunked storage - return self._as_get_chunk_addresses(args) - - - def _as_get_chunk_addresses(self, args): - """ - Get the offset addresses associated with all the chunks - known to the b-tree of this object - """ - self._get_chunk_params() - - if self._as_chunk_index == []: - chunk_btree = BTreeV1RawDataChunks( - self.fh, self._chunk_address, self._chunk_dims) - - count = np.prod(self.shape) - itemsize = np.dtype(self.dtype).itemsize - chunk_buffer_size = count * itemsize - - for node in chunk_btree.all_nodes[0]: - for node_key, addr in zip(node['keys'], node['addresses']): - size = 
chunk_buffer_size - if self.filter_pipeline: - size = node_key['chunk_size'] - start = node_key['chunk_offset'][:-1] - region = [slice(i, i+j) for i, j in zip(start, self.shape)] - self._as_chunk_index.append([region, start, size]) - - if args is not None: - return NotImplementedError - return self._as_chunk_index - - -