Commit: added h5json_writer

jreadey committed Feb 23, 2025
1 parent 2f546b9 commit 4b9cb68
Showing 9 changed files with 719 additions and 33 deletions.
2 changes: 1 addition & 1 deletion src/h5json/dset_util.py
@@ -94,7 +94,7 @@ def make_new_dset(


# TBD - other properties
dset_json = {"shape": shape_json, "type": type_json, "cpl": cpl}
dset_json = {"shape": shape_json, "type": type_json, "cpl": cpl, "attributes": {}}
dset_json["created"] = time.time()
dset_json["modified"] = None

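For reference, a sketch of the dataset skeleton this one-line change produces; the shape and type values below are illustrative, not taken from the commit:

# hypothetical make_new_dset result for a small integer dataset
dset_json = {
    "shape": {"class": "H5S_SIMPLE", "dims": [10]},
    "type": {"class": "H5T_INTEGER", "base": "H5T_STD_I32LE"},
    "cpl": {},
    "attributes": {},  # new in this commit: datasets start with an empty attributes dict
}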
256 changes: 256 additions & 0 deletions src/h5json/h5json_writer.py
@@ -0,0 +1,256 @@
##############################################################################
# Copyright by The HDF Group. #
# All rights reserved. #
# #
# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and #
# Utilities. The full HDF5 REST Server copyright notice, including #
# terms governing use, modification, and redistribution, is contained in #
# the file COPYING, which can be found at the root of the source code #
# distribution tree. If you do not have access to this file, you may #
# request a copy from help@hdfgroup.org. #
##############################################################################

import json

from .h5writer import H5Writer
from .objid import stripId, getCollectionForId

class H5JsonWriter(H5Writer):
    """
    Writer class that serializes the contents of an Hdf5db instance to an
    HDF5-JSON formatted document.
    """

def __init__(
self,
filepath,
append=False,
no_data=False,
app_logger=None
):
super().__init__(filepath, append=append, app_logger=app_logger)
self.alias_db = {}
self.json = {}
self._no_data = no_data
self._root_uuid = None

def flush(self):
""" Write dirty items """
        # the json writer doesn't support incremental updates, so wait
        # for close() to write out the database
self.log.info("flush")

def close(self):
""" close storage handle """
self.dumpFile()


def _setAlias(self, obj_id, id_set, h5path):
""" add the given h5path to the object's alias list
If the object is a group, recurse through each hard link """
obj_json = self.db.getObjectById(obj_id)
alias_list = self.alias_db[obj_id]
if h5path in alias_list:
return # nothing to do
alias_list.append(h5path)
if getCollectionForId(obj_id) != "groups":
return # done
id_set.add(obj_id) # keep track of objects we've visited to avoid loops
links = obj_json["links"]
if h5path[-1] != '/':
h5path += '/'

for link_name in links:
link_json = links[link_name]
if link_json["class"] == "H5L_TYPE_HARD":
tgt_id = link_json["id"]
if tgt_id in id_set:
self.log.info(f"_setAlias - circular loop found")
else:
self._setAlias(tgt_id, id_set, h5path+link_name)
id_set.remove(obj_id)
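        # Example: if a group's hard link points back to an ancestor (a
        # cycle), the id_set check above logs the loop and skips the
        # recursive call; removing obj_id on the way out keeps other,
        # non-cyclic paths to the same object discoverable.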

def getAliasList(self):
""" update the alias list for each object """
        # clear existing aliases
obj_ids = self.db.getCollection()
for obj_id in obj_ids:
self.alias_db[obj_id] = []

self._setAlias(self._root_uuid, set(), "/")


def dumpAttribute(self, obj_id, attr_name):
self.log.info(f"dumpAttribute: [{attr_name}]")
item = self.db.getAttribute(obj_id, attr_name)
response = {"name": attr_name}
response["type"] = item["type"]
response["shape"] = item["shape"]
        if "value" not in item:
            self.log.warning("no value key in attribute: " + attr_name)
        else:
            response["value"] = item["value"]
return response

def dumpAttributes(self, obj_id):
attrs = self.db.getAttributes(obj_id)
self.log.info(f"dumpAttributes: {obj_id}")
items = []
for attr_name in attrs:
item = self.dumpAttribute(obj_id, attr_name)
items.append(item)

return items

def dumpLink(self, obj_id, name):
item = self.db.getLink(obj_id, name)
response = {"class": item["class"]}
if "id" in item:
tgt_id = item["id"]
response["collection"] = getCollectionForId(tgt_id)
response["id"] = stripId(tgt_id)

for key in item:
if key in ("id", "created", "modified"):
continue
response[key] = item[key]
response["title"] = name
return response

def dumpLinks(self, obj_id):
links = self.db.getLinks(obj_id)
items = []
for link_name in links:
item = self.dumpLink(obj_id, link_name)
items.append(item)
return items

def dumpGroup(self, obj_id):
item = self.db.getObjectById(obj_id)
response = {}
alias = self.alias_db[obj_id]
response["alias"] = alias

if "cpl" in item:
item["creationProperties"] = item["cpl"]
attributes = self.dumpAttributes(obj_id)
if attributes:
response["attributes"] = attributes
links = self.dumpLinks(obj_id)
if links:
response["links"] = links
return response

def dumpGroups(self):
groups = {}
item = self.dumpGroup(self._root_uuid)
root_uuid = stripId(self._root_uuid)
groups[root_uuid] = item
obj_ids = self.db.getCollection("groups")
for obj_id in obj_ids:
if obj_id == self._root_uuid:
continue
item = self.dumpGroup(obj_id)
obj_uuid = stripId(obj_id)
groups[obj_uuid] = item

self.json["groups"] = groups

def dumpDataset(self, obj_id):
response = {}
self.log.info("dumpDataset: " + obj_id)
item = self.db.getObjectById(obj_id)
if "alias" in item:
alias = item["alias"]
if alias:
self.log.info(f"dumpDataset alias: [{alias[0]}]")
response["alias"] = item["alias"]

response["type"] = item["type"]
shapeItem = item["shape"]
shape_rsp = {}
num_elements = 1
shape_rsp["class"] = shapeItem["class"]
if "dims" in shapeItem:
shape_rsp["dims"] = shapeItem["dims"]
for dim in shapeItem["dims"]:
num_elements *= dim
if "maxdims" in shapeItem:
maxdims = []
for dim in shapeItem["maxdims"]:
if dim == 0:
maxdims.append("H5S_UNLIMITED")
else:
maxdims.append(dim)
shape_rsp["maxdims"] = maxdims
response["shape"] = shape_rsp

if "cpl" in item:
response["creationProperties"] = item["cpl"]

attributes = self.dumpAttributes(obj_id)
if attributes:
response["attributes"] = attributes

if not self._no_data:
if num_elements > 0:
value = self.db.getDatasetValues(obj_id)
response["value"] = value # dump values unless header flag was passed
else:
response["value"] = [] # empty list
return response

def dumpDatasets(self):
obj_ids = self.db.getCollection("datasets")
if obj_ids:
datasets = {}
for obj_id in obj_ids:
item = self.dumpDataset(obj_id)
                datasets[stripId(obj_id)] = item  # key by bare uuid, as in dumpGroups

self.json["datasets"] = datasets

def dumpDatatype(self, obj_id):
response = {}
item = self.db.getObjectById(obj_id)
response["alias"] = item["alias"]
response["type"] = item["type"]
if "cpl" in item:
response["creationProperties"] = item["cpl"]
attributes = self.dumpAttributes(obj_id)
if attributes:
response["attributes"] = attributes
return response

def dumpDatatypes(self):
obj_ids = self.db.getCollection("datatypes")
if obj_ids:
datatypes = {}
for obj_id in obj_ids:
item = self.dumpDatatype(obj_id)
                datatypes[stripId(obj_id)] = item  # key by bare uuid, as in dumpGroups

self.json["datatypes"] = datatypes


def dumpFile(self):
self._root_uuid = self.db.getObjectIdByPath("/")

db_version_info = self.db.getVersionInfo()

self.json["apiVersion"] = db_version_info["hdf5-json-version"]
self.json["root"] = stripId(self._root_uuid)
self.getAliasList() # create alias_db with obj_id to alias list dict
self.dumpGroups()

self.dumpDatasets()

self.dumpDatatypes()

        # TBD - write to the output file; for now the JSON text goes to stdout
        print(json.dumps(self.json, sort_keys=True, indent=4))
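
For context, a usage sketch for this writer: it reads everything through its db attribute and emits the document from close(). How db gets attached isn't shown in this commit, so the wiring below is hypothetical:

from h5json.h5json_writer import H5JsonWriter

# db is assumed to be an Hdf5db-style object exposing getCollection(),
# getObjectById(), getAttributes(), getLinks(), getDatasetValues(), etc.
writer = H5JsonWriter("out.json", no_data=True)  # no_data=True skips dataset values
writer.db = db        # hypothetical wiring
writer.close()        # close() calls dumpFile(), which emits the JSON document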



15 changes: 8 additions & 7 deletions src/h5json/h5py_reader.py
@@ -9,8 +9,6 @@
# distribution tree. If you do not have access to this file, you may #
# request a copy from help@hdfgroup.org. #
##############################################################################
-import logging
-
import h5py
import numpy as np

@@ -42,13 +40,16 @@ def __init__(
filepath,
app_logger=None
):
self._id_map = {}
self._addr_map = {}
"""
if app_logger:
self.log = app_logger
else:
self.log = logging.getLogger()
self._id_map = {}
self._addr_map = {}
self._filepath = filepath
"""
super().__init__(filepath, app_logger=app_logger)
f = h5py.File(self._filepath)
self._f = f
self._root_id = createObjId(obj_type="groups")
@@ -182,7 +183,7 @@ def _getLinks(self, grp):
return items

def _getGroup(self, grp, include_links=True):
self.log.info("_getGroup alias: [{grp.name}]")
self.log.info(f"_getGroup alias: [{grp.name}]")

item = {"alias": grp.name}

@@ -192,15 +193,15 @@ def _getDatatype(self, ctype, include_attrs=True):
return item

def _getDatatype(self, ctype, include_attrs=True):
self.log.info("getDatatype alias: ]{ctype.name}")
self.log.info(f"getDatatype alias: ]{ctype.name}")
item = {"alias": ctype.name}
item["type"] = getTypeItem(ctype.dtype)

return item


def _getDataset(self, dset):
self.log.info("getDataset alias: [{dset.name}]")
self.log.info(f"getDataset alias: [{dset.name}]")

item = {"alias": dset.name}

9 changes: 7 additions & 2 deletions src/h5json/h5reader.py
@@ -11,7 +11,7 @@
##############################################################################
from abc import ABC, abstractmethod


import logging


class H5Reader(ABC):
@@ -23,9 +23,14 @@ class H5Reader(ABC):

def __init__(
self,
-        filepath
+        filepath,
+        app_logger=None
):
self._filepath = filepath
if app_logger:
self.log = app_logger
else:
self.log = logging.getLogger()

@abstractmethod
def get_root_id(self):
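With this change, a concrete reader picks up its logger from the base class rather than configuring logging itself. A minimal sketch of the pattern; the subclass is hypothetical, and any abstract methods beyond the excerpt above are omitted:

from h5json.h5reader import H5Reader

class DummyReader(H5Reader):
    # illustration only: a do-nothing concrete reader
    def __init__(self, filepath, app_logger=None):
        super().__init__(filepath, app_logger=app_logger)
        self.log.info("reader created for: " + self._filepath)  # self.log set by the base class

    def get_root_id(self):
        return None  # stub for the abstract method shown above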
