Add lookup verb and verbs-taking-arguments infrastructure

- Verbs can now be defined with a class that encapsulates the CLI and notebook interfaces.
- Verbs use a registry similar to models and datasets, but without support for external verbs.
- Lookup verb with a CLI and notebook interface.
- Inference now creates an index of object_id -> batch to facilitate lookups in the batch numpy files.
- Backwards compatibility: Lookup verb will regenerate the object_id -> batch index if it does not exist.
- Stub implementation of similarity search verb.
lincc-frameworks · Jan 27, 2025 · c38b5bf · c38b5bf
1 parent 50ba387
commit c38b5bf
Show file tree

Hide file tree

Showing 8 changed files with 364 additions and 9 deletions.
diff --git a/src/fibad/fibad.py b/src/fibad/fibad.py
@@ -4,18 +4,17 @@
 from typing import Optional, Union
 
 from .config_utils import ConfigManager
+from .verbs.verb_registry import all_class_verbs, fetch_verb_class, is_verb_class
 
 
 class Fibad:
     """
     Overall class that represents an interface into fibad. Currently this encapsulates a configuration and is
-    the external interface to all verbs in a programmatic context.
+    the external interface to all verbs in a programmatic or notebook context.
 
     CLI functions in fibad_cli are implemented by calling this class
     """
 
-    verbs = ["train", "infer", "download", "prepare", "rebuild_manifest"]
-
     def __init__(self, *, config_file: Optional[Union[Path, str]] = None, setup_logging: bool = True):
         """Initialize fibad. Always applies the default config, and merges it with any provided config file.
 
@@ -193,3 +192,23 @@ def rebuild_manifest(self, **kwargs):
         from .rebuild_manifest import run
 
         return run(config=self.config, **kwargs)
+
+    # Python notebook interface to class verbs
+    # we need both __dir__ and __getattr__ so that the
+    # functions from the various verb classes appear to be
+    # methods on the fibad object
+    def __dir__(self):
+        return sorted(dir(Fibad) + list(self.__dict__.keys()) + all_class_verbs())
+
+    def __getattr__(self, name):
+        if not is_verb_class(name):
+            return None
+
+        # We return the run function on the verb class after
+        # just-in-time creating the verb so that a notebook user
+        # sees the function signature and help.
+        #
+        # It may be possible to do this with functools.partial techniques
+        # but should be tested.
+        verb_inst = fetch_verb_class(name)(config=self.config)
+        return verb_inst.run
diff --git a/src/fibad/fibad_default_config.toml b/src/fibad/fibad_default_config.toml
@@ -192,3 +192,7 @@ split = false
 
 # Whether to generate a chromadb vector database of inference results
 chromadb = true
+
+[results]
+# Path to inference results to use for visualization and lookups. Uses latest inference run if none provided.
+inference_dir = false
diff --git a/src/fibad/infer.py b/src/fibad/infer.py
@@ -54,6 +54,7 @@ def run(config: ConfigDict):
             },
         )
 
+    # These are values the _save_batch callback needs to run
     write_index = 0
     batch_index = 0
     object_ids: list[int] = []
@@ -93,18 +94,49 @@ def _save_batch(batch_results: Tensor):
         filename = f"batch_{batch_index}.npy"
         savepath = results_dir / filename
         if savepath.exists():
-            RuntimeError("The path to save results for object {object_id} already exists.")
+            RuntimeError(f"The path to save results for objects in batch {batch_index} already exists.")
 
         np.save(savepath, structured_batch, allow_pickle=False)
 
         batch_index += 1
         write_index += batch_len
 
+    # Run inference
     evaluator = create_evaluator(model, _save_batch)
     evaluator.run(data_loader)
 
-    logger.info(f"Results saved in {results_dir}")
-    logger.info("finished evaluating...")
+    # Write out a dictionary to map IDs->Batch
+    batch_size = config["data_loader"]["batch_size"]
+    batch_nums = np.array([np.full(batch_size, i) for i in range(0, batch_index)]).ravel()
+    save_batch_index(results_dir, np.array(object_ids), batch_nums[: len(object_ids)])
+
+    # Log completion
+    logger.info(f"Inference Results saved in {results_dir}")
+
+
+def save_batch_index(results_dir: Path, ids: np.ndarray, batch_nums: np.ndarray):
+    """Save a batch index in the result directory provided
+
+    Parameters
+    ----------
+    results_dir : Path
+        The results directory
+    ids : np.ndarray
+        All IDs to write out.
+    batch_nums : np.ndarray
+        The corresponding batch numbers for the IDs provided.
+    """
+    batch_index_dtype = np.dtype([("id", np.int64), ("batch_num", np.int64)])
+    batch_index = np.zeros(len(ids), batch_index_dtype)
+    batch_index["id"] = np.array(ids)
+    batch_index["batch_num"] = np.array(batch_nums)
+    batch_index.sort(order="id")
+
+    filename = "batch_index.npy"
+    savepath = results_dir / filename
+    if savepath.exists():
+        RuntimeError("The path to save batch index already exists.")
+    np.save(savepath, batch_index, allow_pickle=False)
 
 
 def load_model_weights(config: ConfigDict, model):

diff --git a/src/fibad/verbs/__init__.py b/src/fibad/verbs/__init__.py
@@ -0,0 +1,3 @@
+from .verb_registry import all_class_verbs, all_verbs, fetch_verb_class, is_verb_class
+
+__all__ = ["VERB_REGISTRY", "is_verb_class", "fetch_verb_class", "all_class_verbs", "all_verbs"]
diff --git a/src/fibad/verbs/lookup.py b/src/fibad/verbs/lookup.py
@@ -0,0 +1,137 @@
+import logging
+import re
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from typing import Optional, Union
+
+import numpy as np
+
+from fibad.config_utils import find_most_recent_results_dir
+from fibad.infer import save_batch_index
+
+from .verb_registry import Verb, fibad_verb
+
+logger = logging.getLogger(__name__)
+
+
+@fibad_verb
+class Lookup(Verb):
+    """Look up an inference result using the ID of a data member"""
+
+    cli_name = "lookup"
+    add_parser_kwargs = {}
+
+    @staticmethod
+    def setup_parser(parser: ArgumentParser):
+        """Set up our arguments by configuring a subparser
+
+        Parameters
+        ----------
+        parser : ArgumentParser
+            The sub-parser to configure
+        """
+        parser.add_argument("-i", "--id", type=str, required=True, help="ID of image")
+        parser.add_argument(
+            "-r", "--results-dir", type=str, required=False, help="Directory containing inference results."
+        )
+
+    def run_cli(self, args: Optional[Namespace] = None):
+        """Entrypoint to Lookup from the CLI.
+
+        Parameters
+        ----------
+        args : Optional[Namespace], optional
+            The parsed command line arguments
+
+        """
+        logger.info("Lookup run from cli")
+        if args is None:
+            raise RuntimeError("Run CLI called with no arguments.")
+        # This is where we map from CLI parsed args to a
+        # self.run (args) call.
+        vector = self.run(id=args.id, results_dir=args.results_dir)
+        if vector is None:
+            logger.info("No inference result found")
+        else:
+            logger.info("Inference result found")
+            print(vector)
+
+    def run(self, id: str, results_dir: Optional[Union[Path, str]]) -> Optional[np.ndarray]:
+        """Lookup the latent-space representation of a particular ID
+
+        Requires the relevant dataset to be configured, and for inference to have been run.
+
+        Parameters
+        ----------
+        id : str
+            The ID of the input data to look up the inference result
+
+        results_dir : str, Optional
+            The directory containing the inference results.
+
+        Returns
+        -------
+        Optional[np.ndarray]
+            The output tensor of the model for the given input.
+        """
+        if results_dir is None:
+            if self.config["results"]["inference_dir"]:
+                results_dir = self.config["results"]["inference_dir"]
+            else:
+                results_dir = find_most_recent_results_dir(self.config, verb="infer")
+                msg = f"Using most recent results dir {results_dir} for lookup."
+                msg += "Use the [results] inference_dir config to set a directory or pass it to this verb."
+                logger.info(msg)
+
+        if results_dir is None:
+            msg = "Could not find a results directory. Run infer or use "
+            msg += "[results] inference_dir config to specify a directory"
+            logger.error(msg)
+            return None
+
+        if isinstance(results_dir, str):
+            results_dir = Path(results_dir)
+
+        # Open the batch index numpy file.
+        # If the index does not exist, rebuild it from the batch files first.
+        batch_index_path = results_dir / "batch_index.npy"
+        if not batch_index_path.exists():
+            self.create_index(results_dir)
+
+        batch_index = np.load(results_dir / "batch_index.npy")
+        batch_num = batch_index[batch_index["id"] == int(id)]["batch_num"]
+        if len(batch_num) == 0:
+            return None
+        batch_num = batch_num[0]
+
+        recarray = np.load(results_dir / f"batch_{batch_num}.npy")
+        tensor = recarray[recarray["id"] == int(id)]["tensor"]
+        if len(tensor) == 0:
+            return None
+
+        return np.array(tensor[0])
+
+    def create_index(self, results_dir: Path):
+        """Recreate the index into the batch numpy files
+
+        Parameters
+        ----------
+        results_dir : Path
+            Path to the batch numpy files
+        """
+        ids = []
+        batch_nums = []
+        # Use the batched numpy files to assemble an index.
+        logger.info("Recreating index...")
+        for file in results_dir.glob("batch_*.npy"):
+            print(".", end="", flush=True)
+            m = re.match(r"batch_([0-9]+).npy", file.name)
+            if m is None:
+                logger.warn(f"Could not find batch number for {file}")
+                continue
+            batch_num = int(m[1])
+            recarray = np.load(file)
+            ids += list(recarray["id"])
+            batch_nums += [batch_num] * len(recarray["id"])
+
+        save_batch_index(results_dir, np.array(ids), np.array(batch_nums))
diff --git a/src/fibad/verbs/search.py b/src/fibad/verbs/search.py
@@ -0,0 +1,49 @@
+import logging
+from argparse import ArgumentParser, Namespace
+from typing import Optional
+
+from .verb_registry import Verb, fibad_verb
+
+logger = logging.getLogger(__name__)
+
+
+@fibad_verb
+class Search(Verb):
+    """Stub of similarity search"""
+
+    cli_name = "search"
+    add_parser_kwargs = {}
+
+    @staticmethod
+    def setup_parser(parser: ArgumentParser):
+        """Stub of parser setup"""
+        parser.add_argument("-i", "--image-file", type=str, help="Path to image file", required=True)
+
+    # If both of these move to the verb superclass then a new verb is basically:
+    #
+    # If you want no args, just make the class and define run(self).
+    # If you want args:
+    #     1) write setup_parser (which sets up the ArgumentParser and name/type info for the cli run)
+    #     2) write run(self, <your args>) to do what you want
+    #
+
+    # Should there be a version of this on the base class which uses a dict on the Verb
+    # superclass to build the call to run based on what the subclass verb defined in setup_parser?
+    def run_cli(self, args: Optional[Namespace] = None):
+        """Stub CLI implementation"""
+        logger.info("Search run from cli")
+        if args is None:
+            raise RuntimeError("Run CLI called with no arguments.")
+        # This is where we map from CLI parsed args to a
+        # self.run (args) call.
+        return self.run(image_file=args.image_file)
+
+    def run(self, image_file: str):
+        """Search for... todo
+
+        Parameters
+        ----------
+        image_file : str
+            _description_
+        """
+        logger.info(f"Got Image {image_file}")
diff --git a/src/fibad/verbs/verb_registry.py b/src/fibad/verbs/verb_registry.py
@@ -0,0 +1,82 @@
+import logging
+from abc import ABC
+from typing import Optional
+
+from fibad.config_utils import ConfigDict
+from fibad.plugin_utils import update_registry
+
+logger = logging.getLogger(__name__)
+
+
+class Verb(ABC):
+    """Base class for all fibad verbs"""
+
+    # Verbs get to define how their parser gets added to the main parser.
+    # This empty default is used when a verb does not define any keyword
+    # args for subparser.add_parser().
+    add_parser_kwargs: dict[str, str] = {}
+
+    def __init__(self, config: ConfigDict):
+        """Overall initialization for all verbs that saves the config"""
+        self.config = config
+
+
+# Verbs with no class are assumed to have a function in fibad.py which
+# performs their function. All other verbs should be defined by named classes
+# in fibad.verbs and use the @fibad_verb decorator
+VERB_REGISTRY: dict[str, Optional[type[Verb]]] = {
+    "train": None,
+    "infer": None,
+    "download": None,
+    "prepare": None,
+    "rebuild_manifest": None,
+}
+
+
+def fibad_verb(cls: type[Verb]) -> type[Verb]:
+    """Decorator to Register a fibad verb"""
+    update_registry(VERB_REGISTRY, cls.cli_name, cls)  # type: ignore[attr-defined]
+    return cls
+
+
+def all_verbs() -> list[str]:
+    """Returns All verbs that are currently registered"""
+    return [verb for verb in VERB_REGISTRY]
+
+
+def all_class_verbs() -> list[str]:
+    """Returns All verbs that are currently registered with a class-based implementation"""
+    return [verb for verb in VERB_REGISTRY if VERB_REGISTRY.get(verb) is not None]
+
+
+def is_verb_class(cli_name: str) -> bool:
+    """Returns true if the verb has a class based implementation
+
+    Parameters
+    ----------
+    cli_name : str
+        The name of the verb on the command line interface
+
+    Returns
+    -------
+    bool
+        True if the verb has a class-based implementation
+    """
+    return cli_name in VERB_REGISTRY and VERB_REGISTRY.get(cli_name) is not None
+
+
+def fetch_verb_class(cli_name: str) -> Optional[type[Verb]]:
+    """Gives the class object for the named verb
+
+    Parameters
+    ----------
+    cli_name : str
+        The name of the verb on the command line interface
+
+
+    Returns
+    -------
+    Optional[type[Verb]]
+        The verb class or None if no such verb class exists.
+    """
+    return VERB_REGISTRY.get(cli_name)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .verb_registry import all_class_verbs, all_verbs, fetch_verb_class, is_verb_class

		__all__ = ["VERB_REGISTRY", "is_verb_class", "fetch_verb_class", "all_class_verbs", "all_verbs"]