From c38b5bf1d351a97bd30f9d7d781906247c12f8c0 Mon Sep 17 00:00:00 2001
From: Michael Tauraso
Date: Fri, 17 Jan 2025 17:24:05 -0800
Subject: [PATCH] Add lookup verb and verbs-taking-arguments infrastructure

- Verbs can now be defined with a class that encapsulates the cli and notebook interfaces.
- Verbs use a registry similar to models and datasets, but without support for external verbs
- Lookup verb with a CLI and notebook interface
- Inference now creates an index of object_id -> batch to facilitate lookups in the batch numpy files
- Backwards compatibility: Lookup verb will regenerate the object_id -> batch index if it does not exist.
- Stub implementation of similarity search verb.
---
Reviewer notes (ignored by git-am, not part of the commit message):
  * This patch was reconstructed from a whitespace-collapsed copy. Blank
    context lines and indentation were re-inserted so that every hunk
    matches its recorded line counts; verify the context against the tree
    before applying.
  * Review fixes folded in. All are line-count neutral, so the hunk headers
    and the diffstat are unchanged; the blob hashes on the "index" lines
    predate these fixes.
    - Fibad.__getattr__ raises AttributeError for unknown names instead of
      returning None (keeps hasattr() and typo detection working).
    - infer.py now actually raises the RuntimeErrors it constructs.
    - fibad.verbs imports VERB_REGISTRY, which __all__ already advertised.
    - Lookup.run: results_dir defaults to None as documented; the two
      sentences of the info message are separated by a space.
    - Lookup.create_index: logger.warning instead of the deprecated
      logger.warn, and a fully anchored batch-file regex.
    - fibad_cli.main: dispatch on args.verb rather than the stale loop
      variable cli_name, drop the debug print, and exit through sys.exit.

 src/fibad/fibad.py                  |  25 ++++-
 src/fibad/fibad_default_config.toml |   4 +
 src/fibad/infer.py                  |  38 +++++++-
 src/fibad/verbs/__init__.py         |   3 +
 src/fibad/verbs/lookup.py           | 137 ++++++++++++++++++++++++++++
 src/fibad/verbs/search.py           |  49 ++++++++++
 src/fibad/verbs/verb_registry.py    |  82 +++++++++++++++++
 src/fibad_cli/main.py               |  35 ++++++-
 8 files changed, 364 insertions(+), 9 deletions(-)
 create mode 100644 src/fibad/verbs/__init__.py
 create mode 100644 src/fibad/verbs/lookup.py
 create mode 100644 src/fibad/verbs/search.py
 create mode 100644 src/fibad/verbs/verb_registry.py

diff --git a/src/fibad/fibad.py b/src/fibad/fibad.py
index 43c09ddc..dc882d7a 100644
--- a/src/fibad/fibad.py
+++ b/src/fibad/fibad.py
@@ -4,18 +4,17 @@
 from typing import Optional, Union
 
 from .config_utils import ConfigManager
+from .verbs.verb_registry import all_class_verbs, fetch_verb_class, is_verb_class
 
 
 class Fibad:
     """
     Overall class that represents an interface into fibad. Currently this encapsulates a configuration and is
-    the external interface to all verbs in a programmatic context.
+    the external interface to all verbs in a programmatic or notebook context.
 
     CLI functions in fibad_cli are implemented by calling this class
     """
 
-    verbs = ["train", "infer", "download", "prepare", "rebuild_manifest"]
-
     def __init__(self, *, config_file: Optional[Union[Path, str]] = None, setup_logging: bool = True):
         """Initialize fibad. Always applies the default config, and merges it with any provided config file.
 
@@ -193,3 +192,23 @@ def rebuild_manifest(self, **kwargs):
         from .rebuild_manifest import run
 
         return run(config=self.config, **kwargs)
+
+    # Python notebook interface to class verbs
+    # we need both __dir__ and __getattr__ so that the
+    # functions from the various verb classes appear to be
+    # methods on the fibad object
+    def __dir__(self):
+        return sorted(dir(Fibad) + list(self.__dict__.keys()) + all_class_verbs())
+
+    def __getattr__(self, name):
+        if not is_verb_class(name):
+            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
+
+        # We return the run function on the verb class after
+        # just-in-time creating the verb so that a notebook user
+        # sees the function signature and help.
+        #
+        # It may be possible to do this with functools.partial techniques
+        # but should be tested.
+        verb_inst = fetch_verb_class(name)(config=self.config)
+        return verb_inst.run
diff --git a/src/fibad/fibad_default_config.toml b/src/fibad/fibad_default_config.toml
index 42a51cb0..d691aa27 100644
--- a/src/fibad/fibad_default_config.toml
+++ b/src/fibad/fibad_default_config.toml
@@ -192,3 +192,7 @@ split = false
 
 # Whether to generate a chromadb vector database of inference results
 chromadb = true
+
+[results]
+# Path to inference results to use for visualization and lookups. Uses latest inference run if none provided.
+inference_dir = false
\ No newline at end of file
diff --git a/src/fibad/infer.py b/src/fibad/infer.py
index 305a04d0..a9edadb5 100644
--- a/src/fibad/infer.py
+++ b/src/fibad/infer.py
@@ -54,6 +54,7 @@ def run(config: ConfigDict):
         },
     )
 
+    # These are values the _save_batch callback needs to run
     write_index = 0
     batch_index = 0
     object_ids: list[int] = []
@@ -93,18 +94,49 @@ def _save_batch(batch_results: Tensor):
         filename = f"batch_{batch_index}.npy"
         savepath = results_dir / filename
         if savepath.exists():
-            RuntimeError("The path to save results for object {object_id} already exists.")
+            raise RuntimeError(f"The path to save results for objects in batch {batch_index} already exists.")
 
         np.save(savepath, structured_batch, allow_pickle=False)
 
         batch_index += 1
         write_index += batch_len
 
+    # Run inference
     evaluator = create_evaluator(model, _save_batch)
     evaluator.run(data_loader)
-    logger.info(f"Results saved in {results_dir}")
 
-    logger.info("finished evaluating...")
+    # Write out a dictionary to map IDs->Batch
+    batch_size = config["data_loader"]["batch_size"]
+    batch_nums = np.array([np.full(batch_size, i) for i in range(0, batch_index)]).ravel()
+    save_batch_index(results_dir, np.array(object_ids), batch_nums[: len(object_ids)])
+
+    # Log completion
+    logger.info(f"Inference Results saved in {results_dir}")
+
+
+def save_batch_index(results_dir: Path, ids: np.ndarray, batch_nums: np.ndarray):
+    """Save a batch index in the result directory provided
+
+    Parameters
+    ----------
+    results_dir : Path
+        The results directory
+    ids : np.ndarray
+        All IDs to write out.
+    batch_nums : np.ndarray
+        The corresponding batch numbers for the IDs provided.
+    """
+    batch_index_dtype = np.dtype([("id", np.int64), ("batch_num", np.int64)])
+    batch_index = np.zeros(len(ids), batch_index_dtype)
+    batch_index["id"] = np.array(ids)
+    batch_index["batch_num"] = np.array(batch_nums)
+    batch_index.sort(order="id")
+
+    filename = "batch_index.npy"
+    savepath = results_dir / filename
+    if savepath.exists():
+        raise RuntimeError("The path to save batch index already exists.")
+    np.save(savepath, batch_index, allow_pickle=False)
 
 
 def load_model_weights(config: ConfigDict, model):
diff --git a/src/fibad/verbs/__init__.py b/src/fibad/verbs/__init__.py
new file mode 100644
index 00000000..dc1a2855
--- /dev/null
+++ b/src/fibad/verbs/__init__.py
@@ -0,0 +1,3 @@
+from .verb_registry import VERB_REGISTRY, all_class_verbs, all_verbs, fetch_verb_class, is_verb_class
+
+__all__ = ["VERB_REGISTRY", "is_verb_class", "fetch_verb_class", "all_class_verbs", "all_verbs"]
diff --git a/src/fibad/verbs/lookup.py b/src/fibad/verbs/lookup.py
new file mode 100644
index 00000000..ff383445
--- /dev/null
+++ b/src/fibad/verbs/lookup.py
@@ -0,0 +1,137 @@
+import logging
+import re
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from typing import Optional, Union
+
+import numpy as np
+
+from fibad.config_utils import find_most_recent_results_dir
+from fibad.infer import save_batch_index
+
+from .verb_registry import Verb, fibad_verb
+
+logger = logging.getLogger(__name__)
+
+
+@fibad_verb
+class Lookup(Verb):
+    """Look up an inference result using the ID of a data member"""
+
+    cli_name = "lookup"
+    add_parser_kwargs = {}
+
+    @staticmethod
+    def setup_parser(parser: ArgumentParser):
+        """Set up our arguments by configuring a subparser
+
+        Parameters
+        ----------
+        parser : ArgumentParser
+            The sub-parser to configure
+        """
+        parser.add_argument("-i", "--id", type=str, required=True, help="ID of image")
+        parser.add_argument(
+            "-r", "--results-dir", type=str, required=False, help="Directory containing inference results."
+        )
+
+    def run_cli(self, args: Optional[Namespace] = None):
+        """Entrypoint to Lookup from the CLI.
+
+        Parameters
+        ----------
+        args : Optional[Namespace], optional
+            The parsed command line arguments
+
+        """
+        logger.info("Lookup run from cli")
+        if args is None:
+            raise RuntimeError("Run CLI called with no arguments.")
+        # This is where we map from CLI parsed args to a
+        # self.run (args) call.
+        vector = self.run(id=args.id, results_dir=args.results_dir)
+        if vector is None:
+            logger.info("No inference result found")
+        else:
+            logger.info("Inference result found")
+            print(vector)
+
+    def run(self, id: str, results_dir: Optional[Union[Path, str]] = None) -> Optional[np.ndarray]:
+        """Lookup the latent-space representation of a particular ID
+
+        Requires the relevant dataset to be configured, and for inference to have been run.
+
+        Parameters
+        ----------
+        id : str
+            The ID of the input data to look up the inference result
+
+        results_dir : str, Optional
+            The directory containing the inference results.
+
+        Returns
+        -------
+        Optional[np.ndarray]
+            The output tensor of the model for the given input.
+        """
+        if results_dir is None:
+            if self.config["results"]["inference_dir"]:
+                results_dir = self.config["results"]["inference_dir"]
+            else:
+                results_dir = find_most_recent_results_dir(self.config, verb="infer")
+                msg = f"Using most recent results dir {results_dir} for lookup."
+                msg += " Use the [results] inference_dir config to set a directory or pass it to this verb."
+                logger.info(msg)
+
+        if results_dir is None:
+            msg = "Could not find a results directory. Run infer or use "
+            msg += "[results] inference_dir config to specify a directory"
+            logger.error(msg)
+            return None
+
+        if isinstance(results_dir, str):
+            results_dir = Path(results_dir)
+
+        # Open the batch index numpy file.
+        # Loop over files and create if it does not exist
+        batch_index_path = results_dir / "batch_index.npy"
+        if not batch_index_path.exists():
+            self.create_index(results_dir)
+
+        batch_index = np.load(results_dir / "batch_index.npy")
+        batch_num = batch_index[batch_index["id"] == int(id)]["batch_num"]
+        if len(batch_num) == 0:
+            return None
+        batch_num = batch_num[0]
+
+        recarray = np.load(results_dir / f"batch_{batch_num}.npy")
+        tensor = recarray[recarray["id"] == int(id)]["tensor"]
+        if len(tensor) == 0:
+            return None
+
+        return np.array(tensor[0])
+
+    def create_index(self, results_dir: Path):
+        """Recreate the index into the batch numpy files
+
+        Parameters
+        ----------
+        results_dir : Path
+            Path to the batch numpy files
+        """
+        ids = []
+        batch_nums = []
+        # Use the batched numpy files to assemble an index.
+        logger.info("Recreating index...")
+        for file in results_dir.glob("batch_*.npy"):
+            print(".", end="", flush=True)
+            m = re.fullmatch(r"batch_([0-9]+)\.npy", file.name)
+            if m is None:
+                logger.warning(f"Could not find batch number for {file}")
+                continue
+            batch_num = int(m[1])
+            recarray = np.load(file)
+            ids += list(recarray["id"])
+            batch_nums += [batch_num] * len(recarray["id"])
+
+        save_batch_index(results_dir, np.array(ids), np.array(batch_nums))
diff --git a/src/fibad/verbs/search.py b/src/fibad/verbs/search.py
new file mode 100644
index 00000000..860009af
--- /dev/null
+++ b/src/fibad/verbs/search.py
@@ -0,0 +1,49 @@
+import logging
+from argparse import ArgumentParser, Namespace
+from typing import Optional
+
+from .verb_registry import Verb, fibad_verb
+
+logger = logging.getLogger(__name__)
+
+
+@fibad_verb
+class Search(Verb):
+    """Stub of similarity search"""
+
+    cli_name = "search"
+    add_parser_kwargs = {}
+
+    @staticmethod
+    def setup_parser(parser: ArgumentParser):
+        """Stub of parser setup"""
+        parser.add_argument("-i", "--image-file", type=str, help="Path to image file", required=True)
+
+    # If both of these move to the verb superclass then a new verb is basically
+    #
+    # If you want no args, just make the class, define run(self)
+    # If you want args
+    #   1) write setup_parser (which sets up for ArgumentParser and name/type info for cli run)
+    #   2) write run(self, ) to do what you want
+    #
+
+    # Should there be a version of this on the base class which uses a dict on the Verb
+    # superclass to build the call to run based on what the subclass verb defined in setup_parser
+    def run_cli(self, args: Optional[Namespace] = None):
+        """Stub CLI implementation"""
+        logger.info("Search run from cli")
+        if args is None:
+            raise RuntimeError("Run CLI called with no arguments.")
+        # This is where we map from CLI parsed args to a
+        # self.run (args) call.
+        return self.run(image_file=args.image_file)
+
+    def run(self, image_file: str):
+        """Search for... todo
+
+        Parameters
+        ----------
+        image_file : str
+            _description_
+        """
+        logger.info(f"Got Image {image_file}")
diff --git a/src/fibad/verbs/verb_registry.py b/src/fibad/verbs/verb_registry.py
new file mode 100644
index 00000000..e6703437
--- /dev/null
+++ b/src/fibad/verbs/verb_registry.py
@@ -0,0 +1,82 @@
+import logging
+from abc import ABC
+from typing import Optional
+
+from fibad.config_utils import ConfigDict
+from fibad.plugin_utils import update_registry
+
+logger = logging.getLogger(__name__)
+
+
+class Verb(ABC):
+    """Base class for all fibad verbs"""
+
+    # Verbs get to define how their parser gets added to the main parser
+    # This is given in case verbs do not define any keyword args for
+    # subparser.add_parser()
+    add_parser_kwargs: dict[str, str] = {}
+
+    def __init__(self, config: ConfigDict):
+        """Overall initialization for all verbs that saves the config"""
+        self.config = config
+
+
+# Verbs with no class are assumed to have a function in fibad.py which
+# performs their function. All other verbs should be defined by named classes
+# in fibad.verbs and use the @fibad_verb decorator
+VERB_REGISTRY: dict[str, Optional[type[Verb]]] = {
+    "train": None,
+    "infer": None,
+    "download": None,
+    "prepare": None,
+    "rebuild_manifest": None,
+}
+
+
+def fibad_verb(cls: type[Verb]) -> type[Verb]:
+    """Decorator to Register a fibad verb"""
+    update_registry(VERB_REGISTRY, cls.cli_name, cls)  # type: ignore[attr-defined]
+    return cls
+
+
+def all_verbs() -> list[str]:
+    """Returns All verbs that are currently registered"""
+    return [verb for verb in VERB_REGISTRY]
+
+
+def all_class_verbs() -> list[str]:
+    """Returns All verbs that are currently registered with a class-based implementation"""
+    return [verb for verb in VERB_REGISTRY if VERB_REGISTRY.get(verb) is not None]
+
+
+def is_verb_class(cli_name: str) -> bool:
+    """Returns true if the verb has a class based implementation
+
+    Parameters
+    ----------
+    cli_name : str
+        The name of the verb on the command line interface
+
+    Returns
+    -------
+    bool
+        True if the verb has a class-based implementation
+    """
+    return cli_name in VERB_REGISTRY and VERB_REGISTRY.get(cli_name) is not None
+
+
+def fetch_verb_class(cli_name: str) -> Optional[type[Verb]]:
+    """Gives the class object for the named verb
+
+    Parameters
+    ----------
+    cli_name : str
+        The name of the verb on the command line interface
+
+
+    Returns
+    -------
+    Optional[type[Verb]]
+        The verb class or None if no such verb class exists.
+    """
+    return VERB_REGISTRY.get(cli_name)
diff --git a/src/fibad_cli/main.py b/src/fibad_cli/main.py
index 899ff558..ad9cdb9a 100644
--- a/src/fibad_cli/main.py
+++ b/src/fibad_cli/main.py
@@ -3,11 +3,12 @@
 from importlib.metadata import version
 
 from fibad import Fibad
+from fibad.verbs import all_verbs, fetch_verb_class, is_verb_class
 
 
 def main():
     """Primary entry point for the Fibad CLI. This handles dispatching to the various
-    Fibad actions.
+    Fibad actions and returning a result.
     """
 
     description = "Fibad CLI"
@@ -18,7 +19,28 @@ def main():
     parser.add_argument("--version", dest="version", action="store_true", help="Show version")
     parser.add_argument("-c", "--runtime-config", type=str, help="Full path to runtime config file")
 
-    parser.add_argument("verb", nargs="?", choices=Fibad.verbs, help="Verb to execute")
+    # cut off "usage: " from beginning and "\n" from end so we get an invocation
+    # which subcommand parsers can add to appropriately.
+    subparser_usage_prefix = parser.format_usage()[7:-1]
+    subparsers = parser.add_subparsers(title="Verbs:", required=False)
+
+    # Add a subparser for every verb, (whether defined by function or class)
+    for cli_name in all_verbs():
+        # Function-based verbs pass no extra keyword arguments to add_parser()
+        subparser_kwargs = {}
+
+        if is_verb_class(cli_name):
+            verb_class = fetch_verb_class(cli_name)
+            subparser_kwargs = verb_class.add_parser_kwargs
+
+        verb_parser = subparsers.add_parser(
+            cli_name, prog=subparser_usage_prefix + " " + cli_name, **subparser_kwargs
+        )
+
+        if is_verb_class(cli_name):
+            verb_class.setup_parser(verb_parser)
+
+        verb_parser.set_defaults(verb=cli_name)
 
     args = parser.parse_args()
 
@@ -31,7 +53,14 @@ def main():
         sys.exit(1)
 
     fibad_instance = Fibad(config_file=args.runtime_config)
-    getattr(fibad_instance, args.verb)()
+    retval = 0
+    if is_verb_class(args.verb):
+        verb = fetch_verb_class(args.verb)(fibad_instance.config)
+        retval = verb.run_cli(args)
+    else:
+        getattr(fibad_instance, args.verb)()
+
+    sys.exit(retval)
 
 
 if __name__ == "__main__":