Skip to content

Commit dc15d53

Browse files
authored
Add ability to gather and analyse some model metadata (nod-ai#376)
## Usage: When running `python run.py <other-args> --get-metadata`, this will save a dictionary with the model size and op frequencies to the log directory. After a run, you can use `python utils/find_duplicate_models.py` to save or print a json dump of redundant models. ### Options: - "-s" "--simplified" will only return the list of model names (doesn't include the corresponding metadata). - "-o" "--output" allows specifying the name of a json file you want to save the result to. - "-r" "--rundirectory" allows specifying a different run directory to search, if `run.py` was run with a non-default run directory arg. ## Sample: I saved the tests below to a file called `sample.txt`. ``` add_test model--bart-base-booksum--KamilAin model--bart-base-cnn--ainize model--bart-base-few-shot-k-1024-finetuned-squad-seed-2--anas-awadalla model--bart-base-few-shot-k-1024-finetuned-squad-seed-4--anas-awadalla ``` With a clean `test-run` directory, I ran ```shell python run.py --testsfile=sample.txt --stages "setup" --get-metadata ``` The result of running ```shell python utils/find_duplicate_models.py -s ``` was: ```json [ [ "model--bart-base-booksum--KamilAin", "model--bart-base-cnn--ainize" ], [ "model--bart-base-few-shot-k-1024-finetuned-squad-seed-4--anas-awadalla", "model--bart-base-few-shot-k-1024-finetuned-squad-seed-2--anas-awadalla" ] ] ``` and without the `-s` arg, it includes the metadata for each grouping: ```json [ { "models": [ "model--bart-base-booksum--KamilAin", "model--bart-base-cnn--ainize" ], "shared_metadata": { "model_size": 712772272, "op_frequency": { "Add": 227, "Cast": 13, "Concat": 188, "Constant": 886, "ConstantOfShape": 6, "Div": 44, "Equal": 5, "Erf": 12, "Expand": 5, "Gather": 64, "Less": 1, "MatMul": 133, "Mul": 99, "Pow": 32, "Range": 3, "ReduceMean": 64, "Reshape": 187, "Shape": 67, "Slice": 2, "Softmax": 18, "Sqrt": 32, "Squeeze": 2, "Sub": 35, "Transpose": 90, "Unsqueeze": 325, "Where": 8 } } }, { "models": [ 
"model--bart-base-few-shot-k-1024-finetuned-squad-seed-4--anas-awadalla", "model--bart-base-few-shot-k-1024-finetuned-squad-seed-2--anas-awadalla" ], "shared_metadata": { "model_size": 558176646, "op_frequency": { "Add": 229, "Cast": 17, "Concat": 193, "Constant": 937, "ConstantOfShape": 12, "Div": 44, "Equal": 10, "Erf": 12, "Expand": 11, "Gather": 70, "Less": 1, "MatMul": 133, "Mul": 103, "Pow": 32, "Range": 6, "ReduceMean": 64, "Reshape": 191, "ScatterND": 2, "Shape": 83, "Slice": 7, "Softmax": 18, "Split": 1, "Sqrt": 32, "Squeeze": 4, "Sub": 35, "Transpose": 90, "Unsqueeze": 333, "Where": 13 } } } ] ```
1 parent e7ce1bc commit dc15d53

File tree

4 files changed

+122
-2
lines changed

4 files changed

+122
-2
lines changed

alt_e2eshark/e2e_testing/framework.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,14 @@ def update_opset_version_and_overwrite(self):
128128
og_model, self.opset_version
129129
)
130130
onnx.save(model, self.model)
131+
132+
def get_metadata(self):
    """Gather basic metadata for this model: its on-disk size and ONNX op frequencies.

    Returns:
        dict with keys "model_size" (bytes, from os.path.getsize) and
        "op_frequency" (mapping of op name -> count, from get_op_frequency).
    """
    # NOTE(review): assumes self.model is a path to an existing model file -- confirm.
    return {
        "model_size": os.path.getsize(self.model),
        "op_frequency": get_op_frequency(self.model),
    }
137+
138+
131139

132140
# TODO: extend TestModel to a union, or make TestModel a base class when supporting other frontends
133141
TestModel = OnnxModelInfo
@@ -161,6 +169,7 @@ def benchmark(self, artifact: CompiledOutput, input: TestTensors, repetitions: i
161169
"""returns a float representing inference time in ms"""
162170
pass
163171

172+
164173
class Test(NamedTuple):
165174
"""Used to store the name and TestInfo constructor for a registered test"""
166175

alt_e2eshark/e2e_testing/logging_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def scan_dir_del_not_logs(dir):
7171
for root, dirs, files in os.walk(dir):
7272
for name in files:
7373
curr_file = os.path.join(root, name)
74-
if not name.endswith(".log") and name != "benchmark.json":
74+
if not name.endswith(".log") and not name.endswith(".json"):
7575
removed_files.append(curr_file)
7676
for file in removed_files:
7777
os.remove(file)

alt_e2eshark/run.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ def main(args):
133133
stages,
134134
args.load_inputs,
135135
int(args.cleanup),
136+
args.get_metadata,
136137
)
137138

138139
if args.report:
@@ -142,7 +143,7 @@ def main(args):
142143

143144

144145
def run_tests(
145-
test_list: List[Test], config: TestConfig, parent_log_dir: str, no_artifacts: bool, verbose: bool, stages: List[str], load_inputs: bool, cleanup: int,
146+
test_list: List[Test], config: TestConfig, parent_log_dir: str, no_artifacts: bool, verbose: bool, stages: List[str], load_inputs: bool, cleanup: int, get_metadata=bool,
146147
) -> Dict[str, Dict]:
147148
"""runs tests in test_list based on config. Returns a dictionary containing the test statuses."""
148149
# TODO: multi-process
@@ -190,6 +191,10 @@ def run_tests(
190191
# TODO: Figure out how to factor this out of run.py
191192
if not os.path.exists(inst.model):
192193
inst.construct_model()
194+
if get_metadata:
195+
metadata = inst.get_metadata()
196+
metadata_file = Path(log_dir) / "metadata.json"
197+
save_dict(metadata, metadata_file)
193198

194199
artifact_save_to = None if no_artifacts else log_dir
195200
# generate mlir from the instance using the config
@@ -449,6 +454,12 @@ def _get_argparse():
449454
default="report.md",
450455
help="output filename for the report summary.",
451456
)
457+
parser.add_argument(
458+
"--get-metadata",
459+
action="store_true",
460+
default=False,
461+
help="save some model metadata to log_dir/metadata.json"
462+
)
452463
# parser.add_argument(
453464
# "-d",
454465
# "--todtype",
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from pathlib import Path
2+
import argparse
3+
from typing import Union, Dict, Any, Optional
4+
import json
5+
import io
6+
7+
# Project root: the parent of the directory containing this script.
ROOT = Path(__file__).parents[1]
8+
9+
10+
class HashableDict(dict):
    """A dict subclass that is hashable, used to invert a dictionary with dict values.

    The hash is derived from the sorted (key, value) pairs, so two instances with
    equal contents hash equally regardless of insertion order. Every value stored
    in the dict must itself be hashable.
    """

    def __hash__(self):
        ordered_items = sorted(self.items())
        return hash(tuple(ordered_items))
15+
16+
17+
def load_json_dict(filepath: Union[str, Path]) -> Dict[str, Any]:
    """Read *filepath* and return its parsed JSON content."""
    source = Path(filepath)
    with source.open() as handle:
        return json.load(handle)
21+
22+
23+
def save_to_json(jsonable_object, name_json: Optional[str] = None):
    """Serialize *jsonable_object* to pretty-printed JSON.

    If *name_json* is falsy, the JSON text is printed to stdout and nothing is
    saved. Otherwise the text is written to ``ROOT / "<stem>.json"``, where
    <stem> is the filename (without extension) taken from *name_json*; any
    directory component of *name_json* is ignored.
    """
    dict_str = json.dumps(
        jsonable_object,
        indent=4,
        sort_keys=True,
        separators=(",", ": "),
        ensure_ascii=False,
    )
    if not name_json:
        print(dict_str)
        return
    # BUG FIX: name_json arrives from argparse as a plain str, and str has no
    # .stem attribute -- wrap it in Path before taking the stem.
    path_json = ROOT / f"{Path(name_json).stem}.json"
    with io.open(path_json, "w", encoding="utf8") as outfile:
        outfile.write(dict_str)
38+
39+
40+
def get_groupings(metadata_dicts: Dict[str, Dict]) -> Dict:
    """Return a multi-valued inverse of *metadata_dicts*.

    Maps each distinct metadata dict (as a HashableDict) to the list of model
    names sharing that exact metadata.

    NOTE: mutates the input in place -- each value's "op_frequency" entry is
    replaced with a HashableDict so the whole value can be hashed.
    """
    groupings: Dict = dict()
    for key, value in metadata_dicts.items():
        value["op_frequency"] = HashableDict(value["op_frequency"])
        hashable = HashableDict(value)
        # setdefault replaces the original two-lookup
        # `if hashable in groupings.keys()` pattern with a single lookup.
        groupings.setdefault(hashable, []).append(key)
    return groupings
51+
52+
53+
def main(args):
    """Scan the run directory for metadata.json files and report duplicate models."""
    run_dir = ROOT / args.rundirectory
    # One metadata dict per test, keyed by the test (directory) name.
    metadata_dicts = {
        candidate.parent.name: load_json_dict(candidate)
        for candidate in run_dir.glob("*/*.json")
        if candidate.name == "metadata.json"
    }

    groupings = get_groupings(metadata_dicts)
    found_redundancies = []
    for shared_metadata, model_names in groupings.items():
        if len(model_names) <= 1:
            continue
        if args.simplified:
            found_redundancies.append(model_names)
        else:
            found_redundancies.append(
                {"models": model_names, "shared_metadata": shared_metadata}
            )
    save_to_json(found_redundancies, args.output)
69+
70+
71+
def _get_argparse():
72+
msg = "After running run.py with the flag --get-metadata, use this tool to find duplicate models."
73+
parser = argparse.ArgumentParser(
74+
prog="find_duplicate_models.py", description=msg, epilog=""
75+
)
76+
77+
parser.add_argument(
78+
"-r",
79+
"--rundirectory",
80+
default="test-run",
81+
help="The directory containing run.py results",
82+
)
83+
parser.add_argument(
84+
"-o",
85+
"--output",
86+
help="specify an output json file",
87+
)
88+
parser.add_argument(
89+
"-s",
90+
"--simplified",
91+
action="store_true",
92+
default=False,
93+
help="pass this arg to only print redundant model lists, without the corresponding metadata.",
94+
)
95+
return parser
96+
97+
98+
if __name__ == "__main__":
    # Entry point: parse CLI args and run the duplicate-model scan.
    main(_get_argparse().parse_args())

0 commit comments

Comments
 (0)