Skip to content

Commit

Permalink
mlperf
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Jun 8, 2023
1 parent b5539ea commit 42eab04
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 9 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,19 @@ jobs:
MILABENCH_DASH: "no"

steps:
- uses: actions/checkout@v3

- uses: conda-incubator/setup-miniconda@v2
with:
auto-activate-base: false
python-version: 3.9
miniconda-version: "latest"
activate-environment: test

- name: clean
run: |
python -c "import shutil; shutil.rmtree('.')"
- uses: actions/checkout@v3

- name: Pytorch Sanity
run: |
if [[ "${{ matrix.arch }}" == "rocm" ]]; then
Expand Down
7 changes: 7 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[submodule "benchmarks/mlperf/apex"]
path = benchmarks/mlperf/apex
url = https://github.com/NVIDIA/apex.git

[submodule "benchmarks/mlperf/training_results_v2.1"]
path = benchmarks/mlperf/training_results_v2.1
url = https://github.com/mlcommons/training_results_v2.1.git
12 changes: 6 additions & 6 deletions benchmarks/huggingface/bench/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .synth import SyntheticData, generators



def is_tf32_allowed(args):
return "tf32" in args.precision

Expand Down Expand Up @@ -56,11 +57,12 @@ def __init__(self, args):

example = next(iter(self.loader))
example = {k: x.to(self.device) for k, x in example.items()}

# print({k: x.shape for k, x in example.items()})


model = ModelWrapper(info.model).to(self.device)
model = torch.jit.trace(model, example)

jit = False
if jit:
model = torch.jit.trace(model, example)

self.model = model
self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
Expand All @@ -87,8 +89,6 @@ def train(self):
"train", self.loader, report_batch=True, batch_size=self.batch_size
):
data = {k: v.to(self.device) for k, v in data.items()}

template = {k: (v.shape, v.dtype) for k, v in data.items()}
self.step(data)


Expand Down
1 change: 1 addition & 0 deletions benchmarks/mlperf/apex
Submodule apex added at 05091d
10 changes: 10 additions & 0 deletions benchmarks/mlperf/benchfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from milabench.pack import Package


class MLPerfBenchmark(Package):
    """Milabench package definition for the MLPerf benchmark.

    Declares where the benchmark's dependencies and entry point live;
    milabench's `Package` machinery reads these class attributes to
    install requirements and launch the benchmark.
    """

    # Pip-style requirements file installed into the benchmark's venv.
    base_requirements = "requirements.in"
    # Script executed (relative to this benchmark's directory) when the
    # benchmark runs.
    main_script = "main.py"


# Entry point milabench looks up when loading this benchfile.
__pack__ = MLPerfBenchmark

13 changes: 13 additions & 0 deletions benchmarks/mlperf/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

import sys
import os


FOLDER = os.path.dirname(__file__)
BENCH = "training_results_v2.1/NVIDIA/benchmarks/bert/implementations/pytorch-preview"

print(sys.path)
sys.path.append(os.path.join(FOLDER, BENCH))
print(sys.path)

import run_squad
4 changes: 4 additions & 0 deletions benchmarks/mlperf/requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
git+https://github.com/NVIDIA/mlperf-common.git
git+https://github.com/NVIDIA/apex.git
git+https://github.com/mlcommons/logging.git
boto3
1 change: 1 addition & 0 deletions benchmarks/mlperf/training_results_v2.1
Submodule training_results_v2.1 added at 158189
12 changes: 11 additions & 1 deletion config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ _defaults:
gpu_load_threshold: 0.5
gpu_mem_threshold: 0.5


# MLPerf benchmark entry (see ../benchmarks/mlperf).
mlperf:
  inherits: _defaults
  # Location of the benchmark's benchfile/package definition.
  definition: ../benchmarks/mlperf
  group: mlperf
  # Shares the "torch" install group with the other torch benchmarks.
  install_group: torch
  plan:
    # per_gpu: presumably one benchmark process per visible GPU —
    # confirm against milabench's plan documentation.
    method: per_gpu

_torchvision:
inherits: _defaults
definition: ../benchmarks/torchvision
Expand Down Expand Up @@ -174,7 +184,7 @@ _bert-base:
- precision-showcase
argv:
--model: "Bert"
--batch-size: 32
--batch-size: 48
voir:
options:
stop: 30
Expand Down

0 comments on commit 42eab04

Please sign in to comment.