
Commit 53db8ff

[Frontend] Fix tutorial PyTorch test (#307)
1 parent bc051cc commit 53db8ff

14 files changed: +213 -64 lines

.github/workflows/config.yml (+21 -2)

@@ -39,12 +39,31 @@ jobs:
       run: |
         source activate allo
         bash scripts/lint/task_lint.sh
-    - name: Testing
+    - name: Unit tests
       shell: bash
       run: |
         source activate allo
         export PATH=/root/llvm-project/build/bin:${PATH}
         export LLVM_BUILD_DIR=/root/llvm-project/build
         python3 -m pytest tests -v
+    - name: Tutorial
+      shell: bash
+      run: |
+        source activate allo
+        export LLVM_BUILD_DIR=/root/llvm-project/build
         python3 -m pytest tutorials -v
-        python3 -m pytest examples/polybench -v
+    - name: Benchmark
+      shell: bash
+      run: |
+        source activate allo
+        export LLVM_BUILD_DIR=/root/llvm-project/build
+        python3 -m pytest examples/polybench -v
+# no left space!
+# - name: PyTorch
+#   shell: bash
+#   run: |
+#     source activate allo
+#     export LLVM_BUILD_DIR=/root/llvm-project/build
+#     python3 -m pip install torch==2.5.1
+#     python3 examples/torch/toy.py
+#     python3 examples/torch/mlp.py

allo/backend/llvm.py (+1 -1)

@@ -1,6 +1,6 @@
 # Copyright Allo authors. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
-# pylint: disable=no-name-in-module, inconsistent-return-statements, too-many-function-args
+# pylint: disable=no-name-in-module, inconsistent-return-statements
 
 import os
 import ctypes

allo/frontend/pytorch.py (+54 -32)

@@ -1,8 +1,7 @@
 # Copyright Allo authors. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
-# pylint: disable=too-many-public-methods
+# pylint: disable=too-many-public-methods, too-many-instance-attributes
 
-import re
 import operator
 import inspect
 import math
@@ -23,11 +22,6 @@
 from ..customize import customize
 from ..ir.types import float32
 
-compose_mapping = {
-    "linear": nn.linear,
-    "relu": nn.relu,
-}
-
 
 def from_pytorch(
     model,
@@ -77,8 +71,7 @@ def from_pytorch(
     s = customize(code, global_vars=global_vars, enable_tensor=enable_tensor)
     # composition
     for func, idx, inst in builder.composition:
-        if func in compose_mapping:
-            s.compose(compose_mapping[func], id=idx, instantiate=inst)
+        s.compose(getattr(nn, func), id=idx, instantiate=inst)
     if verbose:
         print(s.module)
     if target == "mlir":
@@ -104,6 +97,7 @@ def __init__(self, gm, example_inputs, leaf_modules=None):
         self.subfunctions = []
         self.output = []
         self.composition = []
+        self.unique_id = {}
 
     def build(self):
         for node in self.gm.graph.nodes:
@@ -155,6 +149,13 @@ def __call__(self, node):
         self.code.append(ret)
         return ret
 
+    def get_unique_id(self, name):
+        if name not in self.unique_id:
+            self.unique_id[name] = 0
+            return 0
+        self.unique_id[name] += 1
+        return self.unique_id[name]
+
     def get_module(self, name):
         return dict(self.gm.named_modules())[name]
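
A minimal standalone sketch of how this new counter behaves (illustrative only; a plain dict stands in for self.unique_id): the first occurrence of an op name gets id 0, later occurrences 1, 2, and so on. Each id is then passed to s.compose(..., id=idx, ...) so repeated layers of the same kind get distinct kernel instances.

ids = {}

def get_unique_id(name):
    # first use of a name -> 0, later uses -> 1, 2, ...
    if name not in ids:
        ids[name] = 0
        return 0
    ids[name] += 1
    return ids[name]

assert [get_unique_id("linear"), get_unique_id("linear"), get_unique_id("relu")] == [0, 1, 0]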

@@ -180,8 +181,13 @@ def build_call_module(self, node):
             raise NotImplementedError("Unsupported module")
         if op == "linear":
             bias = True if module.bias is not None else None
-            return getattr(self, "build_linear")(node, bias)
-        return getattr(self, f"build_{op}")(node)
+            res = getattr(self, "build_linear")(node, bias)
+        else:
+            res = getattr(self, f"build_{op}")(node)
+        # append shape after the operation
+        if "tensor_meta" in node.meta:
+            res += f' # shape: {str(tuple(node.meta["tensor_meta"].shape))}'
+        return res
 
     def build_call_function(self, node):
         opcls = {
@@ -203,11 +209,12 @@ def build_call_function(self, node):
             torch.cat: "concat",
         }.get(node.target)
         # Only nodes with shape need to be built.
-        return (
-            getattr(self, f"build_{opcls}")(node)
-            if "tensor_meta" in node.meta
-            else None
-        )
+        if "tensor_meta" in node.meta:
+            res = getattr(self, f"build_{opcls}")(node)
+            # append shape after the operation
+            res += f' # shape: {str(tuple(node.meta["tensor_meta"].shape))}'
+            return res
+        return None
 
     def build_call_method(self, node):
         if node.target == "contiguous":
@@ -298,29 +305,44 @@ def build_softmax(self, node):
 
     def build_relu(self, node):
         inp = get_var_name(node.args[0])
-        bs, n = tuple(node.meta["tensor_meta"].shape)
-        match = re.search(r"\d+$", str(node.target).replace(".", "_"))
-        self.composition.append(
-            ("relu", match.group() if match else None, [float32, bs, n])
-        )
-        return f"{node.name} = nn.relu[float32, {bs}, {n}]({inp})"
+        shape = tuple(node.meta["tensor_meta"].shape)
+        name_id = self.get_unique_id("relu")
+        if len(shape) == 2:
+            n, d = shape
+            self.composition.append(("relu2d", name_id, [float32, n, d]))
+            return f'{node.name} = nn.relu2d[float32, {n}, {d}, "{name_id}"]({inp})'
+        if len(shape) == 4:
+            n, c, h, w = shape
+            self.composition.append(("relu4d", name_id, [float32, n, c, h, w]))
+            return f'{node.name} = nn.relu4d[float32, {n}, {c}, {h}, {w}, "{name_id}"]({inp})'
+        raise NotImplementedError("Unsupported shape for relu")
 
     def build_linear(self, node, bias):
         target_name = node.target.replace(".", "_")
         inp = get_var_name(node.args[0])
         weight = get_var_name(target_name + "_weight")
         if bias:
             bias = get_var_name(target_name + "_bias")
-        # output shape: bs * n
-        bs, n = tuple(node.meta["tensor_meta"].shape)
-        _, m = self.named_params[f"{str(node.target)}.weight"].shape
-        match = re.search(r"\d+$", target_name)
-        name = f', "{match.group()}"' if match else ""
-        # bs*m x (n*m)^T + (n*1) = bs*n
-        self.composition.append(
-            ("linear", match.group() if match else None, [float32, bs, n, m])
-        )
-        return f"{node.name} = nn.linear[float32, {bs}, {n}, {m}{name}]({inp}, {weight}, {bias})"
+        shape = tuple(node.meta["tensor_meta"].shape)
+        name_id = self.get_unique_id("linear")
+        if len(shape) == 2:
+            n, d = shape
+            _, m = self.named_params[f"{str(node.target)}.weight"].shape
+            # n*m x (m*d)^T + (n*1) = n*d
+            self.composition.append(("linear2d", name_id, [float32, n, d, m]))
+            return f'{node.name} = nn.linear2d[float32, {n}, {d}, {m}, "{name_id}"]({inp}, {weight}, {bias})'
+        if len(shape) == 3:
+            bs, l, m = shape
+            _, d = self.named_params[f"{str(node.target)}.weight"].shape
+            self.composition.append(
+                (
+                    "linear3d",
+                    name_id,
+                    [float32, bs, l, d, m],
+                )
+            )
+            return f'{node.name} = nn.linear3d[float32, {bs}, {l}, {d}, {m}, "{name_id}"]({inp}, {weight}, {bias})'
+        raise NotImplementedError("Unsupported shape for linear")
         return f"{node.name} = dsl.linear({inp}, {weight})"
 
     def build_gelu(self, node):
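
Taken together, the frontend now emits shape-specialized kernels (nn.linear2d / nn.relu2d for 2-D activations, nn.linear3d / nn.relu4d for 3-D and 4-D ones), each tagged with a unique instance id. A minimal usage sketch follows; the TinyMLP model and tensor sizes are made up, it assumes ReLU modules are traced through build_relu as above, and the Allo line quoted in the comment is only indicative.

import numpy as np
import torch
import torch.nn as torch_nn

import allo


class TinyMLP(torch_nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch_nn.Linear(8, 16)
        self.relu = torch_nn.ReLU()
        self.fc2 = torch_nn.Linear(16, 4)

    def forward(self, x):
        # (batch, features) tensors -> lowered to nn.linear2d / nn.relu2d
        return self.fc2(self.relu(self.fc1(x)))


model = TinyMLP().eval()
example_inputs = [torch.randn(2, 8)]
# verbose=True prints the generated Allo code, roughly:
#   fc1 = nn.linear2d[float32, 2, 16, 8, "0"](x, fc1_weight, fc1_bias)  # shape: (2, 16)
llvm_mod = allo.frontend.from_pytorch(model, example_inputs=example_inputs, verbose=True)
golden = model(*example_inputs)
res = llvm_mod(*[x.detach().numpy() for x in example_inputs])
np.testing.assert_allclose(res, golden.detach().numpy(), atol=1e-3)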

allo/library/__init__.py (+12 -6)

@@ -16,10 +16,14 @@
 )
 
 from .nn import (
-    linear,
-    schedule_linear,
-    relu,
-    schedule_relu,
+    linear2d,
+    linear3d,
+    schedule_linear2d,
+    schedule_linear3d,
+    relu2d,
+    relu4d,
+    schedule_relu2d,
+    schedule_relu4d,
     softmax,
     schedule_softmax,
     layer_norm,
@@ -42,8 +46,10 @@
 
 KERNEL2SCHEDULE.update(
     {
-        linear: schedule_linear,
-        relu: schedule_relu,
+        linear2d: schedule_linear2d,
+        linear3d: schedule_linear3d,
+        relu2d: schedule_relu2d,
+        relu4d: schedule_relu4d,
         softmax: schedule_softmax,
         layer_norm: schedule_layernorm,
         GeLU: schedule_gelu,

allo/library/nn.py (+50 -11)

@@ -6,7 +6,7 @@
 from .systolic import systolic
 
 
-def linear[Ty, M, N, K](X: "Ty[M, K]", W: "Ty[N, K]", b: "Ty[N]") -> "Ty[M, N]":
+def linear2d[Ty, M, N, K](X: "Ty[M, K]", W: "Ty[N, K]", b: "Ty[N]") -> "Ty[M, N]":
     # https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
     Z: Ty[M, N]
     buf: Ty[N]
@@ -23,22 +23,61 @@ def linear[Ty, M, N, K](X: "Ty[M, K]", W: "Ty[N, K]", b: "Ty[N]") -> "Ty[M, N]":
     return Z
 
 
-def schedule_linear(s):
-    s.pipeline("linear:j")
-    s.pipeline("linear:j_init")
-    s.pipeline("linear:j_back")
+def schedule_linear2d(s):
+    s.pipeline("linear2d:j")
+    s.pipeline("linear2d:j_init")
+    s.pipeline("linear2d:j_back")
     return s
 
 
-def relu[Ty, L, D](X: "Ty[L, D]") -> "Ty[L, D]":
-    Z: Ty[L, D]
-    for i, j in dsl.grid(L, D):
-        Z[i, j] = max(0.0, X[i, j])
+def linear3d[
+    Ty, B, L, D, M
+](X: "Ty[B, L, D]", W: "Ty[M, D]", bias: "Ty[M]") -> "Ty[B, L, M]":
+    # https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
+    Z: Ty[B, L, M]
+    buf: Ty[M]
+    for b in range(B):
+        for i in range(L):
+            for j_init in range(M):
+                buf[j_init] = 0
+            for k in range(D):
+                # reorder reduction loop outside, and pipeline
+                x: Ty = X[b, i, k]
+                for j in range(M):
+                    buf[j] += x * W[j, k]
+            for j_back in range(M):
+                Z[b, i, j_back] = buf[j_back] + bias[j_back]
+    return Z
+
+
+def schedule_linear3d(s):
+    s.pipeline("linear3d:j")
+    s.pipeline("linear3d:j_init")
+    s.pipeline("linear3d:j_back")
+    return s
+
+
+def relu2d[Ty, H, W](X: "Ty[H, W]") -> "Ty[H, W]":
+    Z: Ty[H, W]
+    for h, w in dsl.grid(H, W):
+        Z[h, w] = max(0.0, X[h, w])
+    return Z
+
+
+def schedule_relu2d(s):
+    s.pipeline("relu2d:w")
+    return s
+
+
+def relu4d[Ty, N, C, H, W](X: "Ty[N, C, H, W]") -> "Ty[N, C, H, W]":
+    Z: Ty[N, C, H, W]
+    for n, c, h, w in dsl.grid(N, C, H, W):
+        Z[n, c, h, w] = max(0.0, X[n, c, h, w])
     return Z
 
 
-def schedule_relu(s):
-    s.pipeline("relu:j")
+def schedule_relu4d(s):
+    s.pipeline("relu4d:w")
     return s
 
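
For reference, the new linear3d kernel has the same semantics as applying a PyTorch Linear layer to a batched 3-D input; a NumPy sketch of what it computes (shapes chosen arbitrarily for illustration):

import numpy as np

B, L, D, M = 2, 3, 4, 5                        # batch, sequence length, in-features, out-features
X = np.random.rand(B, L, D).astype(np.float32)
W = np.random.rand(M, D).astype(np.float32)    # weight stored as (out_features, in_features)
bias = np.random.rand(M).astype(np.float32)

# Z[b, i, j] = sum_k X[b, i, k] * W[j, k] + bias[j], i.e. X @ W.T + bias
Z = X @ W.T + bias
assert Z.shape == (B, L, M)
# relu2d / relu4d are elementwise max(0, x), i.e. np.maximum(X, 0.0) on 2-D / 4-D arrays.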

File renamed without changes.
File renamed without changes.

examples/torch/gpt2.py (+17 -6)

@@ -108,11 +108,17 @@ def forward(self, x):
         return output
 
 
-vocab_size = 50257
-n_embd = 768
-n_head = 12
-n_layers = 12
-n_seq = 1024
+# Large size
+# vocab_size = 50257
+# n_embd = 768
+# n_head = 12
+# n_layers = 12
+# n_seq = 1024
+vocab_size = 2
+n_embd = 4
+n_head = 2
+n_layers = 1
+n_seq = 4
 batch_size = 2
 
 module = GPT2(vocab_size, n_embd, n_head, n_layers).eval()
@@ -121,10 +127,15 @@ def forward(self, x):
 llvm_mod = allo.frontend.from_pytorch(
     module,
     example_inputs=example_inputs,
-    verbose=False,
+    verbose=True,
 )
 
 golden = module(*example_inputs)
 np_inputs = [x.detach().numpy() for x in example_inputs]
 res = llvm_mod(*np_inputs)
 np.testing.assert_allclose(res, golden.detach().numpy(), atol=1e-3)
+print("Test passed!")
+
+# generate HLS module
+mod = allo.frontend.from_pytorch(module, example_inputs=example_inputs, target="vhls")
+print(mod.hls_code)

mlir/include/allo/Dialect/Visitor.h (+2 -1)

@@ -43,7 +43,7 @@ class HLSCppVisitorBase {
           // Memref-related statements.
           memref::AllocOp, memref::AllocaOp, memref::LoadOp, memref::StoreOp,
           memref::GetGlobalOp, allo::GetGlobalFixedOp, memref::GlobalOp,
-          memref::DeallocOp, memref::DmaStartOp, memref::DmaWaitOp,
+          memref::DeallocOp, memref::DmaStartOp, memref::DmaWaitOp, memref::ReshapeOp,
          memref::ViewOp, memref::SubViewOp, memref::ReinterpretCastOp,
           memref::AtomicRMWOp,
           // Tensor-related statements.
@@ -144,6 +144,7 @@ class HLSCppVisitorBase {
   HANDLE(memref::ViewOp);
   HANDLE(memref::SubViewOp);
   HANDLE(memref::ReinterpretCastOp);
+  HANDLE(memref::ReshapeOp);
 
   // Tensor-related statements.
   HANDLE(tensor::ExtractOp);
