EricLBuehler · EricLBuehler · Apr 24, 2025 · Apr 23, 2025 · Apr 24, 2025 · Apr 24, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/constensor-core/Cargo.toml b/constensor-core/Cargo.toml
@@ -16,6 +16,8 @@ half = { workspace = true, optional = true }
 dirs = "5.0.1"
 rayon = "1.10.0"
 petgraph.workspace = true
+gemm = "0.18"
+num_cpus = "1.16.0"
 
 [features]
 default = ["half", "bfloat"]

diff --git a/constensor-core/examples/hello_world/main.rs b/constensor-core/examples/hello_world/main.rs
@@ -4,16 +4,18 @@ fn main() {
     let mut graph: Graph<f32> = Graph::empty();
     let arange = GraphTensor::<R1<10>, f32, Cpu>::arange(&mut graph, 0., 1.);
     dbg!(&arange.to_tensor().unwrap().data());
-    let x = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 1.0);
-    let y = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
-    let z = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
-    let res = y * x + z;
+    let a = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 1.0);
+    let b = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
+    let c = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 3.0);
+    let d = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 4.0);
+    let res = a * b + c;
+    let res = res + d;
 
     graph.optimize();
 
     graph.visualize("graph.png").unwrap();
 
     let tensor: Tensor<R2<3, 4>, f32, Cpu> = res.to_tensor().unwrap();
 
-    assert_eq!(tensor.data().unwrap().to_vec(), vec![vec![4.0; 4]; 3],);
+    assert_eq!(tensor.data().unwrap().to_vec(), vec![vec![9.0; 4]; 3],);
 }
diff --git a/constensor-core/examples/matmul/main.rs b/constensor-core/examples/matmul/main.rs
@@ -0,0 +1,41 @@
+use constensor_core::{Cpu, DType, Graph, GraphTensor, R3};
+use std::time::Instant;
+
+fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usize>(
+    type_name: &str,
+    alpha: T,
+    beta: T,
+) {
+    // Number of times to run the matmul for averaging
+    let iterations = 1000;
+    let mut total = std::time::Duration::new(0, 0);
+
+    for _ in 0..iterations {
+        let start = Instant::now();
+
+        let mut graph = Graph::empty();
+        let a = GraphTensor::<R3<B, M, K>, T, Cpu>::ones(&mut graph);
+        let b = GraphTensor::<R3<B, K, N>, T, Cpu>::ones(&mut graph);
+        let o = GraphTensor::<R3<B, M, N>, T, Cpu>::ones(&mut graph);
+        let c = a.matmul_axpby(b, o, alpha, beta);
+
+        graph.optimize();
+
+        let _tensor = std::hint::black_box(c.to_tensor().unwrap());
+
+        total += start.elapsed();
+    }
+
+    let avg = total / (iterations as u32);
+    println!("Average execution time for {type_name} over {iterations} iterations: {avg:?}");
+}
+
+fn main() {
+    const B: usize = 1;
+    const M: usize = 128;
+    const N: usize = 128;
+    const K: usize = 128;
+
+    bench::<f32, B, M, K, N>("f32", 1.0, 1.0);
+    bench::<i32, B, M, K, N>("i32", 1, 1);
+}
diff --git a/constensor-core/src/cpu_storage/mod.rs b/constensor-core/src/cpu_storage/mod.rs
@@ -8,9 +8,8 @@ use pool::{BufferPool, PooledBuffer};
 use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};
 
 use crate::{
-    graph::GraphTensorId,
     storage::{BackendDevice, BackendStorage},
-    DType, Op, Result, Shape,
+    DType, GraphNode, Op, Result,
 };
 
 mod pool;
@@ -30,10 +29,7 @@ impl<T: DType> BackendStorage<T> for CpuStorage<T> {
 impl BackendDevice for CpuDevice {
     type Storage<X: DType> = CpuStorage<X>;
 
-    fn compile_and_run_graph<S: Shape, T: DType>(
-        &self,
-        graph: &[Op<T>],
-    ) -> Result<Self::Storage<T>> {
+    fn compile_and_run_graph<T: DType>(&self, graph: &[GraphNode<T>]) -> Result<Self::Storage<T>> {
         {
             // Create a shared buffer pool
             let pool = Rc::new(RefCell::new(BufferPool::<T>::new()));
@@ -45,27 +41,22 @@ impl BackendDevice for CpuDevice {
             }
 
             for (idx, node) in graph.iter().enumerate() {
-                match node {
-                    Op::BinaryOp { l_id, r_id, .. } | Op::InplaceBinaryOp { l_id, r_id, .. } => {
-                        let l_idx = <&GraphTensorId as Into<usize>>::into(l_id);
-                        let r_idx = <&GraphTensorId as Into<usize>>::into(r_id);
-                        dep_graph.add_edge(l_idx, idx, ());
-                        dep_graph.add_edge(r_idx, idx, ());
+                match &node.op {
+                    Op::BinaryOp { l_id, r_id, .. } => {
+                        dep_graph.add_edge(l_id.get(), idx, ());
+                        dep_graph.add_edge(r_id.get(), idx, ());
                     }
                     Op::UnaryOp { v_id, .. } => {
-                        let v_idx = <&GraphTensorId as Into<usize>>::into(v_id);
-                        dep_graph.add_edge(v_idx, idx, ());
+                        dep_graph.add_edge(v_id.get(), idx, ());
                     }
-                    Op::FusedMulAdd { a_id, b_id, c_id }
-                    | Op::InplaceFusedMulAdd {
-                        a_id, b_id, c_id, ..
-                    } => {
-                        let a_idx = <&GraphTensorId as Into<usize>>::into(a_id);
-                        let b_idx = <&GraphTensorId as Into<usize>>::into(b_id);
-                        let c_idx = <&GraphTensorId as Into<usize>>::into(c_id);
-                        dep_graph.add_edge(a_idx, idx, ());
-                        dep_graph.add_edge(b_idx, idx, ());
-                        dep_graph.add_edge(c_idx, idx, ());
+                    Op::FusedMulAdd { a_id, b_id, c_id } => {
+                        dep_graph.add_edge(a_id.get(), idx, ());
+                        dep_graph.add_edge(b_id.get(), idx, ());
+                        dep_graph.add_edge(c_id.get(), idx, ());
+                    }
+                    Op::MatMul { l_id, r_id, .. } => {
+                        dep_graph.add_edge(l_id.get(), idx, ());
+                        dep_graph.add_edge(r_id.get(), idx, ());
                     }
                     // NoOp and Fill/Arange don’t create incoming edges
                     Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
@@ -82,50 +73,41 @@ impl BackendDevice for CpuDevice {
             // Evaluate nodes in topological order
             for idx in order {
                 let op = &graph[idx];
-                let computed = match op {
+
+                let out_shape = &op.shape;
+                let out_elem_count: usize = out_shape.iter().product();
+
+                let computed = match &op.op {
                     Op::BinaryOp {
                         l_id,
                         r_id,
                         operator,
                     } => {
-                        let l_idx = <&GraphTensorId as Into<usize>>::into(l_id);
-                        let r_idx = <&GraphTensorId as Into<usize>>::into(r_id);
-                        let l_buf = results[l_idx].as_ref().unwrap();
-                        let r_buf = results[r_idx].as_ref().unwrap();
-                        let mut out = pool.borrow_mut().get_buffer(S::element_count());
-                        T::binary_simd_op(l_buf, r_buf, &mut out, *operator);
-                        PooledBuffer::new(out, pool.clone())
-                    }
-                    Op::InplaceBinaryOp {
-                        out,
-                        l_id,
-                        r_id,
-                        operator,
-                    } => {
-                        let l_idx = <&GraphTensorId as Into<usize>>::into(l_id);
-                        let r_idx = <&GraphTensorId as Into<usize>>::into(r_id);
-                        let o_idx = <&GraphTensorId as Into<usize>>::into(out);
-                        if o_idx == l_idx {
-                            let mut l_buf = results[l_idx].take().unwrap();
-                            let r_buf = results[r_idx].as_ref().unwrap();
+                        if l_id.is_inplace() {
+                            let mut l_buf = results[l_id.get()].take().unwrap();
+                            let r_buf = results[r_id.get()].as_ref().unwrap();
                             T::binary_simd_op_inplace_lhs(&mut l_buf, r_buf, *operator);
                             l_buf
-                        } else if o_idx == r_idx {
-                            let mut r_buf = results[r_idx].take().unwrap();
-                            let l_buf = results[l_idx].as_ref().unwrap();
+                        } else if r_id.is_inplace() {
+                            let mut r_buf = results[r_id.get()].take().unwrap();
+                            let l_buf = results[l_id.get()].as_ref().unwrap();
                             T::binary_simd_op_inplace_rhs(l_buf, &mut r_buf, *operator);
                             r_buf
                         } else {
-                            unreachable!()
+                            let l_buf = results[l_id.get()].as_ref().unwrap();
+                            let r_buf = results[r_id.get()].as_ref().unwrap();
+                            let mut out = pool.borrow_mut().get_buffer(out_elem_count);
+                            T::binary_simd_op(l_buf, r_buf, &mut out, *operator);
+                            PooledBuffer::new(out, pool.clone())
                         }
                     }
                     Op::Fill { v } => {
-                        let mut buf = pool.borrow_mut().get_empty_buffer(S::element_count());
-                        buf.extend(std::iter::repeat_n(*v, S::element_count()));
+                        let mut buf = pool.borrow_mut().get_empty_buffer(out_elem_count);
+                        buf.extend(std::iter::repeat_n(*v, out_elem_count));
                         PooledBuffer::new(buf, pool.clone())
                     }
                     Op::Arange { start, step, stop } => {
-                        let mut buf = pool.borrow_mut().get_empty_buffer(S::element_count());
+                        let mut buf = pool.borrow_mut().get_empty_buffer(out_elem_count);
                         let mut x = start.to_f64();
                         while x < stop.to_f64() {
                             buf.push(T::from_f64(x));
@@ -134,63 +116,80 @@ impl BackendDevice for CpuDevice {
                         PooledBuffer::new(buf, pool.clone())
                     }
                     Op::UnaryOp { v_id, operator } => {
-                        let v_idx = <&GraphTensorId as Into<usize>>::into(v_id);
-                        let buf = results[v_idx].as_ref().unwrap();
+                        let buf = results[v_id.get()].as_ref().unwrap();
                         let op_fn = operator.to_closure();
-                        let mut out = pool.borrow_mut().get_buffer(S::element_count());
+                        let mut out = pool.borrow_mut().get_buffer(out_elem_count);
                         out.par_iter_mut()
                             .zip(&**buf)
                             .for_each(|(out, x): (&mut T, &T)| *out = op_fn(*x));
                         PooledBuffer::new(out, pool.clone())
                     }
                     Op::FusedMulAdd { a_id, b_id, c_id } => {
-                        let a_idx = <&GraphTensorId as Into<usize>>::into(a_id);
-                        let b_idx = <&GraphTensorId as Into<usize>>::into(b_id);
-                        let c_idx = <&GraphTensorId as Into<usize>>::into(c_id);
-                        let a_buf = results[a_idx].as_ref().unwrap();
-                        let b_buf = results[b_idx].as_ref().unwrap();
-                        let c_buf = results[c_idx].as_ref().unwrap();
-
-                        let mut out = pool.borrow_mut().get_buffer(S::element_count());
-                        T::fma_op(a_buf, b_buf, c_buf, &mut out);
-                        PooledBuffer::new(out, pool.clone())
-                    }
-                    Op::InplaceFusedMulAdd {
-                        a_id,
-                        b_id,
-                        c_id,
-                        out,
-                    } => {
-                        let a_idx = <&GraphTensorId as Into<usize>>::into(a_id);
-                        let b_idx = <&GraphTensorId as Into<usize>>::into(b_id);
-                        let c_idx = <&GraphTensorId as Into<usize>>::into(c_id);
-                        let o_idx = <&GraphTensorId as Into<usize>>::into(out);
-
-                        if o_idx == a_idx {
-                            let mut a_buf = results[a_idx].take().unwrap();
-                            let b_buf = results[b_idx].as_ref().unwrap();
-                            let c_buf = results[c_idx].as_ref().unwrap();
+                        if a_id.is_inplace() {
+                            let mut a_buf = results[a_id.get()].take().unwrap();
+                            let b_buf = results[b_id.get()].as_ref().unwrap();
+                            let c_buf = results[c_id.get()].as_ref().unwrap();
 
                             T::fma_op_inplace_a(&mut a_buf, b_buf, c_buf);
                             a_buf
-                        } else if o_idx == b_idx {
-                            let mut b_buf = results[b_idx].take().unwrap();
-                            let a_buf = results[a_idx].as_ref().unwrap();
-                            let c_buf = results[c_idx].as_ref().unwrap();
+                        } else if b_id.is_inplace() {
+                            let mut b_buf = results[b_id.get()].take().unwrap();
+                            let a_buf = results[a_id.get()].as_ref().unwrap();
+                            let c_buf = results[c_id.get()].as_ref().unwrap();
 
                             T::fma_op_inplace_b(a_buf, &mut b_buf, c_buf);
                             b_buf
-                        } else if o_idx == c_idx {
-                            let mut c_buf = results[c_idx].take().unwrap();
-                            let a_buf = results[a_idx].as_ref().unwrap();
-                            let b_buf = results[b_idx].as_ref().unwrap();
+                        } else if c_id.is_inplace() {
+                            let mut c_buf = results[c_id.get()].take().unwrap();
+                            let a_buf = results[a_id.get()].as_ref().unwrap();
+                            let b_buf = results[b_id.get()].as_ref().unwrap();
 
                             T::fma_op_inplace_c(a_buf, b_buf, &mut c_buf);
                             c_buf
                         } else {
-                            unreachable!()
+                            let a_buf = results[a_id.get()].as_ref().unwrap();
+                            let b_buf = results[b_id.get()].as_ref().unwrap();
+                            let c_buf = results[c_id.get()].as_ref().unwrap();
+
+                            let mut out = pool.borrow_mut().get_buffer(out_elem_count);
+                            T::fma_op(a_buf, b_buf, c_buf, &mut out);
+                            PooledBuffer::new(out, pool.clone())
                         }
                     }
+                    // Matrix multiplication: multiply two 2D tensors A (m x k) and B (k x n)
+                    Op::MatMul {
+                        l_id,
+                        r_id,
+                        o_id,
+                        k,
+                        alpha,
+                        beta,
+                    } => {
+                        // Determine output dimensions from shape S (must be 2D)
+                        let shape = out_shape;
+                        assert!(shape.len() == 3);
+                        let b = shape[0];
+                        let m = shape[1];
+                        let n = shape[2];
+
+                        let mut out = if let Some(o_id) = o_id {
+                            if o_id.is_inplace() {
+                                results[o_id.get()].take().unwrap()
+                            } else {
+                                let o_buf = results[o_id.get()].as_ref().unwrap();
+                                PooledBuffer::new((*o_buf).clone(), pool.clone())
+                            }
+                        } else {
+                            PooledBuffer::new(pool.borrow_mut().get_buffer(m * n), pool.clone())
+                        };
+
+                        let a_buf = results[l_id.get()].as_ref().unwrap();
+                        let b_buf = results[r_id.get()].as_ref().unwrap();
+
+                        T::launch_gemm(a_buf, b_buf, b, m, n, *k, &mut out, *alpha, *beta);
+
+                        out
+                    }
                     Op::NoOp => unreachable!("NoOp should not be evaluated."),
                 };
                 results[idx] = Some(computed);

diff --git a/constensor-core/src/cpu_storage/pool.rs b/constensor-core/src/cpu_storage/pool.rs
@@ -22,6 +22,7 @@ pub struct PoolMetrics {
     pub drops: usize,
 }
 
+#[derive(Debug)]
 /// A simple buffer pool to reuse Vec allocations across graph evaluation.
 pub struct BufferPool<T> {
     pool: Vec<Vec<T>>,
@@ -32,6 +33,7 @@ pub struct BufferPool<T> {
 /// Shared reference to a BufferPool for automatic recycling.
 pub type SharedPool<T> = Rc<RefCell<BufferPool<T>>>;
 
+#[derive(Debug)]
 /// Wrapper around Vec<T> that returns its buffer to the pool on drop.
 pub struct PooledBuffer<T: DType> {
     buf: Vec<T>,

diff --git a/constensor-core/src/device.rs b/constensor-core/src/device.rs
@@ -3,7 +3,7 @@ use crate::cuda_backend::CudaDevice;
 use crate::{
     cpu_storage::CpuDevice,
     storage::{BackendDevice, Storage},
-    DType, Op, Result, Shape,
+    DType, GraphNode, Result,
 };
 
 /// Marker trait for devices
@@ -66,13 +66,11 @@ pub enum Device {
 }
 
 impl Device {
-    pub fn compile_and_run_graph<T: DType, S: Shape>(&self, graph: &[Op<T>]) -> Result<Storage<T>> {
+    pub fn compile_and_run_graph<T: DType>(&self, graph: &[GraphNode<T>]) -> Result<Storage<T>> {
         match self {
             #[cfg(feature = "cuda")]
             Self::Cuda(cuda) => Ok(Storage::Cuda(cuda.compile_and_run_graph::<S, T>(graph)?)),
-            Self::Cpu => Ok(Storage::Cpu(
-                CpuDevice.compile_and_run_graph::<S, T>(graph)?,
-            )),
+            Self::Cpu => Ok(Storage::Cpu(CpuDevice.compile_and_run_graph::<T>(graph)?)),
         }
     }
 }