Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[target.x86_64-unknown-linux-gnu]
rustflags = [
    "-C", "target-cpu=native",
    # NOTE(review): `fp16` is an AArch64 feature name; the x86_64 feature for
    # half-precision conversion is `f16c` — confirm this matches the intent.
    "-C", "target-feature=+f16c"
]

# Native-CPU flags for Apple Silicon. The original file had an empty
# `[target.aarch64-apple-darwin]` table immediately followed by a stray
# `[build]` header, which turned these into global build flags instead of
# aarch64-specific ones; they belong under the target table itself.
[target.aarch64-apple-darwin]
rustflags = [
    "-C", "target-cpu=native",
    "-C", "target-feature=+fp16"
]

[target.wasm32-unknown-unknown]
rustflags = ["-C", "target-feature=+simd128"]

# Intel macOS: disable AVX/AVX2 (presumably to avoid codegen issues there).
[target.x86_64-apple-darwin]
rustflags = ["-C", "target-feature=-avx,-avx2"]
10 changes: 6 additions & 4 deletions constensor-core/examples/hello_world/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@ use constensor_core::{Cpu, Graph, GraphTensor, Tensor, R1, R2};

/// Hello-world example: build a small elementwise graph, optimize and
/// compile it once, then run it and check the result.
///
/// NOTE(review): this span of the diff page interleaved the old and new
/// sides of the hunk (duplicate `arange` lines; `res.to_tensor()` used
/// after `res` was replaced by `compiled.run()`); this is the reconstructed
/// post-merge version.
fn main() {
    // Lazy computation graph of f32 ops on the CPU backend.
    let mut graph: Graph<f32> = Graph::empty();

    // Example node that is registered in the graph but never read;
    // the `_` prefix silences the unused-variable warning.
    let _arange = GraphTensor::<R1<10>, f32, Cpu>::arange(&mut graph, 0., 1.);

    let a = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 1.0);
    let b = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
    let c = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 3.0);
    let d = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 4.0);

    // Elementwise: a * b + c = 1*2 + 3 = 5, then + d = 9 for every entry.
    let res = a * b + c;
    let _out = res + d;

    // Fuse/simplify the graph before compiling it.
    graph.optimize();

    // Render the optimized graph for inspection.
    graph.visualize("graph.png").unwrap();

    // Compile once, then execute to materialize a concrete tensor.
    let compiled: constensor_core::CompiledGraph<R2<3, 4>, f32, Cpu> = graph.compile().unwrap();
    let res = compiled.run().unwrap();

    let tensor: Tensor<R2<3, 4>, f32, Cpu> = res;

    assert_eq!(tensor.data().unwrap().to_vec(), vec![vec![9.0; 4]; 3],);
}
21 changes: 11 additions & 10 deletions constensor-core/examples/matmul/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use constensor_core::{Cpu, DType, Graph, GraphTensor, R3};
use constensor_core::{CompiledGraph, Cpu, DType, Graph, GraphTensor, R3};
use std::time::Instant;

fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usize>(
Expand All @@ -10,18 +10,19 @@ fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usiz
let iterations = 1000;
let mut total = std::time::Duration::new(0, 0);

for _ in 0..iterations {
let start = Instant::now();
let mut graph = Graph::empty();
let a = GraphTensor::<R3<B, M, K>, T, Cpu>::ones(&mut graph);
let b = GraphTensor::<R3<B, K, N>, T, Cpu>::ones(&mut graph);
let o = GraphTensor::<R3<B, M, N>, T, Cpu>::ones(&mut graph);
let _c = a.matmul_axpby(b, o, alpha, beta);

let mut graph = Graph::empty();
let a = GraphTensor::<R3<B, M, K>, T, Cpu>::ones(&mut graph);
let b = GraphTensor::<R3<B, K, N>, T, Cpu>::ones(&mut graph);
let o = GraphTensor::<R3<B, M, N>, T, Cpu>::ones(&mut graph);
let c = a.matmul_axpby(b, o, alpha, beta);
graph.optimize();
let compiled: CompiledGraph<R3<B, M, N>, T, Cpu> = graph.compile().unwrap();

graph.optimize();
for _ in 0..iterations {
let start = Instant::now();

let _tensor = std::hint::black_box(c.to_tensor().unwrap());
let _tensor = std::hint::black_box(compiled.run().unwrap());

total += start.elapsed();
}
Expand Down
96 changes: 61 additions & 35 deletions constensor-core/src/cpu_storage/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
use petgraph::algo::toposort;
use petgraph::graphmap::DiGraphMap;
use std::borrow::Cow;
use std::cell::RefCell;
use std::rc::Rc;
use std::{borrow::Cow, marker::PhantomData};

use pool::{BufferPool, PooledBuffer};
use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};

use crate::device::Dev;
use crate::Shape;
use crate::{
storage::{BackendDevice, BackendStorage},
DType, GraphNode, Op, Result,
CompiledGraph, DType, GraphNode, Op, Result,
};

mod pool;
Expand All @@ -29,49 +31,73 @@ impl<T: DType> BackendStorage<T> for CpuStorage<T> {
impl BackendDevice for CpuDevice {
type Storage<X: DType> = CpuStorage<X>;

fn compile_and_run_graph<T: DType>(&self, graph: &[GraphNode<T>]) -> Result<Self::Storage<T>> {
{
// Create a shared buffer pool
let pool = Rc::new(RefCell::new(BufferPool::<T>::new()));

// Build a dependency graph of tensor indices
let mut dep_graph = DiGraphMap::<usize, ()>::new();
for idx in 0..graph.len() {
dep_graph.add_node(idx);
}
fn compile<S: Shape, T: DType, D: Dev>(
&self,
graph: Vec<GraphNode<T>>,
) -> Result<CompiledGraph<S, T, D>> {
// Build a dependency graph of tensor indices
let mut dep_graph = DiGraphMap::<usize, ()>::new();
for idx in 0..graph.len() {
dep_graph.add_node(idx);
}

for (idx, node) in graph.iter().enumerate() {
match &node.op {
Op::BinaryOp { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
Op::UnaryOp { v_id, .. } => {
dep_graph.add_edge(v_id.get(), idx, ());
}
Op::FusedMulAdd { a_id, b_id, c_id } => {
dep_graph.add_edge(a_id.get(), idx, ());
dep_graph.add_edge(b_id.get(), idx, ());
dep_graph.add_edge(c_id.get(), idx, ());
}
Op::MatMul { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
// NoOp and Fill/Arange don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
for (idx, node) in graph.iter().enumerate() {
match &node.op {
Op::BinaryOp { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
Op::UnaryOp { v_id, .. } => {
dep_graph.add_edge(v_id.get(), idx, ());
}
Op::FusedMulAdd { a_id, b_id, c_id } => {
dep_graph.add_edge(a_id.get(), idx, ());
dep_graph.add_edge(b_id.get(), idx, ());
dep_graph.add_edge(c_id.get(), idx, ());
}
Op::MatMul { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
// NoOp and Fill/Arange don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
}
}

// Compute topological order
let order = toposort(&dep_graph, None).expect("Cycle detected in graph!");

Ok(CompiledGraph::Cpu {
order,
graph,
ghost: PhantomData,
})
}

fn run_graph<S: Shape, T: DType, D: Dev>(
&self,
graph: &CompiledGraph<S, T, D>,
) -> Result<Self::Storage<T>> {
{
// Create a shared buffer pool
let pool = Rc::new(RefCell::new(BufferPool::<T>::new()));

// Compute topological order
let order = toposort(&dep_graph, None).expect("Cycle detected in graph!");
#[allow(irrefutable_let_patterns)]
let CompiledGraph::Cpu {
order,
graph,
ghost: _,
} = graph
else {
unreachable!()
};

// Prepare storage for intermediate results
let mut results: Vec<Option<PooledBuffer<T>>> = Vec::with_capacity(graph.len());
results.resize_with(graph.len(), || None);

// Evaluate nodes in topological order
for idx in order {
for idx in order.clone() {
let op = &graph[idx];

let out_shape = &op.shape;
Expand Down
14 changes: 6 additions & 8 deletions constensor-core/src/cuda_backend/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ pub enum CudaError {
#[error(transparent)]
Cuda(#[from] cudarc::driver::DriverError),

#[error(transparent)]
Compiler(#[from] cudarc::nvrtc::CompileError),

#[error(transparent)]
Cublas(#[from] cudarc::cublas::result::CublasError),

#[error("{cuda} when loading {module_name}")]
Load {
cuda: cudarc::driver::DriverError,
Expand Down Expand Up @@ -34,11 +40,3 @@ impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
}
}

impl<O> WrapErr<O> for std::result::Result<O, CompileError> {
fn w(self) -> std::result::Result<O, crate::Error> {
self.map_err(|e| {
crate::Error::Cuda(Box::new(CudaError::PtxCompileError { err: e }).into()).bt()
})
}
}
Loading
Loading