Skip to content
Merged
300 changes: 298 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions constensor-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ half = { workspace = true, optional = true }
dirs = "5.0.1"
rayon = "1.10.0"
petgraph.workspace = true
gemm = "0.18"
num_cpus = "1.16.0"

[features]
default = ["half", "bfloat"]
Expand Down
12 changes: 7 additions & 5 deletions constensor-core/examples/hello_world/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@ fn main() {
let mut graph: Graph<f32> = Graph::empty();
let arange = GraphTensor::<R1<10>, f32, Cpu>::arange(&mut graph, 0., 1.);
dbg!(&arange.to_tensor().unwrap().data());
let x = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 1.0);
let y = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
let z = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
let res = y * x + z;
let a = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 1.0);
let b = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
let c = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 3.0);
let d = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 4.0);
let res = a * b + c;
let res = res + d;

graph.optimize();

graph.visualize("graph.png").unwrap();

let tensor: Tensor<R2<3, 4>, f32, Cpu> = res.to_tensor().unwrap();

assert_eq!(tensor.data().unwrap().to_vec(), vec![vec![4.0; 4]; 3],);
assert_eq!(tensor.data().unwrap().to_vec(), vec![vec![9.0; 4]; 3],);
}
41 changes: 41 additions & 0 deletions constensor-core/examples/matmul/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use constensor_core::{Cpu, DType, Graph, GraphTensor, R3};
use std::time::Instant;

fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usize>(
type_name: &str,
alpha: T,
beta: T,
) {
// Number of times to run the matmul for averaging
let iterations = 1000;
let mut total = std::time::Duration::new(0, 0);

for _ in 0..iterations {
let start = Instant::now();

let mut graph = Graph::empty();
let a = GraphTensor::<R3<B, M, K>, T, Cpu>::ones(&mut graph);
let b = GraphTensor::<R3<B, K, N>, T, Cpu>::ones(&mut graph);
let o = GraphTensor::<R3<B, M, N>, T, Cpu>::ones(&mut graph);
let c = a.matmul_axpby(b, o, alpha, beta);

graph.optimize();

let _tensor = std::hint::black_box(c.to_tensor().unwrap());

total += start.elapsed();
}

let avg = total / (iterations as u32);
println!("Average execution time for {type_name} over {iterations} iterations: {avg:?}");
}

fn main() {
const B: usize = 1;
const M: usize = 128;
const N: usize = 128;
const K: usize = 128;

bench::<f32, B, M, K, N>("f32", 1.0, 1.0);
bench::<i32, B, M, K, N>("i32", 1, 1);
}
179 changes: 89 additions & 90 deletions constensor-core/src/cpu_storage/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ use pool::{BufferPool, PooledBuffer};
use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};

use crate::{
graph::GraphTensorId,
storage::{BackendDevice, BackendStorage},
DType, Op, Result, Shape,
DType, GraphNode, Op, Result,
};

mod pool;
Expand All @@ -30,10 +29,7 @@ impl<T: DType> BackendStorage<T> for CpuStorage<T> {
impl BackendDevice for CpuDevice {
type Storage<X: DType> = CpuStorage<X>;

fn compile_and_run_graph<S: Shape, T: DType>(
&self,
graph: &[Op<T>],
) -> Result<Self::Storage<T>> {
fn compile_and_run_graph<T: DType>(&self, graph: &[GraphNode<T>]) -> Result<Self::Storage<T>> {
{
// Create a shared buffer pool
let pool = Rc::new(RefCell::new(BufferPool::<T>::new()));
Expand All @@ -45,27 +41,22 @@ impl BackendDevice for CpuDevice {
}

for (idx, node) in graph.iter().enumerate() {
match node {
Op::BinaryOp { l_id, r_id, .. } | Op::InplaceBinaryOp { l_id, r_id, .. } => {
let l_idx = <&GraphTensorId as Into<usize>>::into(l_id);
let r_idx = <&GraphTensorId as Into<usize>>::into(r_id);
dep_graph.add_edge(l_idx, idx, ());
dep_graph.add_edge(r_idx, idx, ());
match &node.op {
Op::BinaryOp { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
Op::UnaryOp { v_id, .. } => {
let v_idx = <&GraphTensorId as Into<usize>>::into(v_id);
dep_graph.add_edge(v_idx, idx, ());
dep_graph.add_edge(v_id.get(), idx, ());
}
Op::FusedMulAdd { a_id, b_id, c_id }
| Op::InplaceFusedMulAdd {
a_id, b_id, c_id, ..
} => {
let a_idx = <&GraphTensorId as Into<usize>>::into(a_id);
let b_idx = <&GraphTensorId as Into<usize>>::into(b_id);
let c_idx = <&GraphTensorId as Into<usize>>::into(c_id);
dep_graph.add_edge(a_idx, idx, ());
dep_graph.add_edge(b_idx, idx, ());
dep_graph.add_edge(c_idx, idx, ());
Op::FusedMulAdd { a_id, b_id, c_id } => {
dep_graph.add_edge(a_id.get(), idx, ());
dep_graph.add_edge(b_id.get(), idx, ());
dep_graph.add_edge(c_id.get(), idx, ());
}
Op::MatMul { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
// NoOp and Fill/Arange don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
Expand All @@ -82,50 +73,41 @@ impl BackendDevice for CpuDevice {
// Evaluate nodes in topological order
for idx in order {
let op = &graph[idx];
let computed = match op {

let out_shape = &op.shape;
let out_elem_count: usize = out_shape.iter().product();

let computed = match &op.op {
Op::BinaryOp {
l_id,
r_id,
operator,
} => {
let l_idx = <&GraphTensorId as Into<usize>>::into(l_id);
let r_idx = <&GraphTensorId as Into<usize>>::into(r_id);
let l_buf = results[l_idx].as_ref().unwrap();
let r_buf = results[r_idx].as_ref().unwrap();
let mut out = pool.borrow_mut().get_buffer(S::element_count());
T::binary_simd_op(l_buf, r_buf, &mut out, *operator);
PooledBuffer::new(out, pool.clone())
}
Op::InplaceBinaryOp {
out,
l_id,
r_id,
operator,
} => {
let l_idx = <&GraphTensorId as Into<usize>>::into(l_id);
let r_idx = <&GraphTensorId as Into<usize>>::into(r_id);
let o_idx = <&GraphTensorId as Into<usize>>::into(out);
if o_idx == l_idx {
let mut l_buf = results[l_idx].take().unwrap();
let r_buf = results[r_idx].as_ref().unwrap();
if l_id.is_inplace() {
let mut l_buf = results[l_id.get()].take().unwrap();
let r_buf = results[r_id.get()].as_ref().unwrap();
T::binary_simd_op_inplace_lhs(&mut l_buf, r_buf, *operator);
l_buf
} else if o_idx == r_idx {
let mut r_buf = results[r_idx].take().unwrap();
let l_buf = results[l_idx].as_ref().unwrap();
} else if r_id.is_inplace() {
let mut r_buf = results[r_id.get()].take().unwrap();
let l_buf = results[l_id.get()].as_ref().unwrap();
T::binary_simd_op_inplace_rhs(l_buf, &mut r_buf, *operator);
r_buf
} else {
unreachable!()
let l_buf = results[l_id.get()].as_ref().unwrap();
let r_buf = results[r_id.get()].as_ref().unwrap();
let mut out = pool.borrow_mut().get_buffer(out_elem_count);
T::binary_simd_op(l_buf, r_buf, &mut out, *operator);
PooledBuffer::new(out, pool.clone())
}
}
Op::Fill { v } => {
let mut buf = pool.borrow_mut().get_empty_buffer(S::element_count());
buf.extend(std::iter::repeat_n(*v, S::element_count()));
let mut buf = pool.borrow_mut().get_empty_buffer(out_elem_count);
buf.extend(std::iter::repeat_n(*v, out_elem_count));
PooledBuffer::new(buf, pool.clone())
}
Op::Arange { start, step, stop } => {
let mut buf = pool.borrow_mut().get_empty_buffer(S::element_count());
let mut buf = pool.borrow_mut().get_empty_buffer(out_elem_count);
let mut x = start.to_f64();
while x < stop.to_f64() {
buf.push(T::from_f64(x));
Expand All @@ -134,63 +116,80 @@ impl BackendDevice for CpuDevice {
PooledBuffer::new(buf, pool.clone())
}
Op::UnaryOp { v_id, operator } => {
let v_idx = <&GraphTensorId as Into<usize>>::into(v_id);
let buf = results[v_idx].as_ref().unwrap();
let buf = results[v_id.get()].as_ref().unwrap();
let op_fn = operator.to_closure();
let mut out = pool.borrow_mut().get_buffer(S::element_count());
let mut out = pool.borrow_mut().get_buffer(out_elem_count);
out.par_iter_mut()
.zip(&**buf)
.for_each(|(out, x): (&mut T, &T)| *out = op_fn(*x));
PooledBuffer::new(out, pool.clone())
}
Op::FusedMulAdd { a_id, b_id, c_id } => {
let a_idx = <&GraphTensorId as Into<usize>>::into(a_id);
let b_idx = <&GraphTensorId as Into<usize>>::into(b_id);
let c_idx = <&GraphTensorId as Into<usize>>::into(c_id);
let a_buf = results[a_idx].as_ref().unwrap();
let b_buf = results[b_idx].as_ref().unwrap();
let c_buf = results[c_idx].as_ref().unwrap();

let mut out = pool.borrow_mut().get_buffer(S::element_count());
T::fma_op(a_buf, b_buf, c_buf, &mut out);
PooledBuffer::new(out, pool.clone())
}
Op::InplaceFusedMulAdd {
a_id,
b_id,
c_id,
out,
} => {
let a_idx = <&GraphTensorId as Into<usize>>::into(a_id);
let b_idx = <&GraphTensorId as Into<usize>>::into(b_id);
let c_idx = <&GraphTensorId as Into<usize>>::into(c_id);
let o_idx = <&GraphTensorId as Into<usize>>::into(out);

if o_idx == a_idx {
let mut a_buf = results[a_idx].take().unwrap();
let b_buf = results[b_idx].as_ref().unwrap();
let c_buf = results[c_idx].as_ref().unwrap();
if a_id.is_inplace() {
let mut a_buf = results[a_id.get()].take().unwrap();
let b_buf = results[b_id.get()].as_ref().unwrap();
let c_buf = results[c_id.get()].as_ref().unwrap();

T::fma_op_inplace_a(&mut a_buf, b_buf, c_buf);
a_buf
} else if o_idx == b_idx {
let mut b_buf = results[b_idx].take().unwrap();
let a_buf = results[a_idx].as_ref().unwrap();
let c_buf = results[c_idx].as_ref().unwrap();
} else if b_id.is_inplace() {
let mut b_buf = results[b_id.get()].take().unwrap();
let a_buf = results[a_id.get()].as_ref().unwrap();
let c_buf = results[c_id.get()].as_ref().unwrap();

T::fma_op_inplace_b(a_buf, &mut b_buf, c_buf);
b_buf
} else if o_idx == c_idx {
let mut c_buf = results[c_idx].take().unwrap();
let a_buf = results[a_idx].as_ref().unwrap();
let b_buf = results[b_idx].as_ref().unwrap();
} else if c_id.is_inplace() {
let mut c_buf = results[c_id.get()].take().unwrap();
let a_buf = results[a_id.get()].as_ref().unwrap();
let b_buf = results[b_id.get()].as_ref().unwrap();

T::fma_op_inplace_c(a_buf, b_buf, &mut c_buf);
c_buf
} else {
unreachable!()
let a_buf = results[a_id.get()].as_ref().unwrap();
let b_buf = results[b_id.get()].as_ref().unwrap();
let c_buf = results[c_id.get()].as_ref().unwrap();

let mut out = pool.borrow_mut().get_buffer(out_elem_count);
T::fma_op(a_buf, b_buf, c_buf, &mut out);
PooledBuffer::new(out, pool.clone())
}
}
// Matrix multiplication: multiply two 2D tensors A (m x k) and B (k x n)
Op::MatMul {
l_id,
r_id,
o_id,
k,
alpha,
beta,
} => {
// Determine output dimensions from shape S (must be 2D)
let shape = out_shape;
assert!(shape.len() == 3);
let b = shape[0];
let m = shape[1];
let n = shape[2];

let mut out = if let Some(o_id) = o_id {
if o_id.is_inplace() {
results[o_id.get()].take().unwrap()
} else {
let o_buf = results[o_id.get()].as_ref().unwrap();
PooledBuffer::new((*o_buf).clone(), pool.clone())
}
} else {
PooledBuffer::new(pool.borrow_mut().get_buffer(m * n), pool.clone())
};

let a_buf = results[l_id.get()].as_ref().unwrap();
let b_buf = results[r_id.get()].as_ref().unwrap();

T::launch_gemm(a_buf, b_buf, b, m, n, *k, &mut out, *alpha, *beta);

out
}
Op::NoOp => unreachable!("NoOp should not be evaluated."),
};
results[idx] = Some(computed);
Expand Down
2 changes: 2 additions & 0 deletions constensor-core/src/cpu_storage/pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pub struct PoolMetrics {
pub drops: usize,
}

#[derive(Debug)]
/// A simple buffer pool to reuse Vec allocations across graph evaluation.
pub struct BufferPool<T> {
pool: Vec<Vec<T>>,
Expand All @@ -32,6 +33,7 @@ pub struct BufferPool<T> {
/// Shared reference to a BufferPool for automatic recycling.
pub type SharedPool<T> = Rc<RefCell<BufferPool<T>>>;

#[derive(Debug)]
/// Wrapper around Vec<T> that returns its buffer to the pool on drop.
pub struct PooledBuffer<T: DType> {
buf: Vec<T>,
Expand Down
8 changes: 3 additions & 5 deletions constensor-core/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::cuda_backend::CudaDevice;
use crate::{
cpu_storage::CpuDevice,
storage::{BackendDevice, Storage},
DType, Op, Result, Shape,
DType, GraphNode, Result,
};

/// Marker trait for devices
Expand Down Expand Up @@ -66,13 +66,11 @@ pub enum Device {
}

impl Device {
pub fn compile_and_run_graph<T: DType, S: Shape>(&self, graph: &[Op<T>]) -> Result<Storage<T>> {
pub fn compile_and_run_graph<T: DType>(&self, graph: &[GraphNode<T>]) -> Result<Storage<T>> {
match self {
#[cfg(feature = "cuda")]
Self::Cuda(cuda) => Ok(Storage::Cuda(cuda.compile_and_run_graph::<S, T>(graph)?)),
Self::Cpu => Ok(Storage::Cpu(
CpuDevice.compile_and_run_graph::<S, T>(graph)?,
)),
Self::Cpu => Ok(Storage::Cpu(CpuDevice.compile_and_run_graph::<T>(graph)?)),
}
}
}
Loading
Loading