Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[target.x86_64-unknown-linux-gnu]
rustflags = [
    "-C", "target-cpu=native",
    # NOTE(review): `fp16` is an AArch64 feature name; the x86_64 feature for
    # half-precision conversion is `f16c` — confirm this matches the intent.
    "-C", "target-feature=+f16c"
]

# Native-CPU flags for Apple Silicon. The original file had an empty
# `[target.aarch64-apple-darwin]` table immediately followed by a stray
# `[build]` header, which turned these into global build flags instead of
# aarch64-specific ones; they belong under the target table itself.
[target.aarch64-apple-darwin]
rustflags = [
    "-C", "target-cpu=native",
    "-C", "target-feature=+fp16"
]

[target.wasm32-unknown-unknown]
rustflags = ["-C", "target-feature=+simd128"]

# Intel macOS: disable AVX/AVX2 (presumably to avoid codegen issues there).
[target.x86_64-apple-darwin]
rustflags = ["-C", "target-feature=-avx,-avx2"]
10 changes: 6 additions & 4 deletions constensor-core/examples/hello_world/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@ use constensor_core::{Cpu, Graph, GraphTensor, Tensor, R1, R2};

/// Hello-world example: build a small elementwise graph, optimize and
/// compile it once, then run it and check the result.
///
/// NOTE(review): this span of the diff page interleaved the old and new
/// sides of the hunk (duplicate `arange` lines; `res.to_tensor()` used
/// after `res` was replaced by `compiled.run()`); this is the reconstructed
/// post-merge version.
fn main() {
    // Lazy computation graph of f32 ops on the CPU backend.
    let mut graph: Graph<f32> = Graph::empty();

    // Example node that is registered in the graph but never read;
    // the `_` prefix silences the unused-variable warning.
    let _arange = GraphTensor::<R1<10>, f32, Cpu>::arange(&mut graph, 0., 1.);

    let a = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 1.0);
    let b = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 2.0);
    let c = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 3.0);
    let d = GraphTensor::<R2<3, 4>, f32, Cpu>::fill(&mut graph, 4.0);

    // Elementwise: a * b + c = 1*2 + 3 = 5, then + d = 9 for every entry.
    let res = a * b + c;
    let _out = res + d;

    // Fuse/simplify the graph before compiling it.
    graph.optimize();

    // Render the optimized graph for inspection.
    graph.visualize("graph.png").unwrap();

    // Compile once, then execute to materialize a concrete tensor.
    let compiled: constensor_core::CompiledGraph<R2<3, 4>, f32, Cpu> = graph.compile().unwrap();
    let res = compiled.run().unwrap();

    let tensor: Tensor<R2<3, 4>, f32, Cpu> = res;

    assert_eq!(tensor.data().unwrap().to_vec(), vec![vec![9.0; 4]; 3],);
}
21 changes: 11 additions & 10 deletions constensor-core/examples/matmul/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use constensor_core::{Cpu, DType, Graph, GraphTensor, R3};
use constensor_core::{CompiledGraph, Cpu, DType, Graph, GraphTensor, R3};
use std::time::Instant;

fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usize>(
Expand All @@ -10,18 +10,19 @@ fn bench<T: DType, const B: usize, const M: usize, const K: usize, const N: usiz
let iterations = 1000;
let mut total = std::time::Duration::new(0, 0);

for _ in 0..iterations {
let start = Instant::now();
let mut graph = Graph::empty();
let a = GraphTensor::<R3<B, M, K>, T, Cpu>::ones(&mut graph);
let b = GraphTensor::<R3<B, K, N>, T, Cpu>::ones(&mut graph);
let o = GraphTensor::<R3<B, M, N>, T, Cpu>::ones(&mut graph);
let _c = a.matmul_axpby(b, o, alpha, beta);

let mut graph = Graph::empty();
let a = GraphTensor::<R3<B, M, K>, T, Cpu>::ones(&mut graph);
let b = GraphTensor::<R3<B, K, N>, T, Cpu>::ones(&mut graph);
let o = GraphTensor::<R3<B, M, N>, T, Cpu>::ones(&mut graph);
let c = a.matmul_axpby(b, o, alpha, beta);
graph.optimize();
let compiled: CompiledGraph<R3<B, M, N>, T, Cpu> = graph.compile().unwrap();

graph.optimize();
for _ in 0..iterations {
let start = Instant::now();

let _tensor = std::hint::black_box(c.to_tensor().unwrap());
let _tensor = std::hint::black_box(compiled.run().unwrap());

total += start.elapsed();
}
Expand Down
96 changes: 61 additions & 35 deletions constensor-core/src/cpu_storage/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
use petgraph::algo::toposort;
use petgraph::graphmap::DiGraphMap;
use std::borrow::Cow;
use std::cell::RefCell;
use std::rc::Rc;
use std::{borrow::Cow, marker::PhantomData};

use pool::{BufferPool, PooledBuffer};
use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator};

use crate::device::Dev;
use crate::Shape;
use crate::{
storage::{BackendDevice, BackendStorage},
DType, GraphNode, Op, Result,
CompiledGraph, DType, GraphNode, Op, Result,
};

mod pool;
Expand All @@ -29,49 +31,73 @@ impl<T: DType> BackendStorage<T> for CpuStorage<T> {
impl BackendDevice for CpuDevice {
type Storage<X: DType> = CpuStorage<X>;

fn compile_and_run_graph<T: DType>(&self, graph: &[GraphNode<T>]) -> Result<Self::Storage<T>> {
{
// Create a shared buffer pool
let pool = Rc::new(RefCell::new(BufferPool::<T>::new()));

// Build a dependency graph of tensor indices
let mut dep_graph = DiGraphMap::<usize, ()>::new();
for idx in 0..graph.len() {
dep_graph.add_node(idx);
}
fn compile<S: Shape, T: DType, D: Dev>(
&self,
graph: Vec<GraphNode<T>>,
) -> Result<CompiledGraph<S, T, D>> {
// Build a dependency graph of tensor indices
let mut dep_graph = DiGraphMap::<usize, ()>::new();
for idx in 0..graph.len() {
dep_graph.add_node(idx);
}

for (idx, node) in graph.iter().enumerate() {
match &node.op {
Op::BinaryOp { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
Op::UnaryOp { v_id, .. } => {
dep_graph.add_edge(v_id.get(), idx, ());
}
Op::FusedMulAdd { a_id, b_id, c_id } => {
dep_graph.add_edge(a_id.get(), idx, ());
dep_graph.add_edge(b_id.get(), idx, ());
dep_graph.add_edge(c_id.get(), idx, ());
}
Op::MatMul { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
// NoOp and Fill/Arange don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
for (idx, node) in graph.iter().enumerate() {
match &node.op {
Op::BinaryOp { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
Op::UnaryOp { v_id, .. } => {
dep_graph.add_edge(v_id.get(), idx, ());
}
Op::FusedMulAdd { a_id, b_id, c_id } => {
dep_graph.add_edge(a_id.get(), idx, ());
dep_graph.add_edge(b_id.get(), idx, ());
dep_graph.add_edge(c_id.get(), idx, ());
}
Op::MatMul { l_id, r_id, .. } => {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
// NoOp and Fill/Arange don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
}
}

// Compute topological order
let order = toposort(&dep_graph, None).expect("Cycle detected in graph!");

Ok(CompiledGraph::Cpu {
order,
graph,
ghost: PhantomData,
})
}

fn run_graph<S: Shape, T: DType, D: Dev>(
&self,
graph: &CompiledGraph<S, T, D>,
) -> Result<Self::Storage<T>> {
{
// Create a shared buffer pool
let pool = Rc::new(RefCell::new(BufferPool::<T>::new()));

// Compute topological order
let order = toposort(&dep_graph, None).expect("Cycle detected in graph!");
#[allow(irrefutable_let_patterns)]
let CompiledGraph::Cpu {
order,
graph,
ghost: _,
} = graph
else {
unreachable!()
};

// Prepare storage for intermediate results
let mut results: Vec<Option<PooledBuffer<T>>> = Vec::with_capacity(graph.len());
results.resize_with(graph.len(), || None);

// Evaluate nodes in topological order
for idx in order {
for idx in order.clone() {
let op = &graph[idx];

let out_shape = &op.shape;
Expand Down
14 changes: 6 additions & 8 deletions constensor-core/src/cuda_backend/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ pub enum CudaError {
#[error(transparent)]
Cuda(#[from] cudarc::driver::DriverError),

#[error(transparent)]
Compiler(#[from] cudarc::nvrtc::CompileError),

#[error(transparent)]
Cublas(#[from] cudarc::cublas::result::CublasError),

#[error("{cuda} when loading {module_name}")]
Load {
cuda: cudarc::driver::DriverError,
Expand Down Expand Up @@ -34,11 +40,3 @@ impl<O, E: Into<CudaError>> WrapErr<O> for std::result::Result<O, E> {
self.map_err(|e| crate::Error::Cuda(Box::new(e.into())).bt())
}
}

impl<O> WrapErr<O> for std::result::Result<O, CompileError> {
fn w(self) -> std::result::Result<O, crate::Error> {
self.map_err(|e| {
crate::Error::Cuda(Box::new(CudaError::PtxCompileError { err: e }).into()).bt()
})
}
}
Loading
Loading