diff --git a/.cargo/config.toml b/.cargo/config.toml index 5464853..c7a111c 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,17 @@ - [build] - incremental = false \ No newline at end of file +[target.x86_64-unknown-linux-gnu] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-feature=+f16c" +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-feature=+fp16" +] + +[target.wasm32-unknown-unknown] +rustflags = ["-C", "target-feature=+simd128"] + +[target.x86_64-apple-darwin] +rustflags = ["-C", "target-feature=-avx,-avx2"] \ No newline at end of file diff --git a/constensor-core/Cargo.toml b/constensor-core/Cargo.toml index 0588eb1..5f80b6c 100644 --- a/constensor-core/Cargo.toml +++ b/constensor-core/Cargo.toml @@ -22,7 +22,7 @@ rand.workspace = true rand_distr.workspace = true [features] -default = ["half", "bfloat"] +default = [] all = ["cuda", "half", "bfloat"] cuda = ["cudarc"] half = ["dep:half"] diff --git a/constensor-core/src/cuda_backend/error.rs b/constensor-core/src/cuda_backend/error.rs index 7d85f21..2970829 100644 --- a/constensor-core/src/cuda_backend/error.rs +++ b/constensor-core/src/cuda_backend/error.rs @@ -12,6 +12,9 @@ pub enum CudaError { #[error(transparent)] Cublas(#[from] cudarc::cublas::result::CublasError), + #[error(transparent)] + Curand(#[from] cudarc::curand::result::CurandError), + #[error("{cuda} when loading {module_name}")] Load { cuda: cudarc::driver::DriverError, diff --git a/constensor-core/src/cuda_backend/mod.rs b/constensor-core/src/cuda_backend/mod.rs index ebbf94e..bf62c8d 100644 --- a/constensor-core/src/cuda_backend/mod.rs +++ b/constensor-core/src/cuda_backend/mod.rs @@ -7,7 +7,10 @@ use cudarc::{ }; use error::WrapErr; use petgraph::{algo::toposort, prelude::DiGraphMap}; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, RwLock, +}; use std::{ borrow::Cow, collections::{HashMap, 
HashSet}, @@ -16,7 +19,6 @@ use std::{ marker::PhantomData, ops::Deref, path::{Path, PathBuf}, - sync::{Arc, RwLock}, }; use crate::{ @@ -29,6 +31,9 @@ use crate::{ pub(crate) mod error; pub(crate) mod util; +pub struct CudaRng(cudarc::curand::CudaRng); +unsafe impl Send for CudaRng {} + #[derive(Clone)] pub struct CudaDevice { context: Arc, @@ -123,6 +128,20 @@ pub enum CudaCompiledKernel { cublas: cudarc::cublas::CudaBlas, stream: Arc, }, + Rand { + rng: Arc>, + stream: Arc, + elem_count: usize, + order: usize, + }, + Randn { + mean: T, + std: T, + rng: Arc>, + stream: Arc, + elem_count: usize, + order: usize, + }, } #[derive(Debug)] @@ -190,7 +209,9 @@ fn handle_node( format!("( static_cast(fma(static_cast({a_name}), static_cast({b_name}), static_cast({c_name}))))") } Op::NoOp => unreachable!("no-op ops should never be reached."), - Op::MatMul { .. } => unreachable!("matmul op should have its own split!"), + Op::MatMul { .. } | Op::Rand | Op::Randn { .. } => { + unreachable!("op should have its own split!") + } } } @@ -366,8 +387,8 @@ impl BackendDevice for CudaDevice { dep_graph.add_edge(l_id.get(), idx, ()); dep_graph.add_edge(r_id.get(), idx, ()); } - // NoOp and Fill/Arange don’t create incoming edges - Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {} + // These don’t create incoming edges + Op::NoOp | Op::Fill { .. } | Op::Rand | Op::Randn { .. } | Op::Arange { .. 
} => {} } } @@ -429,6 +450,34 @@ impl BackendDevice for CudaDevice { stream, }); } + Op::Rand => { + let stream = self.select_stream(); + let curand = Arc::new(Mutex::new(CudaRng( + cudarc::curand::CudaRng::new(0, stream.clone()).w()?, + ))); + + matmuls.push(CudaCompiledKernel::Rand { + rng: curand, + stream, + elem_count: graph[idx].shape.iter().product(), + order: idx, + }); + } + Op::Randn { mean, std } => { + let stream = self.select_stream(); + let curand = Arc::new(Mutex::new(CudaRng( + cudarc::curand::CudaRng::new(0, stream.clone()).w()?, + ))); + + matmuls.push(CudaCompiledKernel::Randn { + mean: *mean, + std: *std, + rng: curand, + stream, + elem_count: graph[idx].shape.iter().product(), + order: idx, + }); + } _ => { let shape_key = graph[idx].shape.clone(); let should_group = if let Some((last_group, _)) = splits.last_mut() { @@ -546,6 +595,48 @@ impl BackendDevice for CudaDevice { }; last_storage.insert(order, storage); } + CudaCompiledKernel::Rand { + stream, + rng, + elem_count, + order, + } => { + let mut slice = unsafe { stream.alloc::(*elem_count).w()? }; + T::cuda_fill_with_uniform(&rng.lock().unwrap().0, &mut slice)?; + + // Record completion event for the MatMul result + let event = self.context.new_event(None).w()?; + event.record(stream).w()?; + + let storage = CudaStorage { + slice, + device: self.clone(), + event, + }; + last_storage.insert(order, storage); + } + CudaCompiledKernel::Randn { + mean, + std, + stream, + rng, + elem_count, + order, + } => { + let mut slice = unsafe { stream.alloc::(*elem_count).w()? 
}; + T::cuda_fill_with_normal(&rng.lock().unwrap().0, &mut slice, *mean, *std)?; + + // Record completion event for the normal random fill result + let event = self.context.new_event(None).w()?; + event.record(stream).w()?; + + let storage = CudaStorage { + slice, + device: self.clone(), + event, + }; + last_storage.insert(order, storage); + } } } diff --git a/constensor-core/src/dtype/gemm.rs b/constensor-core/src/dtype/gemm.rs index 57c22f1..b8870fb 100644 --- a/constensor-core/src/dtype/gemm.rs +++ b/constensor-core/src/dtype/gemm.rs @@ -323,6 +323,8 @@ instantiate_gemm!(i64, 0, SIMD); instantiate_gemm!(f32, 0., GEMM); instantiate_gemm!(f64, 0., GEMM); #[cfg(feature = "bfloat")] -instantiate_gemm!(bf16, bf16::from_f32(0.), SIMD); +// Use naive implementation for bf16 to avoid CPU SIMD half-precision assembly requirements +instantiate_gemm!(bf16, bf16::from_f32(0.), NAIVE); #[cfg(feature = "half")] -instantiate_gemm!(f16, f16::from_f32(0.), GEMM); +// Use naive implementation for f16 to avoid CPU SIMD half-precision assembly requirements +instantiate_gemm!(f16, f16::from_f32(0.), NAIVE); diff --git a/constensor-core/src/dtype/mod.rs b/constensor-core/src/dtype/mod.rs index 56bf6bd..bee9dc5 100644 --- a/constensor-core/src/dtype/mod.rs +++ b/constensor-core/src/dtype/mod.rs @@ -3,7 +3,6 @@ use std::{ ops::{Add, Div, Mul, Sub}, }; -use gemm::GemmDispatch; #[cfg(feature = "bfloat")] use half::bf16; #[cfg(feature = "half")] @@ -11,9 +10,13 @@ use half::f16; #[cfg(feature = "cuda")] use cudarc::driver::DeviceRepr; + +use gemm::GemmDispatch; +use rand::RandDispatch; use simd_ops::SimdSupported; mod gemm; +mod rand; mod simd_ops; /// Type which can be square-rooted. 
@@ -89,6 +92,7 @@ pub trait DTypeOps: + Sqrtable + SimdSupported + GemmDispatch + + RandDispatch { } diff --git a/constensor-core/src/dtype/rand.rs b/constensor-core/src/dtype/rand.rs new file mode 100644 index 0000000..605b70c --- /dev/null +++ b/constensor-core/src/dtype/rand.rs @@ -0,0 +1,187 @@ +#[cfg(feature = "cuda")] +use { + super::DType, + crate::{cuda_backend::error::WrapErr, Result}, + cudarc::{curand::CudaRng, driver::CudaSlice}, +}; +// Optional half-precision types +#[cfg(feature = "bfloat")] +use half::bf16; +#[cfg(feature = "half")] +use half::f16; + +/// Dispatch random fills based on the data type (CUDA backend). +pub trait RandDispatch { + /// Fill the slice with uniform random values on the GPU. + #[cfg(feature = "cuda")] + fn cuda_fill_with_uniform(rng: &CudaRng, slice: &mut CudaSlice) -> Result<()> + where + Self: Sized; + + /// Fill the slice with normal (Gaussian) random values on the GPU. + #[cfg(feature = "cuda")] + fn cuda_fill_with_normal( + rng: &CudaRng, + slice: &mut CudaSlice, + mean: Self, + std: Self, + ) -> Result<()> + where + Self: Sized; +} + +// f32: support both uniform and normal +#[cfg(feature = "cuda")] +impl RandDispatch for f32 { + fn cuda_fill_with_uniform(rng: &CudaRng, slice: &mut CudaSlice) -> Result<()> { + rng.fill_with_uniform(slice).w() + } + fn cuda_fill_with_normal( + rng: &CudaRng, + slice: &mut CudaSlice, + mean: Self, + std: Self, + ) -> Result<()> { + rng.fill_with_normal(slice, mean, std).w() + } +} + +// f64: support both uniform and normal +#[cfg(feature = "cuda")] +impl RandDispatch for f64 { + fn cuda_fill_with_uniform(rng: &CudaRng, slice: &mut CudaSlice) -> Result<()> { + rng.fill_with_uniform(slice).w() + } + fn cuda_fill_with_normal( + rng: &CudaRng, + slice: &mut CudaSlice, + mean: Self, + std: Self, + ) -> Result<()> { + rng.fill_with_normal(slice, mean, std).w() + } +} + +// u32: uniform only +#[cfg(feature = "cuda")] +impl RandDispatch for u32 { + fn cuda_fill_with_uniform(rng: &CudaRng, slice: 
&mut CudaSlice) -> Result<()> { + rng.fill_with_uniform(slice).w() + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} + +// All other integral or half types: unsupported +#[cfg(feature = "cuda")] +impl RandDispatch for u8 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(feature = "cuda")] +impl RandDispatch for i32 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(feature = "cuda")] +impl RandDispatch for i64 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(all(feature = "cuda", feature = "half"))] +impl RandDispatch for f16 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + 
_slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(all(feature = "cuda", feature = "bfloat"))] +impl RandDispatch for bf16 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} + +#[cfg(not(feature = "cuda"))] +impl RandDispatch for T {} diff --git a/constensor-core/src/tensor/graphtensor.rs b/constensor-core/src/tensor/graphtensor.rs index e13bd75..614a22f 100644 --- a/constensor-core/src/tensor/graphtensor.rs +++ b/constensor-core/src/tensor/graphtensor.rs @@ -5,7 +5,7 @@ use std::{ }; use crate::{ - device::{Cpu, Dev}, + device::Dev, graph::{BinaryOpType, Graph, GraphTensorId, Op, UnaryOpType}, DType, Shape, R1, R3, }; @@ -113,6 +113,29 @@ impl GraphTensor { _ghost: PhantomData, } } + #[must_use] + /// Create a tensor filled with uniform random values in [0,1). + pub fn rand(graph: &mut Graph) -> Self { + let id = graph.next_id(); + graph.add_op::(Op::Rand); + GraphTensor { + id, + graph: Arc::new(RwLock::new(graph.clone())), + _ghost: PhantomData, + } + } + + #[must_use] + /// Create a tensor filled with normally distributed random values (mean, std). + pub fn randn(graph: &mut Graph, mean: T, std: T) -> Self { + let id = graph.next_id(); + graph.add_op::(Op::Randn { mean, std }); + GraphTensor { + id, + graph: Arc::new(RwLock::new(graph.clone())), + _ghost: PhantomData, + } + } } impl GraphTensor { @@ -146,33 +169,6 @@ impl GraphTensor, T, D> { } } -// CPU-only random initializations -impl GraphTensor { - #[must_use] - /// Create a tensor filled with uniform random values in [0,1). 
- pub fn rand(graph: &mut Graph) -> Self { - let id = graph.next_id(); - graph.add_op::(Op::Rand); - GraphTensor { - id, - graph: Arc::new(RwLock::new(graph.clone())), - _ghost: PhantomData, - } - } - - #[must_use] - /// Create a tensor filled with normally distributed random values (mean, std). - pub fn randn(graph: &mut Graph, mean: T, std: T) -> Self { - let id = graph.next_id(); - graph.add_op::(Op::Randn { mean, std }); - GraphTensor { - id, - graph: Arc::new(RwLock::new(graph.clone())), - _ghost: PhantomData, - } - } -} - macro_rules! graphtensor_binop { ($trait:ident, $fn_name:ident) => { impl $trait for GraphTensor { diff --git a/constensor-core/tests/ops.rs b/constensor-core/tests/ops.rs index 8674f7a..aaf9f03 100644 --- a/constensor-core/tests/ops.rs +++ b/constensor-core/tests/ops.rs @@ -370,23 +370,36 @@ test_for_device_sqrt!(Cpu, cpu_tests_sqrt); #[cfg(feature = "cuda")] test_for_device_sqrt!(Cuda<0>, cuda_tests_sqrt); -#[test] -fn cpu_rand_uniform() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f32, Cpu>::rand(&mut graph); - let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - let data = tensor.data().unwrap().to_vec(); - for &v in &data { - assert!((0.0..1.0).contains(&v), "value {v} out of [0,1)"); - } -} +macro_rules! 
test_for_device_rand { + ($dev:ty, $name:ident) => { + mod $name { + use super::*; + #[test] + fn rand_uniform() { + let mut graph = Graph::empty(); + let _x: GraphTensor, f32, $dev> = + GraphTensor::, f32, $dev>::rand(&mut graph); + let compiled: CompiledGraph, f32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let data = tensor.data().unwrap().to_vec(); + for &v in &data { + assert!((0.0..1.0).contains(&v), "value {v} out of [0,1)"); + } + } -#[test] -fn cpu_randn_zero_std() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f32, Cpu>::randn(&mut graph, PI, 0.0); - let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - assert_eq!(tensor.data().unwrap().to_vec(), vec![PI; 5]); + #[test] + fn randn_zero_std() { + let mut graph = Graph::empty(); + let _x: GraphTensor, f32, $dev> = + GraphTensor::, f32, $dev>::randn(&mut graph, PI, 0.0); + let compiled: CompiledGraph, f32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + assert_eq!(tensor.data().unwrap().to_vec(), vec![PI; 8]); + } + } + }; } + +test_for_device_rand!(Cpu, cpu_tests_rand); +#[cfg(feature = "cuda")] +test_for_device_rand!(Cuda<0>, cuda_tests_rand);