20 changes: 18 additions & 2 deletions .cargo/config.toml
@@ -1,2 +1,18 @@
[build]
incremental = false
[target.x86_64-unknown-linux-gnu]
rustflags = [
"-C", "target-cpu=native",
"-C", "target-feature=+fp16"
]

[target.aarch64-apple-darwin]
rustflags = [
"-C", "target-cpu=native",
"-C", "target-feature=+fp16"
]

[target.wasm32-unknown-unknown]
rustflags = ["-C", "target-feature=+simd128"]

[target.x86_64-apple-darwin]
rustflags = ["-C", "target-feature=-avx,-avx2"]
2 changes: 1 addition & 1 deletion constensor-core/Cargo.toml
@@ -22,7 +22,7 @@ rand.workspace = true
rand_distr.workspace = true

[features]
default = ["half", "bfloat"]
default = []
all = ["cuda", "half", "bfloat"]
cuda = ["cudarc"]
half = ["dep:half"]
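With `half` and `bfloat` dropped from the default feature set, downstream users must opt in explicitly, e.g. `cargo build --features half,bfloat`, or `--features all` to additionally enable CUDA support.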
3 changes: 3 additions & 0 deletions constensor-core/src/cuda_backend/error.rs
@@ -12,6 +12,9 @@ pub enum CudaError {
#[error(transparent)]
Cublas(#[from] cudarc::cublas::result::CublasError),

#[error(transparent)]
Curand(#[from] cudarc::curand::result::CurandError),

#[error("{cuda} when loading {module_name}")]
Load {
cuda: cudarc::driver::DriverError,
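Because the new `Curand` variant is marked `#[error(transparent)]` with `#[from]`, cuRAND failures convert into `CudaError` automatically. A minimal sketch of what that enables at a call site (the `CudaRng::new` signature matches its uses in `cuda_backend/mod.rs`):

// Sketch: #[from] derives From<CurandError> for CudaError, so `?` converts it.
use std::sync::Arc;

fn make_rng(stream: Arc<cudarc::driver::CudaStream>) -> Result<cudarc::curand::CudaRng, CudaError> {
    let rng = cudarc::curand::CudaRng::new(0, stream)?; // CurandError -> CudaError
    Ok(rng)
}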
101 changes: 96 additions & 5 deletions constensor-core/src/cuda_backend/mod.rs
@@ -7,7 +7,10 @@ use cudarc::{
};
use error::WrapErr;
use petgraph::{algo::toposort, prelude::DiGraphMap};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex, RwLock,
};
use std::{
borrow::Cow,
collections::{HashMap, HashSet},
@@ -16,7 +19,6 @@ use std::{
marker::PhantomData,
ops::Deref,
path::{Path, PathBuf},
sync::{Arc, RwLock},
};

use crate::{
@@ -29,6 +31,9 @@ use crate::{
pub(crate) mod error;
pub(crate) mod util;

pub struct CudaRng(cudarc::curand::CudaRng);
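// SAFETY (assumption based on usage below): the wrapped generator is only
// ever accessed behind Arc<Mutex<CudaRng>>, which serializes cross-thread use.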
unsafe impl Send for CudaRng {}

#[derive(Clone)]
pub struct CudaDevice {
context: Arc<cudarc::driver::CudaContext>,
@@ -123,6 +128,20 @@ pub enum CudaCompiledKernel<T: DType> {
cublas: cudarc::cublas::CudaBlas,
stream: Arc<CudaStream>,
},
Rand {
rng: Arc<Mutex<CudaRng>>,
stream: Arc<CudaStream>,
elem_count: usize,
order: usize,
},
Randn {
mean: T,
std: T,
rng: Arc<Mutex<CudaRng>>,
stream: Arc<CudaStream>,
elem_count: usize,
order: usize,
},
}

#[derive(Debug)]
@@ -190,7 +209,9 @@ fn handle_node<T: DType>(
format!("( static_cast<T>(fma(static_cast<double>({a_name}), static_cast<double>({b_name}), static_cast<double>({c_name}))))")
}
Op::NoOp => unreachable!("no-op ops should never be reached."),
Op::MatMul { .. } => unreachable!("matmul op should have its own split!"),
Op::MatMul { .. } | Op::Rand | Op::Randn { .. } => {
unreachable!("op should have its own split!")
}
}
}

@@ -366,8 +387,8 @@ impl BackendDevice for CudaDevice {
dep_graph.add_edge(l_id.get(), idx, ());
dep_graph.add_edge(r_id.get(), idx, ());
}
// NoOp and Fill/Arange don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {}
// These don’t create incoming edges
Op::NoOp | Op::Fill { .. } | Op::Rand | Op::Randn { .. } | Op::Arange { .. } => {}
}
}

@@ -429,6 +450,34 @@ impl BackendDevice for CudaDevice {
stream,
});
}
Op::Rand => {
let stream = self.select_stream();
let curand = Arc::new(Mutex::new(CudaRng(
cudarc::curand::CudaRng::new(0, stream.clone()).w()?,
)));

matmuls.push(CudaCompiledKernel::Rand {
rng: curand,
stream,
elem_count: graph[idx].shape.iter().product(),
order: idx,
});
}
Op::Randn { mean, std } => {
let stream = self.select_stream();
let curand = Arc::new(Mutex::new(CudaRng(
cudarc::curand::CudaRng::new(0, stream.clone()).w()?,
)));

matmuls.push(CudaCompiledKernel::Randn {
mean: *mean,
std: *std,
rng: curand,
stream,
elem_count: graph[idx].shape.iter().product(),
order: idx,
});
}
_ => {
let shape_key = graph[idx].shape.clone();
let should_group = if let Some((last_group, _)) = splits.last_mut() {
@@ -546,6 +595,48 @@
};
last_storage.insert(order, storage);
}
CudaCompiledKernel::Rand {
stream,
rng,
elem_count,
order,
} => {
let mut slice = unsafe { stream.alloc::<T>(*elem_count).w()? };
T::cuda_fill_with_uniform(&rng.lock().unwrap().0, &mut slice)?;

// Record completion event for the Rand result
let event = self.context.new_event(None).w()?;
event.record(stream).w()?;

let storage = CudaStorage {
slice,
device: self.clone(),
event,
};
last_storage.insert(order, storage);
}
CudaCompiledKernel::Randn {
mean,
std,
stream,
rng,
elem_count,
order,
} => {
let mut slice = unsafe { stream.alloc::<T>(*elem_count).w()? };
T::cuda_fill_with_normal(&rng.lock().unwrap().0, &mut slice, *mean, *std)?;

// Record completion event for the Randn result
let event = self.context.new_event(None).w()?;
event.record(stream).w()?;

let storage = CudaStorage {
slice,
device: self.clone(),
event,
};
last_storage.insert(order, storage);
}
}
}

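The `Rand` and `Randn` kernels above allocate an output buffer and fill it through `T::cuda_fill_with_uniform` / `T::cuda_fill_with_normal`; each op constructs its own generator with a constant seed of 0. Those fill hooks come from the new `RandDispatch` bound (see `dtype/mod.rs` below). The `dtype/rand.rs` module itself is not part of this diff, so the following is only a hedged sketch, with signatures inferred from the call sites above and cudarc's cuRAND wrappers:

// Hedged sketch — not the crate's actual rand.rs. Signatures are inferred
// from the call sites above; error plumbing assumes CudaError: From<CurandError>.
use cudarc::{curand::CudaRng as CurandRng, driver::CudaSlice};

pub trait RandDispatch: Sized {
    fn cuda_fill_with_uniform(rng: &CurandRng, slice: &mut CudaSlice<Self>) -> Result<(), CudaError>;
    fn cuda_fill_with_normal(
        rng: &CurandRng,
        slice: &mut CudaSlice<Self>,
        mean: Self,
        std: Self,
    ) -> Result<(), CudaError>;
}

impl RandDispatch for f32 {
    fn cuda_fill_with_uniform(rng: &CurandRng, slice: &mut CudaSlice<f32>) -> Result<(), CudaError> {
        // cudarc exposes uniform fills directly for f32/f64.
        Ok(rng.fill_with_uniform(slice)?)
    }
    fn cuda_fill_with_normal(
        rng: &CurandRng,
        slice: &mut CudaSlice<f32>,
        mean: f32,
        std: f32,
    ) -> Result<(), CudaError> {
        Ok(rng.fill_with_normal(slice, mean, std)?)
    }
}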
6 changes: 4 additions & 2 deletions constensor-core/src/dtype/gemm.rs
@@ -323,6 +323,8 @@ instantiate_gemm!(i64, 0, SIMD);
instantiate_gemm!(f32, 0., GEMM);
instantiate_gemm!(f64, 0., GEMM);
#[cfg(feature = "bfloat")]
instantiate_gemm!(bf16, bf16::from_f32(0.), SIMD);
// Use naive implementation for bf16 to avoid CPU SIMD half-precision assembly requirements
instantiate_gemm!(bf16, bf16::from_f32(0.), NAIVE);
#[cfg(feature = "half")]
instantiate_gemm!(f16, f16::from_f32(0.), GEMM);
// Use naive implementation for f16 to avoid CPU SIMD half-precision assembly requirements
instantiate_gemm!(f16, f16::from_f32(0.), NAIVE);
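Both half-precision types now take the NAIVE dispatch path, trading speed for portability: the SIMD/GEMM paths can emit half-precision assembly that not every CPU toolchain supports. Illustratively, a naive path boils down to a textbook triple loop of this shape (a sketch, not the crate's actual kernel):

// Illustrative sketch of a naive row-major GEMM: c = a * b with
// a: m x k, b: k x n, c: m x n.
fn naive_gemm(m: usize, n: usize, k: usize, a: &[f32], b: &[f32], c: &mut [f32]) {
    for i in 0..m {
        for j in 0..n {
            let mut acc = 0.0f32;
            for p in 0..k {
                acc += a[i * k + p] * b[p * n + j];
            }
            c[i * n + j] = acc;
        }
    }
}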
6 changes: 5 additions & 1 deletion constensor-core/src/dtype/mod.rs
@@ -3,17 +3,20 @@ use std::{
ops::{Add, Div, Mul, Sub},
};

use gemm::GemmDispatch;
#[cfg(feature = "bfloat")]
use half::bf16;
#[cfg(feature = "half")]
use half::f16;

#[cfg(feature = "cuda")]
use cudarc::driver::DeviceRepr;

use gemm::GemmDispatch;
use rand::RandDispatch;
use simd_ops::SimdSupported;

mod gemm;
mod rand;
mod simd_ops;

/// Type which can be square-rooted.
@@ -89,6 +92,7 @@ pub trait DTypeOps:
+ Sqrtable
+ SimdSupported
+ GemmDispatch
+ RandDispatch
{
}

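Adding `RandDispatch` to the `DTypeOps` supertrait list makes the random-fill hooks part of the contract for every supported dtype, which is what lets the CUDA backend call `T::cuda_fill_with_uniform` and `T::cuda_fill_with_normal` generically in the `Rand` and `Randn` kernels above.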