diff --git a/.cargo/config.toml b/.cargo/config.toml index 5464853..c7a111c 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,2 +1,17 @@ - [build] - incremental = false \ No newline at end of file +[target.x86_64-unknown-linux-gnu] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-feature=+f16c" +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-feature=+fp16" +] + +[target.wasm32-unknown-unknown] +rustflags = ["-C", "target-feature=+simd128"] + +[target.x86_64-apple-darwin] +rustflags = ["-C", "target-feature=-avx,-avx2"] \ No newline at end of file diff --git a/constensor-core/Cargo.toml b/constensor-core/Cargo.toml index 0588eb1..5f80b6c 100644 --- a/constensor-core/Cargo.toml +++ b/constensor-core/Cargo.toml @@ -22,7 +22,7 @@ rand.workspace = true rand_distr.workspace = true [features] -default = ["half", "bfloat"] +default = [] all = ["cuda", "half", "bfloat"] cuda = ["cudarc"] half = ["dep:half"] diff --git a/constensor-core/src/cuda_backend/error.rs b/constensor-core/src/cuda_backend/error.rs index 7d85f21..2970829 100644 --- a/constensor-core/src/cuda_backend/error.rs +++ b/constensor-core/src/cuda_backend/error.rs @@ -12,6 +12,9 @@ pub enum CudaError { #[error(transparent)] Cublas(#[from] cudarc::cublas::result::CublasError), + #[error(transparent)] + Curand(#[from] cudarc::curand::result::CurandError), + #[error("{cuda} when loading {module_name}")] Load { cuda: cudarc::driver::DriverError, diff --git a/constensor-core/src/cuda_backend/mod.rs b/constensor-core/src/cuda_backend/mod.rs index ebbf94e..bf62c8d 100644 --- a/constensor-core/src/cuda_backend/mod.rs +++ b/constensor-core/src/cuda_backend/mod.rs @@ -7,7 +7,10 @@ use cudarc::{ }; use error::WrapErr; use petgraph::{algo::toposort, prelude::DiGraphMap}; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, RwLock, +}; use std::{ borrow::Cow, collections::{HashMap, 
HashSet}, @@ -16,7 +19,6 @@ use std::{ marker::PhantomData, ops::Deref, path::{Path, PathBuf}, - sync::{Arc, RwLock}, }; use crate::{ @@ -29,6 +31,9 @@ use crate::{ pub(crate) mod error; pub(crate) mod util; +pub struct CudaRng(cudarc::curand::CudaRng); +unsafe impl Send for CudaRng {} + #[derive(Clone)] pub struct CudaDevice { context: Arc, @@ -123,6 +128,20 @@ pub enum CudaCompiledKernel { cublas: cudarc::cublas::CudaBlas, stream: Arc, }, + Rand { + rng: Arc>, + stream: Arc, + elem_count: usize, + order: usize, + }, + Randn { + mean: T, + std: T, + rng: Arc>, + stream: Arc, + elem_count: usize, + order: usize, + }, } #[derive(Debug)] @@ -190,7 +209,9 @@ fn handle_node( format!("( static_cast(fma(static_cast({a_name}), static_cast({b_name}), static_cast({c_name}))))") } Op::NoOp => unreachable!("no-op ops should never be reached."), - Op::MatMul { .. } => unreachable!("matmul op should have its own split!"), + Op::MatMul { .. } | Op::Rand | Op::Randn { .. } => { + unreachable!("op should have its own split!") + } } } @@ -366,8 +387,8 @@ impl BackendDevice for CudaDevice { dep_graph.add_edge(l_id.get(), idx, ()); dep_graph.add_edge(r_id.get(), idx, ()); } - // NoOp and Fill/Arange don’t create incoming edges - Op::NoOp | Op::Fill { .. } | Op::Arange { .. } => {} + // These don’t create incoming edges + Op::NoOp | Op::Fill { .. } | Op::Rand | Op::Randn { .. } | Op::Arange { .. 
} => {} } } @@ -429,6 +450,34 @@ impl BackendDevice for CudaDevice { stream, }); } + Op::Rand => { + let stream = self.select_stream(); + let curand = Arc::new(Mutex::new(CudaRng( + cudarc::curand::CudaRng::new(0, stream.clone()).w()?, + ))); + + matmuls.push(CudaCompiledKernel::Rand { + rng: curand, + stream, + elem_count: graph[idx].shape.iter().product(), + order: idx, + }); + } + Op::Randn { mean, std } => { + let stream = self.select_stream(); + let curand = Arc::new(Mutex::new(CudaRng( + cudarc::curand::CudaRng::new(0, stream.clone()).w()?, + ))); + + matmuls.push(CudaCompiledKernel::Randn { + mean: *mean, + std: *std, + rng: curand, + stream, + elem_count: graph[idx].shape.iter().product(), + order: idx, + }); + } _ => { let shape_key = graph[idx].shape.clone(); let should_group = if let Some((last_group, _)) = splits.last_mut() { @@ -546,6 +595,48 @@ impl BackendDevice for CudaDevice { }; last_storage.insert(order, storage); } + CudaCompiledKernel::Rand { + stream, + rng, + elem_count, + order, + } => { + let mut slice = unsafe { stream.alloc::(*elem_count).w()? }; + T::cuda_fill_with_uniform(&rng.lock().unwrap().0, &mut slice)?; + + // Record completion event for the MatMul result + let event = self.context.new_event(None).w()?; + event.record(stream).w()?; + + let storage = CudaStorage { + slice, + device: self.clone(), + event, + }; + last_storage.insert(order, storage); + } + CudaCompiledKernel::Randn { + mean, + std, + stream, + rng, + elem_count, + order, + } => { + let mut slice = unsafe { stream.alloc::(*elem_count).w()? 
}; + T::cuda_fill_with_normal(&rng.lock().unwrap().0, &mut slice, *mean, *std)?; + + // Record completion event for the normal random fill result + let event = self.context.new_event(None).w()?; + event.record(stream).w()?; + + let storage = CudaStorage { + slice, + device: self.clone(), + event, + }; + last_storage.insert(order, storage); + } } } diff --git a/constensor-core/src/dtype/gemm.rs b/constensor-core/src/dtype/gemm.rs index 57c22f1..b8870fb 100644 --- a/constensor-core/src/dtype/gemm.rs +++ b/constensor-core/src/dtype/gemm.rs @@ -323,6 +323,8 @@ instantiate_gemm!(i64, 0, SIMD); instantiate_gemm!(f32, 0., GEMM); instantiate_gemm!(f64, 0., GEMM); #[cfg(feature = "bfloat")] -instantiate_gemm!(bf16, bf16::from_f32(0.), SIMD); +// Use naive implementation for bf16 to avoid CPU SIMD half-precision assembly requirements +instantiate_gemm!(bf16, bf16::from_f32(0.), NAIVE); #[cfg(feature = "half")] -instantiate_gemm!(f16, f16::from_f32(0.), GEMM); +// Use naive implementation for f16 to avoid CPU SIMD half-precision assembly requirements +instantiate_gemm!(f16, f16::from_f32(0.), NAIVE); diff --git a/constensor-core/src/dtype/mod.rs b/constensor-core/src/dtype/mod.rs index 56bf6bd..bee9dc5 100644 --- a/constensor-core/src/dtype/mod.rs +++ b/constensor-core/src/dtype/mod.rs @@ -3,7 +3,6 @@ use std::{ ops::{Add, Div, Mul, Sub}, }; -use gemm::GemmDispatch; #[cfg(feature = "bfloat")] use half::bf16; #[cfg(feature = "half")] @@ -11,9 +10,13 @@ use half::f16; #[cfg(feature = "cuda")] use cudarc::driver::DeviceRepr; + +use gemm::GemmDispatch; +use rand::RandDispatch; use simd_ops::SimdSupported; mod gemm; +mod rand; mod simd_ops; /// Type which can be square-rooted. 
@@ -89,6 +92,7 @@ pub trait DTypeOps: + Sqrtable + SimdSupported + GemmDispatch + + RandDispatch { } diff --git a/constensor-core/src/dtype/rand.rs b/constensor-core/src/dtype/rand.rs new file mode 100644 index 0000000..605b70c --- /dev/null +++ b/constensor-core/src/dtype/rand.rs @@ -0,0 +1,187 @@ +#[cfg(feature = "cuda")] +use { + super::DType, + crate::{cuda_backend::error::WrapErr, Result}, + cudarc::{curand::CudaRng, driver::CudaSlice}, +}; +// Optional half-precision types +#[cfg(feature = "bfloat")] +use half::bf16; +#[cfg(feature = "half")] +use half::f16; + +/// Dispatch random fills based on the data type (CUDA backend). +pub trait RandDispatch { + /// Fill the slice with uniform random values on the GPU. + #[cfg(feature = "cuda")] + fn cuda_fill_with_uniform(rng: &CudaRng, slice: &mut CudaSlice) -> Result<()> + where + Self: Sized; + + /// Fill the slice with normal (Gaussian) random values on the GPU. + #[cfg(feature = "cuda")] + fn cuda_fill_with_normal( + rng: &CudaRng, + slice: &mut CudaSlice, + mean: Self, + std: Self, + ) -> Result<()> + where + Self: Sized; +} + +// f32: support both uniform and normal +#[cfg(feature = "cuda")] +impl RandDispatch for f32 { + fn cuda_fill_with_uniform(rng: &CudaRng, slice: &mut CudaSlice) -> Result<()> { + rng.fill_with_uniform(slice).w() + } + fn cuda_fill_with_normal( + rng: &CudaRng, + slice: &mut CudaSlice, + mean: Self, + std: Self, + ) -> Result<()> { + rng.fill_with_normal(slice, mean, std).w() + } +} + +// f64: support both uniform and normal +#[cfg(feature = "cuda")] +impl RandDispatch for f64 { + fn cuda_fill_with_uniform(rng: &CudaRng, slice: &mut CudaSlice) -> Result<()> { + rng.fill_with_uniform(slice).w() + } + fn cuda_fill_with_normal( + rng: &CudaRng, + slice: &mut CudaSlice, + mean: Self, + std: Self, + ) -> Result<()> { + rng.fill_with_normal(slice, mean, std).w() + } +} + +// u32: uniform only +#[cfg(feature = "cuda")] +impl RandDispatch for u32 { + fn cuda_fill_with_uniform(rng: &CudaRng, slice: 
&mut CudaSlice) -> Result<()> { + rng.fill_with_uniform(slice).w() + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} + +// All other integral or half types: unsupported +#[cfg(feature = "cuda")] +impl RandDispatch for u8 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(feature = "cuda")] +impl RandDispatch for i32 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(feature = "cuda")] +impl RandDispatch for i64 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(all(feature = "cuda", feature = "half"))] +impl RandDispatch for f16 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + 
_slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} +#[cfg(all(feature = "cuda", feature = "bfloat"))] +impl RandDispatch for bf16 { + fn cuda_fill_with_uniform(_rng: &CudaRng, _slice: &mut CudaSlice) -> Result<()> { + crate::bail!( + "Uniform random fill is not supported for dtype {}", + Self::C_NAME + ) + } + fn cuda_fill_with_normal( + _rng: &CudaRng, + _slice: &mut CudaSlice, + _mean: Self, + _std: Self, + ) -> Result<()> { + crate::bail!( + "Normal random fill is not supported for dtype {}", + Self::C_NAME + ) + } +} + +#[cfg(not(feature = "cuda"))] +impl RandDispatch for T {} diff --git a/constensor-core/src/tensor/graphtensor.rs b/constensor-core/src/tensor/graphtensor.rs index e13bd75..614a22f 100644 --- a/constensor-core/src/tensor/graphtensor.rs +++ b/constensor-core/src/tensor/graphtensor.rs @@ -5,7 +5,7 @@ use std::{ }; use crate::{ - device::{Cpu, Dev}, + device::Dev, graph::{BinaryOpType, Graph, GraphTensorId, Op, UnaryOpType}, DType, Shape, R1, R3, }; @@ -113,6 +113,29 @@ impl GraphTensor { _ghost: PhantomData, } } + #[must_use] + /// Create a tensor filled with uniform random values in [0,1). + pub fn rand(graph: &mut Graph) -> Self { + let id = graph.next_id(); + graph.add_op::(Op::Rand); + GraphTensor { + id, + graph: Arc::new(RwLock::new(graph.clone())), + _ghost: PhantomData, + } + } + + #[must_use] + /// Create a tensor filled with normally distributed random values (mean, std). + pub fn randn(graph: &mut Graph, mean: T, std: T) -> Self { + let id = graph.next_id(); + graph.add_op::(Op::Randn { mean, std }); + GraphTensor { + id, + graph: Arc::new(RwLock::new(graph.clone())), + _ghost: PhantomData, + } + } } impl GraphTensor { @@ -146,33 +169,6 @@ impl GraphTensor, T, D> { } } -// CPU-only random initializations -impl GraphTensor { - #[must_use] - /// Create a tensor filled with uniform random values in [0,1). 
- pub fn rand(graph: &mut Graph) -> Self { - let id = graph.next_id(); - graph.add_op::(Op::Rand); - GraphTensor { - id, - graph: Arc::new(RwLock::new(graph.clone())), - _ghost: PhantomData, - } - } - - #[must_use] - /// Create a tensor filled with normally distributed random values (mean, std). - pub fn randn(graph: &mut Graph, mean: T, std: T) -> Self { - let id = graph.next_id(); - graph.add_op::(Op::Randn { mean, std }); - GraphTensor { - id, - graph: Arc::new(RwLock::new(graph.clone())), - _ghost: PhantomData, - } - } -} - macro_rules! graphtensor_binop { ($trait:ident, $fn_name:ident) => { impl $trait for GraphTensor { diff --git a/constensor-core/tests/ops.rs b/constensor-core/tests/ops.rs index 8674f7a..aaf9f03 100644 --- a/constensor-core/tests/ops.rs +++ b/constensor-core/tests/ops.rs @@ -370,23 +370,36 @@ test_for_device_sqrt!(Cpu, cpu_tests_sqrt); #[cfg(feature = "cuda")] test_for_device_sqrt!(Cuda<0>, cuda_tests_sqrt); -#[test] -fn cpu_rand_uniform() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f32, Cpu>::rand(&mut graph); - let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - let data = tensor.data().unwrap().to_vec(); - for &v in &data { - assert!((0.0..1.0).contains(&v), "value {v} out of [0,1)"); - } -} +macro_rules! 
test_for_device_rand { + ($dev:ty, $name:ident) => { + mod $name { + use super::*; + #[test] + fn rand_uniform() { + let mut graph = Graph::empty(); + let _x: GraphTensor, f32, $dev> = + GraphTensor::, f32, $dev>::rand(&mut graph); + let compiled: CompiledGraph, f32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let data = tensor.data().unwrap().to_vec(); + for &v in &data { + assert!((0.0..1.0).contains(&v), "value {v} out of [0,1)"); + } + } -#[test] -fn cpu_randn_zero_std() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f32, Cpu>::randn(&mut graph, PI, 0.0); - let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - assert_eq!(tensor.data().unwrap().to_vec(), vec![PI; 5]); + #[test] + fn randn_zero_std() { + let mut graph = Graph::empty(); + let _x: GraphTensor, f32, $dev> = + GraphTensor::, f32, $dev>::randn(&mut graph, PI, 0.0); + let compiled: CompiledGraph, f32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + assert_eq!(tensor.data().unwrap().to_vec(), vec![PI; 8]); + } + } + }; } + +test_for_device_rand!(Cpu, cpu_tests_rand); +#[cfg(feature = "cuda")] +test_for_device_rand!(Cuda<0>, cuda_tests_rand);