From 502e6e92801de081a9b18d2a9f79a8b2f483b60b Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 08:20:19 -0400 Subject: [PATCH 1/9] Tensor casting for cpu --- constensor-core/src/cpu_storage/mod.rs | 8 ++++++-- constensor-core/src/storage.rs | 9 +++++++++ constensor-core/src/tensor/concretetensor.rs | 9 +++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/constensor-core/src/cpu_storage/mod.rs b/constensor-core/src/cpu_storage/mod.rs index 68111b8..ef03b9f 100644 --- a/constensor-core/src/cpu_storage/mod.rs +++ b/constensor-core/src/cpu_storage/mod.rs @@ -8,6 +8,7 @@ use pool::{BufferPool, PooledBuffer}; use rayon::iter::{IndexedParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; use crate::device::Dev; +use crate::storage::Storage; use crate::Shape; use crate::{ storage::{BackendDevice, BackendStorage}, @@ -26,8 +27,11 @@ pub struct CpuStorage(pub(crate) Vec); impl BackendStorage for CpuStorage { fn to_cpu_storage(&self) -> Result>> { - // Note: copying all data here. - Ok(Cow::Owned(self.clone())) + Ok(Cow::Borrowed(&self)) + } + fn cast(&self) -> Result> { + let new = self.0.iter().map(|x| U::from_f64(x.to_f64())); + Ok(Storage::Cpu(CpuStorage(new.collect()))) } } diff --git a/constensor-core/src/storage.rs b/constensor-core/src/storage.rs index 8394207..d274f07 100644 --- a/constensor-core/src/storage.rs +++ b/constensor-core/src/storage.rs @@ -18,10 +18,19 @@ impl Storage { Self::Cuda(cuda) => cuda.to_cpu_storage(), } } + + pub(crate) fn cast(&self) -> Result> { + match self { + Self::Cpu(cpu) => cpu.cast::(), + #[cfg(feature = "cuda")] + Self::Cuda(cuda) => cuda.cast::(), + } + } } pub trait BackendStorage { fn to_cpu_storage(&self) -> Result>>; + fn cast(&self) -> Result>; } pub trait BackendDevice { diff --git a/constensor-core/src/tensor/concretetensor.rs b/constensor-core/src/tensor/concretetensor.rs index 5d92107..3b07361 100644 --- a/constensor-core/src/tensor/concretetensor.rs +++ b/constensor-core/src/tensor/concretetensor.rs @@ -85,6 +85,15 @@ tensor_api!(Cpu); #[cfg(feature = "cuda")] tensor_api!(Cuda<0>); +impl Tensor { + /// Cast this tensor to a different dtype `U` on the CPU. + pub fn cast(&self) -> Result> { + // retrieve data from storage as owned Vec + let storage = self.storage.cast::()?; + Ok(from_storage::(Arc::new(storage))) + } +} + /*macro_rules! binary_op { ($trait:ident, $fn:ident) => { impl $trait for Tensor { From c229a773dd6b1268dfef07b62a628c8326d682dc Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 08:37:06 -0400 Subject: [PATCH 2/9] Add cast tests --- constensor-core/tests/cast.rs | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 constensor-core/tests/cast.rs diff --git a/constensor-core/tests/cast.rs b/constensor-core/tests/cast.rs new file mode 100644 index 0000000..80752db --- /dev/null +++ b/constensor-core/tests/cast.rs @@ -0,0 +1,50 @@ +use constensor_core::{Graph, GraphTensor, CompiledGraph, Cpu, R1, R2, R3}; + +// Test casting a 1D tensor from f32 to f64 +#[test] +fn cast_f32_to_f64_1d() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, f32, Cpu>::fill(&mut graph, 1.5); + let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + assert_eq!(data, vec![1.5_f64; 4]); +} + +// Test casting a 2D tensor from f64 to f32 +#[test] +fn cast_f64_to_f32_2d() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, f64, Cpu>::fill(&mut graph, 2.75); + let compiled: CompiledGraph, f64, Cpu> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + assert_eq!(data, vec![vec![2.75_f32; 3]; 2]); +} + +// Test casting a 3D tensor from i32 to f32 +#[test] +fn cast_i32_to_f32_3d() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, i32, Cpu>::fill(&mut graph, 7); + let compiled: CompiledGraph, i32, Cpu> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + let expected = vec![vec![vec![7.0_f32; 3]; 2]; 1]; + assert_eq!(data, expected); +} + +// Test casting from f32 to i32 truncates toward zero +#[test] +fn cast_f32_to_i32_truncate() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, f32, Cpu>::fill(&mut graph, 1.9); + let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + assert_eq!(data, vec![1_i32; 3]); +} \ No newline at end of file From 86243116e0b089172f64bff15e143c3472a7e2f9 Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 12:38:19 +0000 Subject: [PATCH 3/9] Add cuda kernel --- constensor-core/src/cuda_backend/mod.rs | 72 ++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/constensor-core/src/cuda_backend/mod.rs b/constensor-core/src/cuda_backend/mod.rs index bf62c8d..7713b6e 100644 --- a/constensor-core/src/cuda_backend/mod.rs +++ b/constensor-core/src/cuda_backend/mod.rs @@ -24,7 +24,7 @@ use std::{ use crate::{ cpu_storage::CpuStorage, device::Dev, - storage::{BackendDevice, BackendStorage}, + storage::{BackendDevice, BackendStorage, Storage}, CompiledGraph, DType, GraphNode, Op, Result, Shape, }; @@ -100,6 +100,76 @@ impl BackendStorage for CudaStorage { let data = self.device.stream().memcpy_dtov(&self.slice).w()?; Ok(Cow::Owned(CpuStorage(data))) } + fn cast(&self) -> Result> { + let function_name = format!("cast_{}_to_{}", T::NAME, U::NAME); + + let template_kernel = format!( + r#" + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + typedef long long int int64_t; + {} + {} + + template + __device__ void cast_kernel(T *in, U *out, const size_t numel) {{ + for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; + i += blockDim.x * gridDim.x) {{ + out[i] = static_cast(in[i]); + }} + }} + + extern "C" __global__ void {function_name}({} *in, {} *out, const size_t numel) {{ + {function_name}_kernel(buf, numel); + }} + + "#, + T::C_DEP.unwrap_or(""), + U::C_DEP.unwrap_or(""), + T::C_NAME, + U::C_NAME, + ); + + // Always recompile PTX to avoid using stale cached files + let ptx = compile_ptx(template_kernel.clone())?; + + let ptx_str = ptx.to_src(); + if let Some(home) = dirs::home_dir() { + let path = format!( + "{}/.cache/constensor/ptx/{function_name}.ptx", + home.display() + ); + let path = Path::new(&path); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + fs::write(path, ptx_str)?; + } + + let stream = self.device.select_stream(); + let n_elems = self.slice.len(); + + let out = unsafe { stream.alloc::(n_elems) }.w()?; + + let func = self.device.load_func(&function_name, ptx)?; + + let cfg = LaunchConfig::for_num_elems(n_elems as u32); + + let mut builder = stream.launch_builder(&func); + builder.arg(&self.slice); + builder.arg(&out); + unsafe { builder.launch(cfg).w()? }; + + // Record an event once this kernel completes + let event = self.device.context.new_event(None).w()?; + event.record(&stream).w()?; + + Ok(Storage::Cuda(CudaStorage { + slice: out, + device: self.device.clone(), + event, + })) + } } pub enum CudaCompiledKernel { From 811fd1c884b47fc4a50b8fe6d38847f2210c96ec Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 12:54:15 +0000 Subject: [PATCH 4/9] Add the cuda tests --- constensor-core/src/cuda_backend/mod.rs | 7 +- constensor-core/tests/cast.rs | 103 +++++++++++++----------- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/constensor-core/src/cuda_backend/mod.rs b/constensor-core/src/cuda_backend/mod.rs index 7713b6e..41189ce 100644 --- a/constensor-core/src/cuda_backend/mod.rs +++ b/constensor-core/src/cuda_backend/mod.rs @@ -112,7 +112,7 @@ impl BackendStorage for CudaStorage { {} template - __device__ void cast_kernel(T *in, U *out, const size_t numel) {{ + __device__ void {function_name}_kernel(T *in, U *out, const size_t numel) {{ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) {{ out[i] = static_cast(in[i]); @@ -120,7 +120,7 @@ impl BackendStorage for CudaStorage { }} extern "C" __global__ void {function_name}({} *in, {} *out, const size_t numel) {{ - {function_name}_kernel(buf, numel); + {function_name}_kernel(in, out, numel); }} "#, @@ -158,6 +158,7 @@ impl BackendStorage for CudaStorage { let mut builder = stream.launch_builder(&func); builder.arg(&self.slice); builder.arg(&out); + builder.arg(&n_elems); unsafe { builder.launch(cfg).w()? }; // Record an event once this kernel completes @@ -320,6 +321,7 @@ fn cuda_include_dir() -> Option { fn compile_ptx(template_kernel: String) -> Result { cudarc::nvrtc::compile_ptx_with_opts( template_kernel, + // Compile PTX without hardcoding an architecture so it can JIT to the current device CompileOptions { use_fast_math: Some(true), include_paths: vec![cuda_include_dir() @@ -327,7 +329,6 @@ fn compile_ptx(template_kernel: String) -> Result { .join("include") .display() .to_string()], - arch: Some("sm_90"), ..Default::default() }, ) diff --git a/constensor-core/tests/cast.rs b/constensor-core/tests/cast.rs index 80752db..e7a6234 100644 --- a/constensor-core/tests/cast.rs +++ b/constensor-core/tests/cast.rs @@ -1,50 +1,63 @@ -use constensor_core::{Graph, GraphTensor, CompiledGraph, Cpu, R1, R2, R3}; +#[cfg(feature = "cuda")] +use constensor_core::Cuda; +use constensor_core::{CompiledGraph, Cpu, Graph, GraphTensor, R1, R2, R3}; -// Test casting a 1D tensor from f32 to f64 -#[test] -fn cast_f32_to_f64_1d() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f32, Cpu>::fill(&mut graph, 1.5); - let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - let casted = tensor.cast::().unwrap(); - let data = casted.data().unwrap().into_owned(); - assert_eq!(data, vec![1.5_f64; 4]); -} +macro_rules! test_for_device_cast { + ($dev:ty, $name:ident) => { + mod $name { + use super::*; -// Test casting a 2D tensor from f64 to f32 -#[test] -fn cast_f64_to_f32_2d() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f64, Cpu>::fill(&mut graph, 2.75); - let compiled: CompiledGraph, f64, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - let casted = tensor.cast::().unwrap(); - let data = casted.data().unwrap().into_owned(); - assert_eq!(data, vec![vec![2.75_f32; 3]; 2]); -} + // Test casting a 1D tensor from f32 to f64 + #[test] + fn cast_f32_to_f64_1d() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, f32, $dev>::fill(&mut graph, 1.5); + let compiled: CompiledGraph, f32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + assert_eq!(data, vec![1.5_f64; 4]); + } + + // Test casting a 2D tensor from f64 to f32 + #[test] + fn cast_f64_to_f32_2d() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, f64, $dev>::fill(&mut graph, 2.75); + let compiled: CompiledGraph, f64, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + assert_eq!(data, vec![vec![2.75_f32; 3]; 2]); + } + + // Test casting a 3D tensor from i32 to f32 + #[test] + fn cast_i32_to_f32_3d() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, i32, $dev>::fill(&mut graph, 7); + let compiled: CompiledGraph, i32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + let expected = vec![vec![vec![7.0_f32; 3]; 2]; 1]; + assert_eq!(data, expected); + } -// Test casting a 3D tensor from i32 to f32 -#[test] -fn cast_i32_to_f32_3d() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, i32, Cpu>::fill(&mut graph, 7); - let compiled: CompiledGraph, i32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - let casted = tensor.cast::().unwrap(); - let data = casted.data().unwrap().into_owned(); - let expected = vec![vec![vec![7.0_f32; 3]; 2]; 1]; - assert_eq!(data, expected); + // Test casting from f32 to i32 truncates toward zero + #[test] + fn cast_f32_to_i32_truncate() { + let mut graph = Graph::empty(); + let _x = GraphTensor::, f32, $dev>::fill(&mut graph, 1.9); + let compiled: CompiledGraph, f32, $dev> = graph.compile().unwrap(); + let tensor = compiled.run().unwrap(); + let casted = tensor.cast::().unwrap(); + let data = casted.data().unwrap().into_owned(); + assert_eq!(data, vec![1_i32; 3]); + } + } + }; } -// Test casting from f32 to i32 truncates toward zero -#[test] -fn cast_f32_to_i32_truncate() { - let mut graph = Graph::empty(); - let _x = GraphTensor::, f32, Cpu>::fill(&mut graph, 1.9); - let compiled: CompiledGraph, f32, Cpu> = graph.compile().unwrap(); - let tensor = compiled.run().unwrap(); - let casted = tensor.cast::().unwrap(); - let data = casted.data().unwrap().into_owned(); - assert_eq!(data, vec![1_i32; 3]); -} \ No newline at end of file +test_for_device_cast!(Cpu, cpu_tests_cast); +test_for_device_cast!(Cuda<0>, cuda_tests_cast); From addece412fdf8fdd0ab78f8581446077263644c8 Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 13:02:57 +0000 Subject: [PATCH 5/9] Kernel caching --- constensor-core/src/cpu_storage/mod.rs | 2 +- constensor-core/src/cuda_backend/mod.rs | 40 ++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/constensor-core/src/cpu_storage/mod.rs b/constensor-core/src/cpu_storage/mod.rs index ef03b9f..d94e9ce 100644 --- a/constensor-core/src/cpu_storage/mod.rs +++ b/constensor-core/src/cpu_storage/mod.rs @@ -27,7 +27,7 @@ pub struct CpuStorage(pub(crate) Vec); impl BackendStorage for CpuStorage { fn to_cpu_storage(&self) -> Result>> { - Ok(Cow::Borrowed(&self)) + Ok(Cow::Borrowed(self)) } fn cast(&self) -> Result> { let new = self.0.iter().map(|x| U::from_f64(x.to_f64())); diff --git a/constensor-core/src/cuda_backend/mod.rs b/constensor-core/src/cuda_backend/mod.rs index 41189ce..cae4205 100644 --- a/constensor-core/src/cuda_backend/mod.rs +++ b/constensor-core/src/cuda_backend/mod.rs @@ -13,7 +13,7 @@ use std::sync::{ }; use std::{ borrow::Cow, - collections::{HashMap, HashSet}, + collections::{HashMap, HashSet, VecDeque}, fs, hash::{DefaultHasher, Hash, Hasher}, marker::PhantomData, @@ -38,11 +38,14 @@ unsafe impl Send for CudaRng {} pub struct CudaDevice { context: Arc, stream: Arc, - modules: Arc>>>, + modules: Arc>>>, + module_cache_order: Arc>>, streams: Arc>>, stream_index: Arc, } +const MAX_CACHED_KERNELS: usize = 128; + impl CudaDevice { pub(crate) fn new(ordinal: usize) -> Result { let context = cudarc::driver::CudaContext::new(ordinal).w()?; @@ -57,7 +60,8 @@ impl CudaDevice { Ok(Self { context, stream, - modules: Arc::new(RwLock::new(vec![])), + modules: Arc::new(RwLock::new(HashMap::new())), + module_cache_order: Arc::new(Mutex::new(VecDeque::new())), streams, stream_index, }) @@ -74,9 +78,29 @@ impl CudaDevice { } pub(crate) fn load_func(&self, function_name: &str, ptx: Ptx) -> Result { + // If we've already loaded this kernel, skip reloading + { + let modules_read = self.modules.read().unwrap(); + if let Some(module) = modules_read.get(function_name) { + return module.load_function(function_name).w(); + } + } + + // Otherwise compile and load let module = self.context.load_module(ptx).w()?; let func = module.load_function(function_name).w()?; - self.modules.write().unwrap().push(module); + // Insert into cache and cap size + { + let mut modules_write = self.modules.write().unwrap(); + let mut order = self.module_cache_order.lock().unwrap(); + modules_write.insert(function_name.to_string(), module.clone()); + order.push_back(function_name.to_string()); + if order.len() > MAX_CACHED_KERNELS { + if let Some(old) = order.pop_front() { + modules_write.remove(&old); + } + } + } Ok(func) } } @@ -375,6 +399,14 @@ impl CudaDevice { header.hash(&mut hasher); let function_name = format!("jit_kernel_{}_{}", hasher.finish(), T::NAME); + // If we've already compiled this kernel, skip PTX compilation + if let Some(module) = self.modules.read().unwrap().get(&function_name) { + let func = module.load_function(&function_name).w()?; + let n_elems: usize = shape.iter().product(); + let data = unsafe { self.stream.alloc::(n_elems) }.w()?; + return Ok((func, data)); + } + let template_kernel = format!( r#" typedef unsigned char uint8_t; From d96aabbb402355bfa9f1b33a832dcf7549f6919e Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 13:04:21 +0000 Subject: [PATCH 6/9] Fix --- constensor-core/tests/cast.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/constensor-core/tests/cast.rs b/constensor-core/tests/cast.rs index e7a6234..b17a3fd 100644 --- a/constensor-core/tests/cast.rs +++ b/constensor-core/tests/cast.rs @@ -60,4 +60,5 @@ macro_rules! test_for_device_cast { } test_for_device_cast!(Cpu, cpu_tests_cast); +#[cfg(feature = "cuda")] test_for_device_cast!(Cuda<0>, cuda_tests_cast); From 2cc4b0a2863077590b63fdf4cc63220e8c34e70b Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 13:05:24 +0000 Subject: [PATCH 7/9] Tweak target features --- .cargo/config.toml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index c7a111c..4abaaa8 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,15 +1,9 @@ [target.x86_64-unknown-linux-gnu] -rustflags = [ - "-C", "target-cpu=native", - "-C", "target-feature=+fp16" -] +rustflags = ["-C", "target-cpu=native"] [target.aarch64-apple-darwin] [build] -rustflags = [ - "-C", "target-cpu=native", - "-C", "target-feature=+fp16" -] +rustflags = ["-C", "target-cpu=native"] [target.wasm32-unknown-unknown] rustflags = ["-C", "target-feature=+simd128"] From 6437700308e821519627c7db5ed87aadbb7a680c Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 13:07:35 +0000 Subject: [PATCH 8/9] Tweak target features --- .cargo/config.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 4abaaa8..33c100b 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -3,7 +3,10 @@ rustflags = ["-C", "target-cpu=native"] [target.aarch64-apple-darwin] [build] -rustflags = ["-C", "target-cpu=native"] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-cpu=+f16" +] [target.wasm32-unknown-unknown] rustflags = ["-C", "target-feature=+simd128"] From fe731e6883d931790dac1da0334ce13a5ebb068f Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Fri, 25 Apr 2025 13:09:01 +0000 Subject: [PATCH 9/9] Tweak target features --- .cargo/config.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 33c100b..c7a111c 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,11 +1,14 @@ [target.x86_64-unknown-linux-gnu] -rustflags = ["-C", "target-cpu=native"] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-feature=+fp16" +] [target.aarch64-apple-darwin] [build] rustflags = [ "-C", "target-cpu=native", - "-C", "target-cpu=+f16" + "-C", "target-feature=+fp16" ] [target.wasm32-unknown-unknown]