Skip to content

Commit 100a02b

Browse files
committed
ROCm WIP
1 parent 62a015d commit 100a02b

File tree

12 files changed

+728
-13
lines changed

12 files changed

+728
-13
lines changed

Cargo.lock

Lines changed: 1 addition & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Dockerfile-farmer

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,39 @@ RUN \
4747
curl -OL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/$CUDA_ARCH/cuda-ubuntu2004.pin && \
4848
mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
4949
apt-get update && \
50-
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cuda-minimal-build-12-4
50+
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cuda-minimal-build-12-4 && \
51+
echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
52+
ldconfig
5153

54+
# ROCm is only used on x86-64 since ROCm packages are not available for other architectures
55+
ARG ROCM_VERSION=6.2
5256
RUN \
53-
export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} && \
54-
export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} && \
57+
if [ $(uname -p) = "x86_64" ]; then \
58+
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gpg && \
59+
mkdir -p --mode=0755 /etc/apt/keyrings && \
60+
curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
61+
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
62+
echo "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" > /etc/apt/preferences.d/rocm-pin-600 && \
63+
apt-get update && \
64+
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-hip-runtime-dev && \
65+
echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf && \
66+
ldconfig \
67+
; fi
68+
69+
# TODO: Remove `NVCC=off` hack once `sppark` has proper features for CUDA and ROCm
70+
# ROCm is only used on x86-64 since ROCm packages are not available for other architectures
71+
RUN \
72+
export PATH=/usr/local/cuda/bin:/opt/rocm-$ROCM_VERSION/bin${PATH:+:${PATH}} && \
73+
if [ $(uname -p) = "x86_64" ]; then \
74+
NVCC=off /root/.cargo/bin/cargo -Zgitoxide -Zgit build \
75+
--locked \
76+
-Z build-std \
77+
--profile $PROFILE \
78+
--bin subspace-farmer \
79+
--features rocm \
80+
--target $(uname -p)-unknown-linux-gnu && \
81+
mv target/*/*/subspace-farmer subspace-farmer-rocm \
82+
; fi && \
5583
/root/.cargo/bin/cargo -Zgitoxide -Zgit build \
5684
--locked \
5785
-Z build-std \
@@ -64,7 +92,26 @@ RUN \
6492

6593
FROM ubuntu:20.04
6694

67-
COPY --from=0 /code/subspace-farmer /subspace-farmer
95+
# Next block is for ROCm support
96+
# ROCm is only used on x86-64 since ROCm packages are not available for other architectures
97+
ARG ROCM_VERSION=6.2
98+
RUN \
99+
if [ $(uname -p) = "x86_64" ]; then \
100+
apt-get update && \
101+
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends curl ca-certificates gpg && \
102+
mkdir -p --mode=0755 /etc/apt/keyrings && \
103+
curl -L https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
104+
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION focal main" > /etc/apt/sources.list.d/rocm.list && \
105+
echo "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" > /etc/apt/preferences.d/rocm-pin-600 && \
106+
apt-get update && \
107+
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends hip-runtime-amd && \
108+
DEBIAN_FRONTEND=noninteractive apt-get remove -y --purge --autoremove curl ca-certificates gpg && \
109+
rm -rf /var/lib/apt/lists/* && \
110+
echo "/opt/rocm/lib" >> /etc/ld.so.conf.d/rocm.conf && \
111+
ldconfig \
112+
; fi
113+
114+
COPY --from=0 /code/subspace-farmer* /
68115

69116
RUN mkdir /var/subspace && chown nobody:nogroup /var/subspace
70117

crates/subspace-farmer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ cluster = ["dep:async-nats"]
7878
numa = ["dep:hwlocality"]
7979
# Only Volta+ architectures are supported (GeForce RTX 20xx consumer GPUs and newer)
8080
cuda = ["_gpu", "subspace-proof-of-space-gpu/cuda"]
81+
rocm = ["_gpu", "subspace-proof-of-space-gpu/rocm"]
8182
# Internal feature, shouldn't be used directly
8283
_gpu = []
8384

crates/subspace-farmer/src/bin/subspace-farmer/commands/cluster/plotter.rs

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ use subspace_farmer::cluster::plotter::plotter_service;
1818
use subspace_farmer::plotter::cpu::CpuPlotter;
1919
#[cfg(feature = "cuda")]
2020
use subspace_farmer::plotter::gpu::cuda::CudaRecordsEncoder;
21+
#[cfg(feature = "rocm")]
22+
use subspace_farmer::plotter::gpu::rocm::RocmRecordsEncoder;
2123
#[cfg(feature = "_gpu")]
2224
use subspace_farmer::plotter::gpu::GpuPlotter;
2325
use subspace_farmer::plotter::pool::PoolPlotter;
@@ -101,6 +103,24 @@ struct CudaPlottingOptions {
101103
cuda_gpus: Option<String>,
102104
}
103105

106+
#[cfg(feature = "rocm")]
107+
#[derive(Debug, Parser)]
108+
struct RocmPlottingOptions {
109+
/// Defines how many sectors farmer will download concurrently during plotting with ROCm GPU,
110+
/// allows limiting memory usage of the plotting process, defaults to number of ROCm GPUs found
111+
/// + 1 to download future sector ahead of time.
112+
///
113+
/// Increase will result in higher memory usage.
114+
#[arg(long)]
115+
rocm_sector_downloading_concurrency: Option<NonZeroUsize>,
116+
/// Specify exact GPUs to be used for plotting instead of using all GPUs (default behavior).
117+
///
118+
/// GPUs are comma-separated: `--rocm-gpus 0,1,3`. Empty string can be specified to disable ROCm
119+
/// GPU usage.
120+
#[arg(long)]
121+
rocm_gpus: Option<String>,
122+
}
123+
104124
/// Arguments for plotter
105125
#[derive(Debug, Parser)]
106126
pub(super) struct PlotterArgs {
@@ -118,6 +138,10 @@ pub(super) struct PlotterArgs {
118138
#[cfg(feature = "cuda")]
119139
#[clap(flatten)]
120140
cuda_plotting_options: CudaPlottingOptions,
141+
/// Plotting options only used by ROCm GPU plotter
142+
#[cfg(feature = "rocm")]
143+
#[clap(flatten)]
144+
rocm_plotting_options: RocmPlottingOptions,
121145
/// Additional cluster components
122146
#[clap(raw = true)]
123147
pub(super) additional_components: Vec<String>,
@@ -137,6 +161,8 @@ where
137161
cpu_plotting_options,
138162
#[cfg(feature = "cuda")]
139163
cuda_plotting_options,
164+
#[cfg(feature = "rocm")]
165+
rocm_plotting_options,
140166
additional_components: _,
141167
} = plotter_args;
142168

@@ -168,6 +194,21 @@ where
168194
modern_plotters.push(Box::new(cuda_plotter));
169195
}
170196
}
197+
#[cfg(feature = "rocm")]
198+
{
199+
let maybe_rocm_plotter = init_rocm_plotter(
200+
rocm_plotting_options,
201+
piece_getter.clone(),
202+
Arc::clone(&global_mutex),
203+
kzg.clone(),
204+
erasure_coding.clone(),
205+
registry,
206+
)?;
207+
208+
if let Some(rocm_plotter) = maybe_rocm_plotter {
209+
modern_plotters.push(Box::new(rocm_plotter));
210+
}
211+
}
171212
{
172213
let cpu_sector_encoding_concurrency = cpu_plotting_options.cpu_sector_encoding_concurrency;
173214
let maybe_cpu_plotters = init_cpu_plotters::<_, PosTableLegacy, PosTable>(
@@ -401,3 +442,85 @@ where
401442
.map_err(|error| anyhow::anyhow!("Failed to initialize CUDA plotter: {error}"))?,
402443
))
403444
}
445+
446+
#[cfg(feature = "rocm")]
447+
fn init_rocm_plotter<PG>(
448+
rocm_plotting_options: RocmPlottingOptions,
449+
piece_getter: PG,
450+
global_mutex: Arc<AsyncMutex<()>>,
451+
kzg: Kzg,
452+
erasure_coding: ErasureCoding,
453+
registry: &mut Registry,
454+
) -> anyhow::Result<Option<GpuPlotter<PG, RocmRecordsEncoder>>>
455+
where
456+
PG: PieceGetter + Clone + Send + Sync + 'static,
457+
{
458+
use std::collections::BTreeSet;
459+
use subspace_proof_of_space_gpu::rocm::rocm_devices;
460+
use tracing::{debug, warn};
461+
462+
let RocmPlottingOptions {
463+
rocm_sector_downloading_concurrency,
464+
rocm_gpus,
465+
} = rocm_plotting_options;
466+
467+
let mut rocm_devices = rocm_devices();
468+
let mut used_rocm_devices = (0..rocm_devices.len()).collect::<Vec<_>>();
469+
470+
if let Some(rocm_gpus) = rocm_gpus {
471+
if rocm_gpus.is_empty() {
472+
info!("ROCm GPU plotting was explicitly disabled");
473+
return Ok(None);
474+
}
475+
476+
let mut rocm_gpus_to_use = rocm_gpus
477+
.split(',')
478+
.map(|gpu_index| gpu_index.parse())
479+
.collect::<Result<BTreeSet<usize>, _>>()?;
480+
481+
(used_rocm_devices, rocm_devices) = rocm_devices
482+
.into_iter()
483+
.enumerate()
484+
.filter(|(index, _rocm_device)| rocm_gpus_to_use.remove(index))
485+
.unzip();
486+
487+
if !rocm_gpus_to_use.is_empty() {
488+
warn!(
489+
?rocm_gpus_to_use,
490+
"Some ROCm GPUs were not found on the system"
491+
);
492+
}
493+
}
494+
495+
if rocm_devices.is_empty() {
496+
debug!("No ROCm GPU devices found");
497+
return Ok(None);
498+
}
499+
500+
info!(?used_rocm_devices, "Using ROCm GPUs");
501+
502+
let rocm_downloading_semaphore = Arc::new(Semaphore::new(
503+
rocm_sector_downloading_concurrency
504+
.map(|rocm_sector_downloading_concurrency| rocm_sector_downloading_concurrency.get())
505+
.unwrap_or(rocm_devices.len() + 1),
506+
));
507+
508+
Ok(Some(
509+
GpuPlotter::new(
510+
piece_getter,
511+
rocm_downloading_semaphore,
512+
rocm_devices
513+
.into_iter()
514+
.map(|rocm_device| RocmRecordsEncoder::new(rocm_device, Arc::clone(&global_mutex)))
515+
.collect::<Result<_, _>>()
516+
.map_err(|error| {
517+
anyhow::anyhow!("Failed to create ROCm records encoder: {error}")
518+
})?,
519+
global_mutex,
520+
kzg,
521+
erasure_coding,
522+
Some(registry),
523+
)
524+
.map_err(|error| anyhow::anyhow!("Failed to initialize ROCm plotter: {error}"))?,
525+
))
526+
}

0 commit comments

Comments
 (0)