Skip to content

Commit

Permalink
feat(build): add nix build definitions
Browse files Browse the repository at this point in the history
Add a nix build matrix and github ci actions that use them. On a
powerful machine, this can build and run unit tests for ~700 build
configuration combinations in about 5 minutes.
  • Loading branch information
aws-nslick committed Sep 22, 2024
1 parent e43640f commit a473699
Show file tree
Hide file tree
Showing 12 changed files with 705 additions and 1 deletion.
12 changes: 12 additions & 0 deletions .devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"customizations": {
"vscode": {
"extensions": [
"mkhl.direnv"
]
}
},
"image": "ghcr.io/cachix/devenv:latest",
"overrideCommand": false,
"updateContentCommand": "devenv test"
}
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# direnv entry point: load the environment from the flake in this directory.
# `--impure` and `--show-trace` are passed through as extra arguments
# (presumably forwarded to the underlying nix invocation -- confirm against
# the direnv/nix-direnv `use flake` documentation).
use flake . --impure --show-trace
27 changes: 27 additions & 0 deletions .github/workflows/nix-cache-build-deps.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Builds the `deps` flake check and lets magic-nix-cache store the result.
# Runs on manual dispatch, on pull requests, and on pushes to master / v*
# branches, but only when the Nix definitions or the flake lock change.
name: Cache Nix CI Dependencies

on:
  workflow_dispatch:
  # The event name is `pull_request` (underscore); `pull-request` is not a
  # recognised trigger.
  pull_request:
    paths: ['.nix/*', 'flake.nix', 'flake.lock']
  push:
    branches: ['master', 'v*']
    paths: ['.nix/*', 'flake.nix', 'flake.lock']

jobs:
  build-and-cache-dependencies:
    name: Build and Cache Dependencies
    # `runs-on` is mandatory for every job; the build below targets
    # x86_64-linux, so use the hosted Ubuntu runner.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: DeterminateSystems/nix-installer-action@main
        with:
          determinate: true
          extra-conf: |
            experimental-features = nix-command flakes auto-allocate-uids
            extra-substituters = https://nix-community.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@main
      - name: Build and Cache Dependencies
        run: |
          # NIXPKGS_ALLOW_UNFREE is read from the environment, hence --impure.
          export NIXPKGS_ALLOW_UNFREE=1
          nix build --impure -L '.#checks.x86_64-linux.deps'
40 changes: 40 additions & 0 deletions .github/workflows/nix.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Smoke test: builds the default flake check on pull requests (or manual
# dispatch) that touch the build system, sources, tests, or Nix definitions.
name: Quick CI Builds

on:
  workflow_dispatch:
  pull_request:
    paths:
      - "configure.ac"
      - "Makefile.am"
      - "autogen.sh"
      - "include/**"
      - "m4/**"
      - "src/**"
      - "tests/**"
      - ".github/workflows/nix*.yaml"
      - "flake.nix"
      - "flake.lock"
      - ".nix/**"

jobs:
  nix-build:
    name: nix build smoke test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: DeterminateSystems/nix-installer-action@main
        with:
          determinate: true
          extra-conf: |
            experimental-features = nix-command flakes auto-allocate-uids
            extra-substituters = https://nix-community.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@main
      # This step consumes the cache but must not populate it (see below),
      # so it is not named after the dependency-caching workflow.
      - name: Build default check
        run: |
          # NIXPKGS_ALLOW_UNFREE is read from the environment, hence --impure.
          export NIXPKGS_ALLOW_UNFREE=1
          # The installable must be closed with the same quote that opens it;
          # a mismatched pair leaves the shell with an unterminated string.
          nix build --impure -L '.#checks.x86_64-linux.default'
          # We want to use the cache here, but we don't want to cache this
          # specific build, so purge the cache now to prevent it from being
          # pushed on cleanup.
          nix-collect-garbage -d
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,10 @@ m4/ltversion.m4
m4/lt~obsolete.m4

.idea/
.devenv/
*.src.rpm
dockerbld
.devenv*
devenv.local.nix
.direnv
.pre-commit-config.yaml
result
10 changes: 10 additions & 0 deletions .nix/combos.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Axes of the build matrix. Each attribute holds the candidate values for one
# axis; lib.nix reads one value per axis from a `combo` attrset.
# NOTE(review): how the axes are combined (presumably a cartesian product) is
# decided by the caller, which is not part of this file.
{
  # Compared against "-cuda" / "-neuron" to select the accelerator backend.
  accelerator = [
    "-cuda"
    "-neuron"
  ];

  # "-aws" turns on AWS platform tuning; the empty string leaves it off.
  platform = [
    "-aws"
    ""
  ];

  # Each entry is itself a list, so several tracers can be enabled together.
  tracing = [
    [ "" ]
    [ "-nvtx" ]
    [ "-lttng" ]
    [ "-nvtx" "-lttng" ]
  ];

  debug = [
    "-debug"
    ""
  ];

  memory = [
    "-valgrind"
    ""
  ];

  traceprints = [
    "-trace"
    ""
  ];

  cpp = [
    "-cpp"
    ""
  ];

  # Functions from a package set to the stdenv (compiler toolchain) to use.
  stdenv = [
    (p: p.gcc7Stdenv)
    (p: p.clangStdenv)
    (p: p.gcc14Stdenv)
  ];
}
143 changes: 143 additions & 0 deletions .nix/default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Package definition for the aws-ofi-nccl plugin.
#
# Feature toggles (all optional):
#   enableTests        build the test suite and run the unit tests in checkPhase
#   enableTracePrints  pass --enable-trace (defaults to enableTests)
#   neuronSupport      pass --enable-neuron; defaults to !config.cudaSupport
#   cudaSupport        build against CUDA; exactly one of neuronSupport /
#                      cudaSupport must be true (see the assert below)
#   enableLTTNGTracing pass --with-lttng
#   enableNVTXTracing  pass --with-nvtx (only takes effect with cudaSupport)
#   enableValgrind     pass --with-valgrind
#   enableAwsTuning    pass --enable-platform-aws and append "-aws" to pname
#   enableCPPMode      pass --enable-cpp=yes
{ lib
, fetchFromGitHub
, symlinkJoin
, gitUpdater
, stdenv
, config
, libfabric
, hwloc
, autoreconfHook
, lttng-ust
, valgrind
, mpi
, cudaPackages ? { }
, enableTests ? true
, enableTracePrints ? (enableTests)
, neuronSupport ? (!config.cudaSupport)
, cudaSupport ? (config.cudaSupport && !neuronSupport)
, enableLTTNGTracing ? false
, enableNVTXTracing ? false
, enableValgrind ? false
, enableAwsTuning ? false
, enableCPPMode ? false
}:

# The two accelerator backends are mutually exclusive and one is required.
assert neuronSupport != cudaSupport;
#assert !enableNVTXTracing || (enableNVTXTracing && !neuronSupport);

let
  basename = "lib${if neuronSupport then "nccom" else "nccl"}-net-ofi";
  pname = "${basename}${if enableAwsTuning == true then "-aws" else ""}";
  version = "1.11.0";
  src = fetchFromGitHub {
    owner = "aws";
    repo = "aws-ofi-nccl";
    rev = "v${version}-aws";
    sha256 = "sha256-y3yVPqak+36UXI6L/ddQIfBBwpeiciW571noc8LNefU=";
  };
  # Single prefix containing the CUDA runtime (static + dev outputs) and nvcc
  # dev output, so that one path can be handed to --with-cuda. Empty when
  # cudaSupport is off.
  cuda_build_deps_joined = symlinkJoin {
    name = "cuda-build-deps-joined";
    paths = lib.optionals (cudaSupport) [
      (lib.getOutput "static" cudaPackages.cuda_cudart)
      (lib.getDev cudaPackages.cuda_cudart)
      (lib.getDev cudaPackages.cuda_nvcc)
    ];
  };
in
stdenv.mkDerivation {
  inherit pname version src;

  enableParallelBuilding = true;
  separateDebugInfo = true;
  strictDeps = true;

  nativeBuildInputs = [ autoreconfHook ];
  configureFlags =
    [
      "--enable-picky-compiler"
      "--enable-werror"
      "--with-hwloc=${lib.getDev hwloc}"
      "--with-libfabric=${lib.getDev libfabric}"
    ]
    ++ lib.optionals enableCPPMode [
      "--enable-cpp=yes"
    ]
    ++ lib.optionals (!enableTests) [
      "--disable-tests"
    ]
    ++ lib.optionals enableTests [
      "--enable-tests"
      "--with-mpi=${lib.getDev mpi}"
    ]
    ++ lib.optionals enableTracePrints [
      "--enable-trace"
    ]
    ++ lib.optionals cudaSupport [
      "--with-cuda=${cuda_build_deps_joined}"
    ]
    ++ lib.optionals enableLTTNGTracing [
      "--with-lttng=${lib.getDev lttng-ust}"
    ]
    ++ lib.optionals enableValgrind [
      "--with-valgrind=${lib.getDev valgrind}"
    ]
    ++ lib.optionals (enableNVTXTracing && cudaSupport) [
      "--with-nvtx=${lib.getDev cudaPackages.cuda_nvtx}"
    ]
    ++ lib.optionals enableAwsTuning [
      "--enable-platform-aws"
    ]
    ++ lib.optionals neuronSupport [
      "--enable-neuron"
    ];

  buildInputs =
    [
      libfabric
      hwloc
    ]
    ++ lib.optionals cudaSupport [
      cuda_build_deps_joined
    ]
    ++ lib.optionals enableValgrind [
      valgrind
    ]
    ++ lib.optionals enableTests [
      mpi
    ]
    ++ lib.optionals enableLTTNGTracing [
      lttng-ust
    ];

  # Remove libtool archives from the installed library directory. Using
  # find's own matching avoids the unquoted regex (the shell stripped the
  # backslash, so `.la$` matched any character before "la") and does not fail
  # when there is nothing to delete, unlike `xargs rm` on empty input.
  postInstall = ''
    find "$out/lib" -type f -name '*.la' -delete
  '';

  doCheck = enableTests;
  # Run every executable under tests/unit/ and stop at the first failure.
  # The test is run as the condition of an `if` so that `set -e` does not
  # abort the phase before the failure message is printed.
  checkPhase = ''
    set -euo pipefail
    for test in $(find tests/unit/ -type f -executable -print | xargs) ; do
      echo "======================================================================"
      echo "Running $test"
      if ./$test ; then
        echo "✅ Passed"
      else
        echo "❌ Failed!"
        exit 1
      fi
    done
    echo "All unit tests passed successfully."
    set +u
  '';

  passthru = {
    inherit cudaSupport;
    updateScript = gitUpdater {
      inherit pname version;
      rev-prefix = "v";
    };
  };
  meta = with lib; {
    homepage = "https://github.com/aws/aws-ofi-nccl";
    license = licenses.asl20;
    # A CUDA build is only meaningful when nixpkgs itself has CUDA enabled.
    broken = (cudaSupport && !config.cudaSupport);
    maintainers = with maintainers; [ sielicki ];
    platforms = [
      "x86_64-linux"
      "aarch64-linux"
    ];
  };
}
25 changes: 25 additions & 0 deletions .nix/lib.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Helpers that turn one build-matrix `combo` into a named package.
{ inputs, lib }:
rec {
  # "git<first 7 chars of rev>" when the input has a `rev`, otherwise
  # "gitdirty<first 7 chars of dirtyRev>".
  mkGitVersion = input:
    let
      short = builtins.substring 0 7;
      suffix =
        if input ? rev
        then short "${input.rev}"
        else "dirty" + short "${input.dirtyRev}";
    in
    "git${suffix}";

  # "-gcc<version>" or "-clang<version>" for the given stdenv.
  compilerName = env:
    let
      family = if env.cc.isGNU then "gcc" else "clang";
    in
    "-${family}${env.cc.version}";

  # Append the combo's tracing, debug, memory, traceprints, cpp and compiler
  # suffixes (in that order) to `prevname`.
  genComboName = pkgs: combo: prevname:
    let
      tracing = lib.strings.concatStrings combo.tracing;
      compiler = compilerName (combo.stdenv pkgs);
    in
    "${prevname}${tracing}${combo.debug}${combo.memory}${combo.traceprints}${combo.cpp}${compiler}";

  # Instantiate ./default.nix for one combo, then point it at this flake's
  # own source tree and give it a git-derived version.
  genPkgFromCombo = pkgs: combo:
    let
      isCuda = combo.accelerator == "-cuda";
      hasTracer = tracer: builtins.elem tracer combo.tracing;
      base = pkgs.callPackage ./default.nix {
        stdenv = combo.stdenv pkgs;
        cudaSupport = isCuda;
        neuronSupport = combo.accelerator == "-neuron";
        enableAwsTuning = combo.platform == "-aws";
        # NVTX is only requested for CUDA builds.
        enableNVTXTracing = isCuda && hasTracer "-nvtx";
        enableLTTNGTracing = hasTracer "-lttng";
        enableValgrind = combo.memory == "-valgrind";
        enableTracePrints = combo.traceprints == "-trace";
        enableCPPMode = combo.cpp == "-cpp";
      };
    in
    base.overrideAttrs (_: {
      src = inputs.self;
      version = mkGitVersion inputs.self;
    });

  # Produce a { name, value } pair for one combo; the derivation's `name` is
  # overridden to the generated combo name.
  genAttrsFromCombo = { pkgs }: combo:
    let
      pkg = genPkgFromCombo pkgs combo;
      name = genComboName pkgs combo pkg.pname;
    in
    {
      inherit name;
      value = pkg.overrideAttrs (_: { inherit name; });
    };
}
96 changes: 96 additions & 0 deletions .nix/overlay.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Nixpkgs overlay that rebuilds the networking / CUDA stack from the sources
# pinned as flake inputs. `final` is the package set after all overlays,
# `prev` the set before this one; inside overrideAttrs callbacks `pprev` is
# the previous attribute set of the derivation being overridden.
{ inputs }:
(final: prev:
let
  # nixpkgs lib extended with the helpers from ./lib.nix (mkGitVersion, ...).
  lib = prev.lib // (import ./lib.nix { inherit inputs; lib = prev.lib; });
in
{

  libgdrcopy = final.cudaPackages.backendStdenv.mkDerivation rec {
    pname = "libgdrcopy";
    src = inputs.gdrcopy;
    version = lib.mkGitVersion inputs.gdrcopy;
    makeFlags = [
      "LIB_MAJOR_VER=2"
      "LIB_MINOR_VER=5"
      "DESTLIB=$out/lib"
      "DESTINC=$out/include"
      "GDRAPI_ARCH=X86"
    ];
    patchPhase = "chmod +x config_arch";
    buildPhase = "make -C src all";
    depsTargetTarget = with final.cudaPackages; [ cuda_cudart ];
    # makeFlags are spliced into the command line here so the shell expands
    # $out at install time.
    installPhase = "mkdir -p $out/lib && make ${lib.strings.concatStringsSep " " makeFlags} lib_install";
  };

  rdma-core = prev.rdma-core.overrideAttrs (pprev: {
    src = inputs.rdma-core;
    version = lib.mkGitVersion inputs.rdma-core;
  });

  hwloc = prev.hwloc.overrideAttrs (pprev: {
    src = inputs.hwloc;
    version = lib.mkGitVersion inputs.hwloc;
    nativeBuildInputs = (pprev.nativeBuildInputs or [ ]) ++ [ prev.autoreconfHook ];
  });

  # pmix/prrte/openmpi cannot support new hwloc
  # NOTE(review): the overrides below nevertheless pass the overlay's own
  # `final.hwloc` to all three packages -- confirm whether this comment is
  # stale or describes a known limitation.
  pmix = prev.pmix.override { hwloc = final.hwloc; };
  prrte = prev.prrte.override { hwloc = final.hwloc; };
  openmpi = (prev.openmpi.override {
    cudaSupport = true;
    libfabric = final.libfabric;
    rdma-core = final.rdma-core;
    hwloc = final.hwloc;
  }).overrideAttrs (pprev: {
    src = inputs.openmpi;
    version = lib.mkGitVersion inputs.openmpi;
    nativeBuildInputs = (pprev.nativeBuildInputs or [ ]) ++ [
      prev.autoconf
      prev.automake
      prev.libtool
      prev.perl
      prev.git
      prev.flex
    ];
    prePatch = ''
      patchShebangs .
      ./autogen.pl
    '';
    outputs = final.lib.lists.remove "man" pprev.outputs;
    NIX_CFLAGS_COMPILE = "-Wno-deprecated-declarations";
  });

  libfabric = (prev.libfabric.override {
    enableOpx = false;
    enablePsm2 = false;
  }).overrideAttrs (pprev: {
    src = inputs.libfabric;
    version = lib.mkGitVersion inputs.libfabric;
    # Extend the derivation's existing flags (`pprev`), not the overlay's
    # package set (`prev`): `prev.configureFlags` does not exist, so the
    # `or [ ]` fallback silently discarded the flags of the base derivation.
    configureFlags = (pprev.configureFlags or [ ]) ++ [
      "--enable-efa=yes"
      "--with-cuda=${prev.lib.getDev final.cudaPackages.cudatoolkit}"
      "--enable-cuda-dlopen"
      "--with-gdrcopy=${prev.lib.getDev final.libgdrcopy}"
      "--enable-gdrcopy-dlopen"
    ];
    buildInputs = (pprev.buildInputs or [ ]) ++ [
      final.rdma-core
    ];
  });

  cudaPackages = prev.cudaPackages.overrideScope (ffinal: pprev: rec {
    nccl = pprev.nccl.overrideAttrs {
      src = inputs.nccl;
      version = lib.mkGitVersion inputs.nccl;
    };
    nccl-tests = (pprev.nccl-tests.overrideAttrs {
      src = inputs.nccl-tests;
      version = lib.mkGitVersion inputs.nccl-tests;
    }).override {
      mpiSupport = true;
      mpi = final.openmpi;
      # Hand nccl-tests the nccl built from the pinned input above.
      cudaPackages = pprev.cudaPackages // { inherit nccl; };
      config.cudaSupport = true;
    };
  });
})
Loading

0 comments on commit a473699

Please sign in to comment.