Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix things #14

Merged
merged 10 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .envrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@ if ! has nix_direnv_version || ! nix_direnv_version 2.3.0; then
source_url "https://raw.githubusercontent.com/nix-community/nix-direnv/2.3.0/direnvrc" "sha256-Dmd+j63L84wuzgyjITIfSxSD57Tx7v51DMxVZOsiUD8="
fi

# Environment variables are cached by direnv, so we don't need Nix's eval-cache.
# Disable Nix's eval-cache so that we can always see error messages if any.
use flake --no-eval-cache --show-trace
use flake . '--no-eval-cache' '--show-trace'
3 changes: 2 additions & 1 deletion examples/devenv-python/flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
inputs.nix-ml-ops.flakeModules.devcontainer
];
perSystem = { pkgs, ... }: {
ml-ops.common.pythonPackage.base-package = pkgs.python310;

ml-ops.devcontainer = {
pythonPackage.base-package = pkgs.python310;
devenvShellModule.languages.python = {
enable = true;
venv.enable = true;
Expand Down
81 changes: 55 additions & 26 deletions examples/poetry-docker-job-azure/flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10,36 +10,65 @@
};
inputs = unlockedInputs // lockedInputs;
in
inputs.nix-ml-ops.lib.mkFlake
{
inherit inputs;
}
{
debug = true;
inputs.nix-ml-ops.lib.mkFlake { inherit inputs; } {
debug = true;

imports = [
inputs.nix-ml-ops.flakeModules.cuda
inputs.nix-ml-ops.flakeModules.devcontainer
inputs.nix-ml-ops.flakeModules.nixIde
inputs.nix-ml-ops.flakeModules.nixLd
inputs.nix-ml-ops.flakeModules.pythonEnvsPoetry
inputs.nix-ml-ops.flakeModules.kubernetesJob
inputs.nix-ml-ops.flakeModules.aksCredential
inputs.nix-ml-ops.flakeModules.devcontainerAzureCliTools
];
imports = [
inputs.nix-ml-ops.flakeModules.cuda
inputs.nix-ml-ops.flakeModules.devcontainer
inputs.nix-ml-ops.flakeModules.nixIde
inputs.nix-ml-ops.flakeModules.nixLd
inputs.nix-ml-ops.flakeModules.pythonEnvsPoetry
inputs.nix-ml-ops.flakeModules.kubernetesJob
inputs.nix-ml-ops.flakeModules.aksCredential
inputs.nix-ml-ops.flakeModules.devcontainerAzureCliTools
inputs.nix-ml-ops.flakeModules.devenvPythonWithLibstdcxx
];

perSystem = { pkgs, config, lib, ... }: {
ml-ops.jobs.my-job.launchers.my-launcher.kubernetes = {
aks = { };
imageRegistry = { };
helmTemplates.job.spec.template.spec.containers.master-node.args = [
"python"
"--version"
perSystem = perSystem@{ pkgs, inputs', ... }: {
ml-ops.common.pythonPackage.base-package = pkgs.python310;
ml-ops.jobs.my-job.launchers.my-launcher.kubernetes = {
aks = { };
imageRegistry = { };
helmTemplates.job.spec.template.spec = {
tolerations = [
{
key = "sku";
operator = "Equal";
value = "gpu";
effect = "NoSchedule";
}
{
key = "kubernetes.azure.com/scalesetpriority";
operator = "Equal";
value = "spot";
effect = "NoSchedule";
}
];
containers.master-node = {
resources.limits."nvidia.com/gpu" = 1;
args = [
"python"
"-c"
''
import sys
import torch
print("sys.version =", sys.version)
print("torch.cuda.is_available() = ", torch.cuda.is_available())
''
];
};
};
ml-ops.devcontainer.devenvShellModule.packages = [
pkgs.kubectl
];
};

# Set maxLayers to a large number to reuse layers from previous builds.
# Docker's image layer limit is 127 by default, while containerd does not have such a limit.
# See https://grahamc.com/blog/nix-and-layered-docker-images/
ml-ops.common.devenvShellModule.containers.processes.maxLayers = 1000;

ml-ops.devcontainer.devenvShellModule.packages = [
pkgs.kubectl
];
};
};
}
857 changes: 224 additions & 633 deletions examples/poetry-docker-job-azure/poetry.lock

Large diffs are not rendered by default.

Empty file.
15 changes: 11 additions & 4 deletions examples/poetry-docker-job-azure/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
[tool.poetry]
name = "poetry-docker-job"
name = "poetry-docker-job-azure"
version = "0.1.0"
description = ""
authors = ["Yang, Bo <bo@preemo.io>"]
readme = "README.md"
packages = [{include = "poetry_docker_job"}]

[[tool.poetry.source]]
name = "cuda-torch"
url = "https://download.pytorch.org/whl/cu118"
# Don't set the priority to "explicit", because `custom-kernels/pyproject.toml`'s build-system cannot find packages from an explicit source.
priority = "supplemental"

[tool.poetry.dependencies]
python = "^3.10"
transformers = "^4.28.1"

torch = [
{ platform = "darwin", version = "2.0.1", source = "pypi" },
{ platform = "linux", version = "2.0.1+cu118", source = "cuda-torch" },
]

[build-system]
requires = ["poetry-core"]
Expand Down
67 changes: 36 additions & 31 deletions flake-modules/aks-credential.nix
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,29 @@ topLevel@{ flake-parts-lib, inputs, lib, ... }: {
(kubernetes:
let
authModule = {
overrideAttrs =
pipe =
lib.mkIf (kubernetes.config.aks != null)
(lib.mkDerivedConfig kubernetes.options.aks (aks:
[
(old: {
aksCluster = kubernetes.config.aks.cluster;
aksResourceGroup = kubernetes.config.aks.resourcegroup;
buildCommand = ''
# TODO(bo@preemo.io, 11/09/2023): Support credentials other than managed identity. See https://nixos.wiki/wiki/Comparison_of_secret_managing_schemes
az login --identity
az aks get-credentials \
--name "$aksCluster" \
--resource-group "$aksResourceGroup"
(previousPackage: previousPackage.overrideAttrs
(previousAttrs: {
aksCluster = kubernetes.config.aks.cluster;
aksResourceGroup = kubernetes.config.aks.resourcegroup;
buildCommand = ''
# TODO(bo@preemo.io, 11/09/2023): Support credentials other than managed identity. See https://nixos.wiki/wiki/Comparison_of_secret_managing_schemes
az login --identity
az aks get-credentials \
--name "$aksCluster" \
--resource-group "$aksResourceGroup"

${old.buildCommand}
'';
nativeBuildInputs = old.nativeBuildInputs ++ [
# pkgs.cacert
pkgs.azure-cli
];
})
${previousAttrs.buildCommand}
'';
nativeBuildInputs = previousAttrs.nativeBuildInputs ++ [
# pkgs.cacert
pkgs.azure-cli
];
})
)
]
));
};
Expand Down Expand Up @@ -79,26 +81,29 @@ topLevel@{ flake-parts-lib, inputs, lib, ... }: {
)
);

config.pushImage.overrideAttrs =
config.pushImage.pipe =
lib.mkIf (kubernetes.config.aks != null)
(lib.mkDerivedConfig
kubernetes.options.aks
(aks: [
(old: {
aksRegistryName = aks.registryName;
(previousPackage: previousPackage.overrideAttrs
(previousAttrs: {
aksRegistryName = aks.registryName;

buildCommand = ''
# TODO(bo@preemo.io, 11/09/2023): Support credentials other than managed identity. See https://nixos.wiki/wiki/Comparison_of_secret_managing_schemes
az login --identity
buildCommand = ''
# TODO(bo@preemo.io, 11/09/2023): Support credentials other than managed identity. See https://nixos.wiki/wiki/Comparison_of_secret_managing_schemes
az login --identity

export skopeoCopyArgs="$(printf "%q " --dest-username 00000000-0000-0000-0000-000000000000 --dest-password "$(az acr login --name "$aksRegistryName" --expose-token --output tsv --query accessToken)")"
${old.buildCommand}
'';
nativeBuildInputs = old.nativeBuildInputs ++ [
# pkgs.cacert
pkgs.azure-cli
];
})
export skopeoCopyArgs="$(printf "%q " --dest-username 00000000-0000-0000-0000-000000000000 --dest-password "$(az acr login --name "$aksRegistryName" --expose-token --output tsv --query accessToken)")"

${previousAttrs.buildCommand}
'';
nativeBuildInputs = previousAttrs.nativeBuildInputs ++ [
# pkgs.cacert
pkgs.azure-cli
];
})
)
]));

config.helmUpgrade.imports = [
Expand Down
26 changes: 14 additions & 12 deletions flake-modules/devenv-python-with-libstdc++.nix
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,20 @@ topLevel@{ flake-parts-lib, lib, inputs, ... }: {
];
options.perSystem = flake-parts-lib.mkPerSystemOption
(perSystem@{ pkgs, system, ... }: {
ml-ops.common.pythonPackage = pythonPackage: {
override = [
(old: lib.attrsets.optionalAttrs old.stdenv.isLinux {
self = old.self.overrideAttrs
(self: super: {
env = super.env // {
# Link libstdc++ to python interpreter so that packages in manylinux ABI can find it out-of-the-box without LD_LIBRARY_PATH
# TODO: Add more libraries here when encountering an ImportError
NIX_LDFLAGS = "--no-as-needed -lstdc++ --as-needed ${super.env.NIX_LDFLAGS}";
};
});
})
ml-ops.common.pythonPackage = {
pipe = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What change are you making here? Are `pipe` and `override` attributes intrinsic to all Nix objects, or are they just names you're using?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nvm. I see that they are just constructs that you've made and are manually handling / passing to pipe etc. etc.

Copy link
Contributor Author

@Atry Atry Nov 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`lib.trivial.pipe` is a nixpkgs function, and `ml-ops.common.pythonPackage.pipe` is an option that holds the list of functions passed to `lib.trivial.pipe`. The `pipe` option introduced by this PR is more general than the previous `override` and `overrideAttrs` options.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(python:
if python.stdenv.isLinux then
python.overrideAttrs
(self: super: {
env = super.env // {
# Link libstdc++ to python interpreter so that packages in manylinux ABI can find it out-of-the-box without LD_LIBRARY_PATH
# TODO: Add more libraries here when encountering an ImportError
NIX_LDFLAGS = "--no-as-needed -lstdc++ --as-needed ${super.env.NIX_LDFLAGS}";
};
})
else
python)
];
};
});
Expand Down
69 changes: 36 additions & 33 deletions flake-modules/gke-credential.nix
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,30 @@ topLevel@{ flake-parts-lib, inputs, lib, ... }: {
(kubernetes:
let
authModule = {
overrideAttrs = [
(old: {
gkeCluster = kubernetes.config.gke.cluster;
gkeRegion = kubernetes.config.gke.region;
USE_GKE_GCLOUD_AUTH_PLUGIN = "True";
buildCommand = ''
gcloud container clusters get-credentials \
"$gkeCluster" \
--region "$gkeRegion"
pipe = [
(previousPackage: previousPackage.overrideAttrs
(previousAttrs: {
gkeCluster = kubernetes.config.gke.cluster;
gkeRegion = kubernetes.config.gke.region;
USE_GKE_GCLOUD_AUTH_PLUGIN = "True";
buildCommand = ''
gcloud container clusters get-credentials \
"$gkeCluster" \
--region "$gkeRegion"

${old.buildCommand}
'';
nativeBuildInputs = old.nativeBuildInputs ++ [
pkgs.cacert
(
pkgs.google-cloud-sdk.withExtraComponents [
pkgs.google-cloud-sdk.components.gke-gcloud-auth-plugin
pkgs.google-cloud-sdk.components.kubectl
]
)
];
})
${previousAttrs.buildCommand}
'';
nativeBuildInputs = previousAttrs.nativeBuildInputs ++ [
pkgs.cacert
(
pkgs.google-cloud-sdk.withExtraComponents [
pkgs.google-cloud-sdk.components.gke-gcloud-auth-plugin
pkgs.google-cloud-sdk.components.kubectl
]
)
];
})
)
];
};
in
Expand All @@ -52,18 +54,19 @@ topLevel@{ flake-parts-lib, inputs, lib, ... }: {
type = lib.types.str;
};


config.pushImage.overrideAttrs = [
(old: {
buildCommand = ''
export skopeoCopyArgs="$(printf "%q " --dest-registry-token "$(gcloud auth print-access-token)")"
${old.buildCommand}
'';
nativeBuildInputs = old.nativeBuildInputs ++ [
pkgs.cacert
pkgs.google-cloud-sdk
];
})
config.pushImage.pipe = [
(previousPackage: previousPackage.overrideAttrs
(previousAttrs: {
buildCommand = ''
export skopeoCopyArgs="$(printf "%q " --dest-registry-token "$(gcloud auth print-access-token)")"
${previousAttrs.buildCommand}
'';
nativeBuildInputs = previousAttrs.nativeBuildInputs ++ [
pkgs.cacert
pkgs.google-cloud-sdk
];
})
)
];

config.helmUpgrade.imports = [
Expand Down
2 changes: 1 addition & 1 deletion flake-modules/link-nvidia-drivers.nix
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ topLevel@{ flake-parts-lib, inputs, ... }: {
)
then
# Create the symbolic links to drivers when running the container
# with --gpu=all
# with `nvidia-docker --gpus=all`
mkdir -p /run/opengl-driver/lib

(
Expand Down
28 changes: 8 additions & 20 deletions flake-modules/overridable-package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,14 @@ topLevel@{ flake-parts-lib, inputs, lib, ... }: {
type = lib.types.package;
example = pkgs.hello;
};
options.overridden-package = lib.mkOption {
type = lib.types.package;
defaultText = lib.literalMD "applied `base-package` with `override` and `overrideAttrs`";
default =
builtins.foldl'
(package: package.overrideAttrs)
(
builtins.foldl'
(package: package.override)
overridablePackage.config.base-package
overridablePackage.config.override
)
overridablePackage.config.overrideAttrs;
};
options.override = lib.mkOption {
type = lib.types.listOf lib.types.anything;
default = [ ];
};
options.overrideAttrs = lib.mkOption {
type = lib.types.listOf lib.types.anything;
options.overridden-package = lib.mkOption
{
type = lib.types.package;
defaultText = lib.literalMD "applied `base-package` with `pipe`";
default = lib.trivial.pipe overridablePackage.config.base-package overridablePackage.config.pipe;
};
options.pipe = lib.mkOption {
type = lib.types.listOf (lib.types.functionTo lib.types.package);
default = [ ];
};
})
Expand Down
Loading