nvidia-container-toolkit: only mount existing paths in the host #319772

Merged
@@ -13,11 +13,14 @@
     inherit hostPath containerPath;
     options = mountOptions;
   };
-  jqAddMountExpression = ".containerEdits.mounts[.containerEdits.mounts | length] |= . +";
-  allJqMounts = lib.concatMap
-    (mount:
-      ["${lib.getExe jq} '${jqAddMountExpression} ${builtins.toJSON (mkMount mount)}'"])
-    mounts;
+  mountToCommand = mount:
+    "additionalMount \"${mount.hostPath}\" \"${mount.containerPath}\" '${builtins.toJSON mount.mountOptions}'";
+  mountsToCommands = mounts:
+    if (builtins.length mounts) == 0 then
+      "cat"
+    else
+      (lib.strings.concatMapStringsSep " | \\\n"
+        mountToCommand mounts);
 in
 writeScriptBin "nvidia-cdi-generator"
 ''
@@ -32,6 +35,18 @@ function cdiGenerate {
     --nvidia-ctk-path ${lib.getExe' nvidia-container-toolkit "nvidia-ctk"}
 }
 
-cdiGenerate | \
-${lib.concatStringsSep " | " allJqMounts} > $RUNTIME_DIRECTORY/nvidia-container-toolkit.json
+function additionalMount {
@SomeoneSerge (Contributor) commented on Aug 16, 2024:

Well, one comment is that this could be rewritten with pythonMinimal or a similar language that doesn't try so hard to make this painful (you could remove concatMapStringsSep and just export a JSON for the script).

@SomeoneSerge (Contributor):

But as commented before, if you say this is the current iteration, I'll go with it.

@ereslibre (Member, Author):

Yes, I'll work on a reimplementation of this logic in Python in future PRs, if that's fine with you.
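For illustration, a minimal sketch of the Python reimplementation discussed in this thread — hypothetical, not part of this PR: the Nix module would export the configured mounts as plain JSON (e.g. via builtins.toJSON) instead of splicing jq pipelines together, and a script along these lines would append only the mounts whose hostPath exists. MOUNTS_JSON and every name below are assumptions for illustration.

#!/usr/bin/env python3
# Hypothetical sketch of the suggested rewrite (not part of this PR).
# Reads the CDI spec produced by `nvidia-ctk cdi generate` on stdin and
# appends only the configured mounts whose hostPath exists on the host.
import json
import os
import sys

def add_existing_mounts(spec, mounts):
    """Append each mount whose hostPath exists to the spec's containerEdits."""
    edits = spec.setdefault("containerEdits", {}).setdefault("mounts", [])
    for mount in mounts:
        if os.path.exists(mount["hostPath"]):
            edits.append({
                "hostPath": mount["hostPath"],
                "containerPath": mount["containerPath"],
                "options": mount.get("mountOptions", []),
            })
        else:
            print(f"Mount {mount['hostPath']} ignored: could not find path "
                  "in the host machine", file=sys.stderr)
    return spec

if __name__ == "__main__":
    # MOUNTS_JSON would be exported by the Nix module as a JSON string.
    spec = json.load(sys.stdin)
    mounts = json.loads(os.environ.get("MOUNTS_JSON", "[]"))
    json.dump(add_existing_mounts(spec, mounts), sys.stdout, indent=2)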

+  local hostPath="$1"
+  local containerPath="$2"
+  local mountOptions="$3"
+  if [ -e "$hostPath" ]; then
+    ${lib.getExe jq} ".containerEdits.mounts[.containerEdits.mounts | length] = { \"hostPath\": \"$hostPath\", \"containerPath\": \"$containerPath\", \"options\": $mountOptions }"
+  else
+    echo "Mount $hostPath ignored: could not find path in the host machine" >&2
+    cat
+  fi
+}
+
+cdiGenerate |
+${mountsToCommands mounts} > $RUNTIME_DIRECTORY/nvidia-container-toolkit.json
''
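For reference, when a configured hostPath does exist on the host, the jq filter in additionalMount appends an object of this shape to the spec's containerEdits.mounts array (the values here are illustrative only):

{
  "hostPath": "/usr/local/nvidia",
  "containerPath": "/usr/local/nvidia",
  "options": ["bind", "ro"]
}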
@@ -71,6 +71,8 @@
         /usr/local/nvidia/lib64.
       '';
     };
+
+    package = lib.mkPackageOption pkgs "nvidia-container-toolkit" { };
   };
 
 };
@@ -129,6 +131,7 @@
     let
       script = pkgs.callPackage ./cdi-generate.nix {
         inherit (config.hardware.nvidia-container-toolkit) mounts;
+        nvidia-container-toolkit = config.hardware.nvidia-container-toolkit.package;
         nvidia-driver = config.hardware.nvidia.package;
         deviceNameStrategy = config.hardware.nvidia-container-toolkit.device-name-strategy;
       };
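Since the option is declared with lib.mkPackageOption, hardware.nvidia-container-toolkit.package defaults to pkgs.nvidia-container-toolkit. A minimal sketch of a user configuration overriding it, using only the options this diff declares (the override value is illustrative):

{ pkgs, ... }:
{
  hardware.nvidia-container-toolkit = {
    enable = true;
    # Any derivation providing an `nvidia-ctk` binary can be substituted,
    # as the test below does with a dummy package.
    package = pkgs.nvidia-container-toolkit;
  };
}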
1 change: 1 addition & 0 deletions nixos/tests/all-tests.nix
@@ -699,6 +699,7 @@ in {
   ntfy-sh = handleTest ./ntfy-sh.nix {};
   ntfy-sh-migration = handleTest ./ntfy-sh-migration.nix {};
   ntpd-rs = handleTest ./ntpd-rs.nix {};
+  nvidia-container-toolkit = runTest ./nvidia-container-toolkit.nix;
   nvmetcfg = handleTest ./nvmetcfg.nix {};
   nzbget = handleTest ./nzbget.nix {};
   nzbhydra2 = handleTest ./nzbhydra2.nix {};
149 changes: 149 additions & 0 deletions nixos/tests/nvidia-container-toolkit.nix
@@ -0,0 +1,149 @@
{ pkgs, lib, ... }:
let
testCDIScript = pkgs.writeShellScriptBin "test-cdi" ''
die() {
echo "$1"
exit 1
}

check_file_referential_integrity() {
echo "checking $1 referential integrity"
( ${pkgs.glibc.bin}/bin/ldd "$1" | ${lib.getExe pkgs.gnugrep} "not found" &> /dev/null ) && return 1
return 0
}

check_directory_referential_integrity() {
${lib.getExe pkgs.findutils} "$1" -type f -print0 | while read -d $'\0' file; do
if [[ $(${lib.getExe pkgs.file} "$file" | ${lib.getExe pkgs.gnugrep} ELF) ]]; then
check_file_referential_integrity "$file" || exit 1
else
echo "skipping $file: not an ELF file"
fi
done
}

check_directory_referential_integrity "/usr/bin" || exit 1
check_directory_referential_integrity "${pkgs.addDriverRunpath.driverLink}" || exit 1
check_directory_referential_integrity "/usr/local/nvidia" || exit 1
'';
testContainerImage = pkgs.dockerTools.buildImage {
name = "cdi-test";
tag = "latest";
config = {
Cmd = [ (lib.getExe testCDIScript) ];
};
copyToRoot = with pkgs.dockerTools; [
usrBinEnv
binSh
];
};
emptyCDISpec = ''
{
"cdiVersion": "0.5.0",
"kind": "nvidia.com/gpu",
"devices": [
{
"name": "all",
"containerEdits": {
"deviceNodes": [
{
"path": "/dev/urandom"
}
],
"hooks": [],
"mounts": []
}
}
],
"containerEdits": {
"deviceNodes": [],
"hooks": [],
"mounts": []
}
}
'';
nvidia-container-toolkit = {
enable = true;
package = pkgs.stdenv.mkDerivation {
pname = "nvidia-ctk-dummy";
version = "1.0.0";
dontUnpack = true;
dontBuild = true;

inherit emptyCDISpec;
passAsFile = [ "emptyCDISpec" ];

installPhase = ''
mkdir -p $out/bin $out/share/nvidia-container-toolkit
cp "$emptyCDISpecPath" "$out/share/nvidia-container-toolkit/spec.json"
echo -n "$emptyCDISpec" > "$out/bin/nvidia-ctk";
cat << EOF > "$out/bin/nvidia-ctk"
#!${pkgs.runtimeShell}
cat "$out/share/nvidia-container-toolkit/spec.json"
EOF
chmod +x $out/bin/nvidia-ctk
'';
meta.mainProgram = "nvidia-ctk";
};
};
in
{
name = "nvidia-container-toolkit";
meta = with lib.maintainers; {
maintainers = [ ereslibre ];
};
defaults =
{ config, ... }:
{
environment.systemPackages = with pkgs; [ jq ];
virtualisation.diskSize = lib.mkDefault 10240;
virtualisation.containers.enable = lib.mkDefault true;
hardware = {
inherit nvidia-container-toolkit;
nvidia = {
open = true;
package = config.boot.kernelPackages.nvidiaPackages.stable.open;
};
graphics.enable = lib.mkDefault true;
};
};
nodes = {
no-gpus = {
virtualisation.containers.enable = false;
hardware.graphics.enable = false;
};
one-gpu =
{ pkgs, ... }:
{
environment.systemPackages = with pkgs; [ podman ];
hardware.graphics.enable = true;
};

one-gpu-invalid-host-paths = {
hardware.nvidia-container-toolkit.mounts = [
{
hostPath = "/non-existant-path";
containerPath = "/some/path";
}
];
};
};
testScript = ''
start_all()

with subtest("Generate an empty CDI spec for a machine with no Nvidia GPUs"):
no_gpus.wait_for_unit("nvidia-container-toolkit-cdi-generator.service")
no_gpus.succeed("cat /var/run/cdi/nvidia-container-toolkit.json | jq")

with subtest("Podman loads the generated CDI spec for a machine with an Nvidia GPU"):
one_gpu.wait_for_unit("nvidia-container-toolkit-cdi-generator.service")
one_gpu.succeed("cat /var/run/cdi/nvidia-container-toolkit.json | jq")
one_gpu.succeed("podman load < ${testContainerImage}")
print(one_gpu.succeed("podman run --pull=never --device=nvidia.com/gpu=all -v /run/opengl-driver:/run/opengl-driver:ro cdi-test:latest"))

# Issue: https://github.com/NixOS/nixpkgs/issues/319201
with subtest("The generated CDI spec skips specified non-existent paths in the host"):
one_gpu_invalid_host_paths.wait_for_unit("nvidia-container-toolkit-cdi-generator.service")
one_gpu_invalid_host_paths.fail("grep 'non-existent-path' /var/run/cdi/nvidia-container-toolkit.json")
'';
}