Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 134 additions & 3 deletions scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -447,9 +447,10 @@ find_cuda_libraries_on_host() {
fi
}

# Symlink structure changed from 2025.06 onwards. This function reflects the symlinking as it was done for EESSI 2023.06
# Actually symlinks the Matched libraries to correct folders.
# Then also creates "host" and "latest" folder symlinks
symlink_mode () {
symlink_mode_202306 () {
# First let's make sure the driver libraries are not already in place
# Have to link drivers = True
link_drivers=1
Expand Down Expand Up @@ -617,6 +618,132 @@ symlink_mode () {

}


# Symlink structure changed from 2025.06 onwards. This function reflects the new symlinking
# Actually symlinks the Matched libraries to correct folders.
symlink_mode () {
# First let's make sure the driver libraries are not already in place
# Have to link drivers = True
link_drivers=1

# Do some checks on existence of links and that we don't end up at /dev/null (the default), so we can print some informative information
# One downside is that we can't explicitely check if something is a variant symlink, so we'll just assume that if it's a link AND it
# lives in our CVMFS repository, it must be a variant symlink
nvidia_trusted_dir="${EESSI_EPREFIX}/lib/nvidia"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean that the script will no longer work for 2023.06?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, yeah, that's annoying, this script is in an unversioned prefix. I mean, if we deploy this only for 2025.06, we keep the old version for 2023.06. But then if we want to update that, we have to revert all changes, etc. Maybe we should just duplicate the script? I.e. create something like scripts/gpu_support/nvidia/2023.06/link_nvidia_host_libraries.sh? Or should it be at higher level 2023.06/scripts/gpu_support...?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I did solve it differently in the end. Only the symlink_mode function changed. I now just duplicated it, and call the correct one based on the value of EESSI_VERSION (note that check_eessi_initialized is called before symlink_mode, so we can be sure that EESSI is initialized and that EESSI_VERSION has been set by the time it gets to that part of the script).

The duplication means there is some code duplication - but actually not to much: the function has changed substantially, so I think the small amount of duplication that is still left is acceptable - especially since it's unlikely we'll still change anything on the 2023.06 side.

if [[ -L "$nvidia_trusted_dir" ]]; then
target1=$(readlink "$nvidia_trusted_dir")
log_verbose "$nvidia_trusted_dir is a CVMFS variant symlink (EESSI_${EESSI_VERSION//./}_NVIDIA_OVERRIDE) currently pointing to $target1"
# If this is a link, and if it lives in the EESSI_CVMFS_REPO, we assume this is a variant symlink
if [[ -L "$target1" && "$target1" == "$EESSI_CVMFS_REPO"/* ]]; then
target2=$(readlink "$target1")
msg="${target1} appears to be a CVMFS variant symlink (EESSI_NVIDIA_OVERRIDE_DEFAULT) currently pointing to ${target2}."
msg="${msg} Proceeding to install host symlinks in ${target2}."
log_verbose "${msg}"

# Check if target2 isn't /dev/null (the default target of the EESSI_NVIDIA_OVERRIDE_DEFAULT variant symlink)
# If it is, suggest setting EESSI_NVIDIA_OVERRIDE_DEFAULT or EESSI_${EESSI_VERSION//./}_NVIDIA_OVERRIDE
if [[ $target2 == /dev/null ]]; then
msg="${nvidia_trusted_dir} is a symlink pointing to ${target1}, which is a symlink pointing to ${target2}\n"
msg="${msg}If you want to symlink the drivers in a single location for all EESSI versions, please define"
msg="${msg} the EESSI_NVIDIA_OVERRIDE_DEFAULT variant symlink in your local CVMFS configuration to point to"
msg="${msg} writeable location. This will change the target of symlink ${target1}.\n"
msg="${msg}If you want to symlink the drivers only for this version of EESSI (${EESSI_VERSION}), please define"
msg="${msg} the EESSI_${EESSI_VERSION//./}_NVIDIA_OVERRIDE variant symlink in your local CVMFS configuration to point to"
msg="${msg} writeable location. This will change the target of symlink ${nvidia_trusted_dir}.\n"
fatal_error "${msg}"
fi
else
msg="$target1 does not seem to be a CVMFS variant symlink, suggesting that EESSI_${EESSI_VERSION//./}_NVIDIA_OVERRIDE"
msg="${msg} was set in the CVMFS config. Proceeding to install host symlinks in $target1."
log_verbose "${msg}"
fi
else
msg="$nvidia_trusted_dir is expected to be a symlink, but it's not. This will likely fail"
msg="${msg} as CVMFS repositories are read-only. Proceeding anyway, but expect this to fail."
echo_yellow "${msg}"
fi

# Make sure that target of nvidia_trusted_dir variant symlink is an existing directory
install_target=$(readlink -f "$nvidia_trusted_dir")
echo "Ensure the final target of ${nvidia_trusted_dir} (${install_target}) exists"
log_verbose "Target directory in which driver symlinks will be installed: ${install_target}"
if [ ! -d "$install_target" ]; then
check_global_read
if ! create_directory_structure "$install_target"; then
fatal_error "No write permissions to directory ${install_target}"
fi
fi

# Define file to store driver version that was symlinked
host_injection_driver_version_file="${install_target}/driver_version.txt"
log_verbose "host_injection_driver_version_file: ${host_injection_driver_version_file}"

# Check if drivers are already linked with correct version
# This is done by comparing host_injection_driver_version_file (driver_version.txt)
# This is needed when updating GPU drivers.
if [ -e "$host_injection_driver_version_file" ]; then
if grep -q "$HOST_GPU_DRIVER_VERSION" "$host_injection_driver_version_file"; then
echo_green "The host GPU driver libraries (v${HOST_GPU_DRIVER_VERSION}) have already been linked! (based on ${host_injection_driver_version_file})"
# The GPU libraries were already linked for this version of CUDA driver
# Have to link drivers = False
link_drivers=0
else
# There's something there but it is out of date
echo_yellow "The host GPU driver libraries version have changed. Now its: (v${HOST_GPU_DRIVER_VERSION})"
echo_yellow "Cleaning out outdated symlinks."
rm "${install_target}"/* || fatal_error "Unable to remove files under '${install_target}'."
fi
fi

# Link all matched_libraries from Nvidia to correct host_injection folder
# This step is only run, when linking of drivers is needed (eg. link_drivers==1)
# Setup variable to track if some drivers were actually linked this run.
drivers_linked=0

# Have to link drivers
if [ "$link_drivers" -eq 1 ]; then
# Link the matched libraries

cd "${install_target}" || fatal_error "Failed to cd to ${install_target}"
log_verbose "Changed directory to: $PWD"

# Make symlinks to all the interesting libraries
# Loop over each matched library
for library in "${MATCHED_LIBRARIES[@]}"; do
log_verbose "Linking library: ${library}"

# Get just the library filename
lib_name=$(basename "$library")

# Check if the symlink already exists
if [ -L "$lib_name" ]; then
# Check if it's pointing to the same target
target=$(readlink "$lib_name")
if [ "$target" = "$library" ]; then
log_verbose "Symlink for $lib_name already exists and points to correct target"
continue
else
log_verbose "Symlink for $lib_name exists but points to wrong target: $target, updating..."
rm "$lib_name"
fi
fi

# Create a symlink in the current directory
# and check if the symlink was created successfully
if ! ln -s "$library" .
then
fatal_error "Error: Failed to create symlink for library $library in $PWD"
fi
done

# Inject driver and CUDA versions into the directory
echo "$HOST_GPU_DRIVER_VERSION" > driver_version.txt
echo "$HOST_GPU_CUDA_VERSION" > cuda_version.txt

drivers_linked=1
fi
}

# Logging function for verbose mode
# TODO: move to utils?
log_verbose() {
Expand All @@ -635,7 +762,7 @@ check_eessi_initialised

# Verify nvidia-smi availability
log_verbose "Checking for nvidia-smi command..."
command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; return 1; }
command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit 1; }

# Parse command line arguments
while [[ "$#" -gt 0 ]]; do
Expand Down Expand Up @@ -685,7 +812,11 @@ fi

# === 5b. Symlink Mode ===
# If we haven't already exited, we may need to create the symlinks
symlink_mode
if [ "$EESSI_VERSION" == '2023.06' ]; then
symlink_mode_202306
else
symlink_mode
fi

# If everything went OK, show success message
echo_green "Host NVIDIA GPU drivers linked successfully for EESSI"
Expand Down