NVIDIA HPC SDK

A Comprehensive Suite of Compilers, Libraries and Tools for HPC. The NVIDIA HPC Software Development Kit (SDK) includes the proven compilers, libraries and software tools essential to maximizing developer productivity and the performance and portability of HPC applications.

IMPORTANT: If you make use of GPU queues, it is mandatory to select a minimum number of CPU cores given by 16 cores/GPU multiplied by the amount of nodes selected and the number of selected GPUs (16 cores/gpu * num. GPUs * num. nodes).

Usage

Example script : testNvidiaHpcSdk.sh

#!/bin/bash

#SBATCH -e nvidiahpcsdk_test%j.err
#SBATCH -o nvidiahpcsdk_test%j.msg
#SBATCH -p agustina_thin
#SBATCH -N 4
#SBATCH --ntasks-per-node=24

module load nvidia-hpc-sdk/24.5

nvcc --help
nvcc --version

Submit with :

sbatch --account=your_project_ID testNvidiaHpcSdk.sh

Example script (Running in ada queue) : sumParallel.sh

#!/bin/bash

#SBATCH -J multi-gpu-test
#SBATCH -e multigpu-test%j.err
#SBATCH -o multigpu-test%j.msg
#SBATCH -p rasmia_ada # queue (partition)
#SBATCH --nodes=1
#SBATCH --gres=gpu:2 # launch in 2-GPUs
#SBATCH --cpus-per-task=32 # 16 cores por GPU * 2 GPUs * 1 nodo

module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
nvidia-smi

mkdir -p /fs/agustina/$(whoami)/test-multigpu
export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
$CUDA_SUM_CODE/./sum-array-multigpu

Example script (Running in hopper queue) : sumParallel.sh

#!/bin/bash

#SBATCH -J multi-gpu-test
#SBATCH -e multigpu-test%j.err
#SBATCH -o multigpu-test%j.msg
#SBATCH -p rasmia_hopper # queue (partition)
#SBATCH --nodelist=agpuh02
#SBATCH --gres=gpu:4 # launch in 4-GPUs
#SBATCH --cpus-per-task=64 # 16 cores por GPU * 4 GPUs * 1 nodo

module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
nvidia-smi

export CUDA_SUM_CODE=/fs/agustina/sergiomtzlosa/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
$CUDA_SUM_CODE/./sum-array-multigpu

Execution in selected nodes: sumParallel.sh

#!/bin/env bash

#SBATCH -J multgpu-test # job name
#SBATCH -o multi-gpu.o%j # output and error file name (%j expands to jobID)
#SBATCH -p rasmia_hopper # H100 (partition)
#SBATCH --gres=gpu:4 # gpus per node
#SBATCH -–nodelist=agpuh[02-03] # two nodes (node 2 and 3)
#SBATCH --ntasks=2 # one task per node
#SBATCH --cpus-per-task=128 # 16 cores por GPU * 4 GPUs * 2 nodos

module load nvidia-hpc-sdk/24.5
export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
mpirun $CUDA_SUM_CODE/./sum-array-multigpu

Submit with :

sbatch --account=your_project_ID sumParallel.sh

CUDA file : sum-array-multigpu.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// for random initialize
#include <stdlib.h>
#include <time.h>

// for memeset
#include <cstring>

void printGpuInfo(int i) {

	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, i);
	printf("Device Number: %d\n", i);
	printf("  Device name: %s\n", prop.name);
	printf("  Memory Clock Rate (KHz): %d\n",  prop.memoryClockRate);
	printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
	printf("  Peak Memory Bandwidth (GB/s): %f\n\n", 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
}

void compare_arrays(int *a, int *b, int size) {
	for (int i = 0; i < size; i++) {
		if (a[i] != b[i]) {
			printf("%d != %d\n", a[i], b[i]);
			printf("Arrays are different!\n\n");
			return;
		}
	}
	printf("Arrays are the same!\n\n");
}

// CUDA Kernel
__global__ void sum_array_gpu(int *a, int *b, int *c, int size) {
	int gid = blockIdx.x * blockDim.x + threadIdx.x;

	if (gid < size) {
		c[gid] = a[gid] + b[gid];
	}
}

void sum_array_cpu(int *a, int *b, int *c, int size) {
	for (int i = 0; i < size; i++) {
		c[i] = a[i] + b[i];
	}
}

int main() {

	int size = 10000;
	int block_size = 128;
	int nDevices;
	int NO_BYTES = size * sizeof(int);

	// host pointers
	int *h_a, *h_b, *gpu_results, *h_c;

	h_a = (int *)malloc(NO_BYTES);
	h_b = (int *)malloc(NO_BYTES);
	h_c = (int *)malloc(NO_BYTES);

	// initialize host pointer
	time_t t;
	srand((unsigned)time(&t));

	for (int i = 0; i < size; i++) {
		h_a[i] = (int)(rand() & 0xff);
	}

	for (int i = 0; i < size; i++) {
		h_b[i] = (int)(rand() & 0xff);
	}

	sum_array_cpu(h_a, h_b, h_c, size);

	cudaGetDeviceCount(&nDevices);

	// device pointer
	int *d_a, *d_b, *d_c;

	for (int dev = 0; dev < nDevices; dev++) {

		printGpuInfo(dev);
		cudaSetDevice(dev);

		gpu_results = (int *)malloc(NO_BYTES);
		memset(gpu_results, 0 , NO_BYTES);

		cudaMalloc((int **)&d_a, NO_BYTES);
		cudaMalloc((int **)&d_b, NO_BYTES);
		cudaMalloc((int **)&d_c, NO_BYTES);

		cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice);
		cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice);

		// launching the grid
		dim3 block(block_size);
		dim3 grid((size/block.x) + 1);

		sum_array_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
		cudaDeviceSynchronize();

		cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost);

		// array comparison
		compare_arrays(gpu_results, h_c, size);

		cudaFree(d_a);
		cudaFree(d_b);
		cudaFree(d_c);
		free(gpu_results);
	}

	free(h_a);
	free(h_b);

}

For a multigpu and multinode launch, it is mandatory to use OpenMPI, here is the same bash script with multigpu support:

#SBATCH -J multi-gpu-test
#SBATCH -e multigpu-test%j.err
#SBATCH -o multigpu-test%j.msg
#SBATCH -p rasmia_ada # queue L40S (partition)
#SBATCH --gres=gpu:4 # gpus per node
#SBATCH --nodes=4 # four nodes
#SBATCH --ntasks=4 # one task per node
#SBATCH --cpus-per-task=256 # 16 cores por GPU * 4 GPUs * 4 nodos

echo "This bash script launches the program on 16 GPUs"

module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
nvidia-smi

export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
mpirun $CUDA_SUM_CODE/./sum-array-multigpu

Submit with :

sbatch --account=your_project_ID sumParallel.sh

OLLAMA execution in Agustina

To use OLLAMA in Agustina it is mandatory to obtain the OLLAMA binary file for Linux distributions:

https://github.com/ollama/ollama/releases/download/v0.3.14/ollama-linux-amd64.tgz

Uncompress the ollama-linux-amd64.tgz file and create directories:

$ mkdir -p /fs/agustina/$(whoami)/test-ollama/prompts # prompts folder
$ mkdir -p /fs/agustina/$(whoami)/test-ollama/models-ollama # new folder to download OLLAMA models
$ cd /fs/agustina/$(whoami)/test-ollama
$ wget https://github.com/ollama/ollama/releases/download/v0.3.14/ollama-linux-amd64.tgz
$ tar xvf ollama-linux-amd64.tgz

OLLAMA uses the /home directory to store the models, indeed in $HOME/.ollama/models, but we change this path by setting the OLLAMA_MODELS environment variable.

Here is an example of running OLLAMA model in Agustina cluster on H100 GPUs:

bash file : test-ollama.sh

#!/bin/bash

#SBATCH -J ollama-gpu-test
#SBATCH -e ollama-test%j.err
#SBATCH -o ollama-test%j.msg
#SBATCH -p rasmia_hopper # H100 queue (partition)
#SBATCH --nodelist=agpuh02
#SBATCH --gres=gpu:4 # four GPUs
#SBATCH --cpus-per-task=64 # 16 cores por GPU * 4 GPUs * 1 nodo

module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"

module load python-math/3.11.4

python --version
nvidia-smi

echo "Current path: $(pwd)"

export BASE_OLLAMA_TEST=/fs/agustina/$(whoami)/test-ollama
export PROMPTS_PATH=$BASE_OLLAMA_TEST/prompts
export OLLAMA_BIN=$BASE_OLLAMA_TEST/bin

echo "OLLAMA PATH: $OLLAMA_BIN"

export OLLAMA_NUMPARALLEL=4
export OLLAMA_LOAD_TIMEOUT=900

# change the models download path with this environment variable
export OLLAMA_MODELS=$BASE_OLLAMA_TEST/models-ollama

$OLLAMA_BIN/./ollama serve &
$OLLAMA_BIN/./ollama list

for i in llama3.1:8b-instruct-q2_K llama3.1:8b-instruct-q8_0; do
        touch answer1-$i.txt && >answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo "PROMPT:" >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        cat $PROMPTS_PATH/prompt1.txt >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo "ANSWER:" >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo $(< $PROMPTS_PATH/prompt1.txt) | $OLLAMA_BIN/./ollama run $i >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo "--------------------------------" >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo "PROMPT:" >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        cat $PROMPTS_PATH/prompt2.txt >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo "ANSWER:" >> answer1-$i.txt
        echo "" >> answer1-$i.txt
        echo $(< $PROMPTS_PATH/prompt2.txt) | $OLLAMA_BIN/./ollama run $i >> answer1-$i.txt
done

$OLLAMA_BIN/./ollama list

echo "DONE!"

The script takes the input prompts from files $PROMPTS_PATH/prompt1.txt and $PROMPTS_PATH/prompt2.txt.

Submit with :

sbatch --account=your_project_ID test-ollama.sh

More info :

https://developer.nvidia.com/hpc-sdk

Documentacion BIFI

NVIDIA HPC SDK

Usage

OLLAMA execution in Agustina