NVIDIA HPC SDK
A Comprehensive Suite of Compilers, Libraries and Tools for HPC. The NVIDIA HPC Software Development Kit (SDK) includes the proven compilers, libraries and software tools essential to maximizing developer productivity and the performance and portability of HPC applications.
IMPORTANT: If you use the GPU queues, you must request a minimum number of CPU cores equal to 16 cores per GPU multiplied by the number of GPUs per node and the number of nodes (16 cores/GPU * num. GPUs * num. nodes).
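For example, a single-node job that uses 2 GPUs must request at least 16 * 2 * 1 = 32 cores. A minimal sketch of such a request (the values are only illustrative):
#SBATCH --nodes=1
#SBATCH --gres=gpu:2
#SBATCH --cpus-per-task=32 # 16 cores/GPU * 2 GPUs * 1 node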
Usage
Example script : testNvidiaHpcSdk.sh
#!/bin/bash
#SBATCH -e nvidiahpcsdk_test%j.err
#SBATCH -o nvidiahpcsdk_test%j.msg
#SBATCH -p thin
#SBATCH -N 4
#SBATCH --ntasks-per-node=24
module load nvidia-hpc-sdk/24.5
nvcc --help
nvcc --version
Submit with :
sbatch --account=your_project_ID testNvidiaHpcSdk.sh
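Once the job is submitted, you can follow it with standard Slurm commands and read the files defined by the -o/-e directives (replace <jobID> with the job ID reported by sbatch):
squeue -u $(whoami) # list your pending and running jobs
cat nvidiahpcsdk_test<jobID>.msg # job standard output (%j expands to the job ID)
cat nvidiahpcsdk_test<jobID>.err # job standard error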
Example script (running in the ada queue) : sumParallel.sh
#!/bin/bash
#SBATCH -J multi-gpu-test
#SBATCH -e multigpu-test%j.err
#SBATCH -o multigpu-test%j.msg
#SBATCH -p ada # queue (partition)
#SBATCH --nodes=1
#SBATCH --gres=gpu:2 # request 2 GPUs
#SBATCH --cpus-per-task=32 # 16 cores per GPU * 2 GPUs * 1 node
module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
nvidia-smi
mkdir -p /fs/agustina/$(whoami)/test-multigpu
export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
$CUDA_SUM_CODE/sum-array-multigpu
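The script assumes that the source file sum-array-multigpu.cu (listed further below) is already in the test-multigpu directory. A simple way to stage it before submitting, assuming the file is in your current working directory:
mkdir -p /fs/agustina/$(whoami)/test-multigpu
cp sum-array-multigpu.cu /fs/agustina/$(whoami)/test-multigpu/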
Example script (running in the hopper queue) : sumParallel.sh
#!/bin/bash
#SBATCH -J multi-gpu-test
#SBATCH -e multigpu-test%j.err
#SBATCH -o multigpu-test%j.msg
#SBATCH -p hopper # queue (partition)
#SBATCH --nodelist=agpuh02
#SBATCH --gres=gpu:4 # request 4 GPUs
#SBATCH --cpus-per-task=64 # 16 cores per GPU * 4 GPUs * 1 node
module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
nvidia-smi
export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
$CUDA_SUM_CODE/sum-array-multigpu
Execution on selected nodes: sumParallel.sh
#!/usr/bin/env bash
#SBATCH -J multgpu-test # job name
#SBATCH -o multi-gpu.o%j # output and error file name (%j expands to jobID)
#SBATCH -p hopper # H100 (partition)
#SBATCH --gres=gpu:4 # gpus per node
#SBATCH --nodelist=agpuh[02-03] # two nodes (node 2 and 3)
#SBATCH --ntasks=2 # one task per node
#SBATCH --cpus-per-task=128 # 16 cores per GPU * 4 GPUs * 2 nodes
module load nvidia-hpc-sdk/24.5
export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
mpirun $CUDA_SUM_CODE/sum-array-multigpu
Submit with :
sbatch --account=your_project_ID sumParallel.sh
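To verify that the tasks really land on the requested nodes and that each one sees its GPUs, a quick check (a sketch you can place in the script just before the real mpirun line) is to print the hostname and the visible devices from every task:
mpirun bash -c 'echo "$(hostname): $(nvidia-smi -L)"'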
CUDA file : sum-array-multigpu.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// for random initialization
#include <stdlib.h>
#include <time.h>
// for memset
#include <cstring>
void printGpuInfo(int i) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device Number: %d\n", i);
    printf(" Device name: %s\n", prop.name);
    printf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
    printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
    printf(" Peak Memory Bandwidth (GB/s): %f\n\n", 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
}
void compare_arrays(int *a, int *b, int size) {
    for (int i = 0; i < size; i++) {
        if (a[i] != b[i]) {
            printf("%d != %d\n", a[i], b[i]);
            printf("Arrays are different!\n\n");
            return;
        }
    }
    printf("Arrays are the same!\n\n");
}
// CUDA Kernel
__global__ void sum_array_gpu(int *a, int *b, int *c, int size) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < size) {
        c[gid] = a[gid] + b[gid];
    }
}
void sum_array_cpu(int *a, int *b, int *c, int size) {
    for (int i = 0; i < size; i++) {
        c[i] = a[i] + b[i];
    }
}
int main() {
    int size = 10000;
    int block_size = 128;
    int nDevices;
    int NO_BYTES = size * sizeof(int);
    // host pointers
    int *h_a, *h_b, *gpu_results, *h_c;
    h_a = (int *)malloc(NO_BYTES);
    h_b = (int *)malloc(NO_BYTES);
    h_c = (int *)malloc(NO_BYTES);
    // initialize host arrays with random values
    time_t t;
    srand((unsigned)time(&t));
    for (int i = 0; i < size; i++) {
        h_a[i] = (int)(rand() & 0xff);
    }
    for (int i = 0; i < size; i++) {
        h_b[i] = (int)(rand() & 0xff);
    }
    // CPU reference result
    sum_array_cpu(h_a, h_b, h_c, size);
    cudaGetDeviceCount(&nDevices);
    // device pointers
    int *d_a, *d_b, *d_c;
    // run the same vector addition on every visible GPU
    for (int dev = 0; dev < nDevices; dev++) {
        printGpuInfo(dev);
        cudaSetDevice(dev);
        gpu_results = (int *)malloc(NO_BYTES);
        memset(gpu_results, 0, NO_BYTES);
        cudaMalloc((int **)&d_a, NO_BYTES);
        cudaMalloc((int **)&d_b, NO_BYTES);
        cudaMalloc((int **)&d_c, NO_BYTES);
        cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice);
        // launching the grid
        dim3 block(block_size);
        dim3 grid((size / block.x) + 1);
        sum_array_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
        cudaDeviceSynchronize();
        cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost);
        // compare the GPU result against the CPU reference
        compare_arrays(gpu_results, h_c, size);
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
        free(gpu_results);
    }
    free(h_a);
    free(h_b);
    free(h_c);
}
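The program runs the full array addition once on every GPU that CUDA can see. To test it on a subset of the allocated GPUs, you can restrict device visibility with the standard CUDA_VISIBLE_DEVICES variable before launching it, for example:
export CUDA_VISIBLE_DEVICES=0,1 # the program will only see the first two GPUs
$CUDA_SUM_CODE/sum-array-multigpu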
For a multi-GPU, multi-node launch it is necessary to use OpenMPI. Here is the same batch script extended with multi-node support:
#!/bin/bash
#SBATCH -J multi-gpu-test
#SBATCH -e multigpu-test%j.err
#SBATCH -o multigpu-test%j.msg
#SBATCH -p ada # queue L40S (partition)
#SBATCH --gres=gpu:4 # gpus per node
#SBATCH --nodes=4 # four nodes
#SBATCH --ntasks=4 # one task per node
#SBATCH --cpus-per-task=256 # 16 cores per GPU * 4 GPUs * 4 nodes
echo "This bash script launches the program on 16 GPUs"
module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
nvidia-smi
export CUDA_SUM_CODE=/fs/agustina/$(whoami)/test-multigpu
nvcc $CUDA_SUM_CODE/sum-array-multigpu.cu -o $CUDA_SUM_CODE/sum-array-multigpu
mpirun $CUDA_SUM_CODE/sum-array-multigpu
Submit with :
sbatch --account=your_project_ID sumParallel.sh
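After the job has finished, you can review how it ran (elapsed time, allocated resources, final state) through Slurm accounting, for example:
sacct -j <jobID> --format=JobID,JobName,Partition,AllocCPUS,Elapsed,State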
OLLAMA execution in Agustina
To use OLLAMA in Agustina you must first obtain the OLLAMA binary release for Linux distributions. Create the working directories, then download and uncompress the ollama-linux-amd64.tgz file:
$ mkdir -p /fs/agustina/$(whoami)/test-ollama/prompts # prompts folder
$ mkdir -p /fs/agustina/$(whoami)/test-ollama/models-ollama # new folder to download OLLAMA models
$ cd /fs/agustina/$(whoami)/test-ollama
$ wget https://github.com/ollama/ollama/releases/download/v0.3.14/ollama-linux-amd64.tgz
$ tar xvf ollama-linux-amd64.tgz
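The archive unpacks the executable under the bin/ subdirectory (which is what the batch script below expects); a quick sanity check is to print its version:
$ ./bin/ollama --version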
By default OLLAMA stores the downloaded models under the /home directory, specifically in $HOME/.ollama/models, but we change this path by setting the OLLAMA_MODELS environment variable.
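As a minimal interactive sketch (assuming the node you are on has network access to download models), the variable is used like this; the model name is only an example:
$ export OLLAMA_MODELS=/fs/agustina/$(whoami)/test-ollama/models-ollama
$ /fs/agustina/$(whoami)/test-ollama/bin/ollama serve &
$ sleep 10 # give the server a moment to start
$ /fs/agustina/$(whoami)/test-ollama/bin/ollama pull llama3.1:8b-instruct-q8_0 # downloaded into $OLLAMA_MODELS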
Here is an example of running an OLLAMA model on the Agustina cluster on H100 GPUs:
bash file : test-ollama.sh
#!/bin/bash
#SBATCH -J ollama-gpu-test
#SBATCH -e ollama-test%j.err
#SBATCH -o ollama-test%j.msg
#SBATCH -p hopper # H100 queue (partition)
#SBATCH --nodelist=agpuh02
#SBATCH --gres=gpu:4 # four GPUs
#SBATCH --cpus-per-task=64 # 16 cores per GPU * 4 GPUs * 1 node
module load nvidia-hpc-sdk/24.5
echo "Loaded NVIDIA SDK !!!"
module load python-math/3.11.4
python --version
nvidia-smi
echo "Current path: $(pwd)"
export BASE_OLLAMA_TEST=/fs/agustina/$(whoami)/test-ollama
export PROMPTS_PATH=$BASE_OLLAMA_TEST/prompts
export OLLAMA_BIN=$BASE_OLLAMA_TEST/bin
echo "OLLAMA PATH: $OLLAMA_BIN"
export OLLAMA_NUM_PARALLEL=4
export OLLAMA_LOAD_TIMEOUT=900
# change the models download path with this environment variable
export OLLAMA_MODELS=$BASE_OLLAMA_TEST/models-ollama
$OLLAMA_BIN/ollama serve &
sleep 10 # give the server a few seconds to start before issuing commands
$OLLAMA_BIN/ollama list
for i in llama3.1:8b-instruct-q2_K llama3.1:8b-instruct-q8_0; do
> answer1-$i.txt # create (or empty) the answer file
echo "" >> answer1-$i.txt
echo "PROMPT:" >> answer1-$i.txt
echo "" >> answer1-$i.txt
cat $PROMPTS_PATH/prompt1.txt >> answer1-$i.txt
echo "" >> answer1-$i.txt
echo "ANSWER:" >> answer1-$i.txt
echo "" >> answer1-$i.txt
echo "$(< $PROMPTS_PATH/prompt1.txt)" | $OLLAMA_BIN/ollama run $i >> answer1-$i.txt
echo "" >> answer1-$i.txt
echo "--------------------------------" >> answer1-$i.txt
echo "" >> answer1-$i.txt
echo "PROMPT:" >> answer1-$i.txt
echo "" >> answer1-$i.txt
cat $PROMPTS_PATH/prompt2.txt >> answer1-$i.txt
echo "" >> answer1-$i.txt
echo "ANSWER:" >> answer1-$i.txt
echo "" >> answer1-$i.txt
echo "$(< $PROMPTS_PATH/prompt2.txt)" | $OLLAMA_BIN/ollama run $i >> answer1-$i.txt
done
$OLLAMA_BIN/ollama list
echo "DONE!"
The script takes the input prompts from the files $PROMPTS_PATH/prompt1.txt and $PROMPTS_PATH/prompt2.txt.
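The prompt files are not created by the script, so they must exist before you submit the job; for example, you can create them with any questions you like:
$ echo "Explain in two sentences what a GPU is." > /fs/agustina/$(whoami)/test-ollama/prompts/prompt1.txt
$ echo "Summarize the advantages of quantized LLM models." > /fs/agustina/$(whoami)/test-ollama/prompts/prompt2.txt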
Submit with :
sbatch --account=your_project_ID test-ollama.sh
More info :