llama.cpp Bindings
llama-cpp-python provides simple Python bindings for the llama.cpp library, letting you load and run GGUF models directly from Python.
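As a quick orientation, here is a minimal sketch of the library's high-level chat API (model.gguf is a placeholder for a GGUF file you have already downloaded; the rest of this page shows a complete Slurm workflow using the lower-level completion API):
from llama_cpp import Llama

# Load a local GGUF model (placeholder path); n_gpu_layers=-1 offloads all layers to the GPU
llm = Llama(model_path="model.gguf", n_ctx=512, n_gpu_layers=-1)

# The high-level chat API applies the model's own chat template
response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Who is the CEO of Apple?"}],
    max_tokens=64,
)
print(response["choices"][0]["message"]["content"])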
Usage
Example script: test-llamacpp-bindings.sh
#!/bin/bash
#SBATCH -J test-llamacpp-bindings
#SBATCH -e test-llamacpp-bindings-%j.err
#SBATCH -o test-llamacpp-bindings-%j.msg
#SBATCH -p hopper # queue (partition)
#SBATCH --nodelist=agpuh03 # select one node
#SBATCH --gres=gpu:1 # select one GPU
module load llama-cpp-python/0.3.1
# check available GPUs
nvidia-smi
python test-llamacpp-bindings.py
echo "DONE!"
Submit with:
sbatch --account=your_project_ID test-llamacpp-bindings.sh
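Once submitted, you can monitor the job with squeue -u $USER; its standard output and error streams are written to the files set with #SBATCH -o and #SBATCH -e, with %j replaced by the job ID.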
Example file: test-llamacpp-bindings.py
import os
import urllib.request
from llama_cpp import Llama
def download_file(file_link, filename):
    # Download the file only if it does not already exist
    if not os.path.isfile(filename):
        urllib.request.urlretrieve(file_link, filename)
        print("File downloaded successfully.")
    else:
        print("File already exists.")
# Downloading a GGUF model from Hugging Face
model_url = "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q4_0.gguf"
filename = "zephyr-7b-beta.Q4_0.gguf"
download_file(model_url, filename)

# n_gpu_layers=-1 offloads all model layers to the GPU
llm = Llama(model_path="zephyr-7b-beta.Q4_0.gguf", n_ctx=512, n_batch=126, n_gpu_layers=-1)
def generate_text(
    prompt="Who is the CEO of Apple?",
    max_tokens=256,
    temperature=0.1,
    top_p=0.5,
    echo=False,
    stop=["#"],
):
    # Run the completion and return only the generated text
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop,
    )
    output_text = output["choices"][0]["text"].strip()
    return output_text
def generate_prompt_from_template(user_message):
    # Wrap the user message in the ChatML-style template used by Zephyr;
    # ending with the assistant tag prompts the model to answer the user turn
    chat_prompt_template = f"""<|im_start|>system
You are a helpful chatbot.<|im_end|>
<|im_start|>user
{user_message}<|im_end|>
<|im_start|>assistant"""
    return chat_prompt_template
prompt = generate_prompt_from_template(
    "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions."
)

print(generate_text(
    prompt,
    max_tokens=356,
))
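The completion API also supports streaming. As a small sketch reusing the llm and prompt objects defined above, passing stream=True turns the call into an iterator over partial results:
# Stream tokens as they are generated instead of waiting for the full completion
for chunk in llm(prompt, max_tokens=128, stop=["#"], stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()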
More info: https://llama-cpp-python.readthedocs.io/ and https://github.com/abetlen/llama-cpp-python