57 changes: 55 additions & 2 deletions examples/simple/src/main.rs
@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::{AddBos, Special};
use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
#[cfg(any(feature = "cuda", feature = "vulkan"))]
#[clap(long)]
disable_gpu: bool,
/// Set main GPU device index (default: 0)
///
/// Setting this option disables multi-GPU.
#[arg(
long,
help = "Set main GPU device index (default: 0). Disables multi-GPU."
)]
main_gpu: Option<i32>,
/// Set devices to use by index
///
/// This option overrides `main-gpu` and enables multi-GPU.
#[arg(
long,
value_delimiter = ',',
help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
)]
devices: Option<Vec<usize>>,
#[cfg(any(feature = "cuda", feature = "vulkan"))]
#[arg(long, help = "Keep MoE layers on CPU")]
cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
ctx_size: Option<NonZeroU32>,
#[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
verbose: bool,
#[arg(long, help = "list backend devices")]
list_devices: bool,
}

/// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
file,
#[cfg(any(feature = "cuda", feature = "vulkan"))]
disable_gpu,
main_gpu,
devices,
#[cfg(any(feature = "cuda", feature = "vulkan"))]
cmoe,
key_value_overrides,
@@ -140,6 +161,7 @@
threads_batch,
ctx_size,
verbose,
list_devices,
} = Args::parse();

if verbose {
@@ -151,8 +173,26 @@
// init LLM
let backend = LlamaBackend::init()?;

if list_devices {
let devices = llama_cpp_2::list_llama_ggml_backend_devices();
for (i, dev) in devices.iter().enumerate() {
println!("Device {i:>2}: {}", dev.name);
println!(" Description: {}", dev.description);
println!(" Device Type: {:?}", dev.device_type);
println!(" Backend: {}", dev.backend);
println!(
" Memory total: {} MiB",
dev.memory_total / 1024 / 1024
);
println!(
" Memory free: {} MiB",
dev.memory_free / 1024 / 1024
);
}
}

// offload all layers to the gpu
let model_params = {
let mut model_params = {
#[cfg(any(feature = "cuda", feature = "vulkan"))]
if !disable_gpu {
LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@
LlamaModelParams::default()
};

if let Some(devices) = devices {
model_params = model_params
.with_devices(&devices)
.with_context(|| "invalid device index in --devices")?;
if main_gpu.is_some() {
eprintln!("warning: --devices overrides --main-gpu");
}
} else if let Some(main_gpu) = main_gpu {
model_params = model_params.with_main_gpu(main_gpu);
// Enable single GPU mode
model_params = model_params.with_split_mode(LlamaSplitMode::None);
}

let prompt = if let Some(str) = prompt {
if file.is_some() {
bail!("either prompt or file must be specified, but not both")
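
For reference, the two new flags combine as follows: an explicit `--devices` list takes precedence and enables multi-GPU, while `--main-gpu` alone pins the model to a single device. The sketch below is a minimal illustration of that mapping outside the example binary; it assumes the builder methods shown in this diff (`with_devices` returning a `Result` whose error converts into `anyhow::Error`, infallible `with_main_gpu`/`with_split_mode`), `anyhow` as the error crate, and a GPU-enabled build, matching the example's dependencies.

```rust
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};

/// Map the CLI-level selection onto model parameters. An explicit device
/// list wins over `main_gpu`, mirroring the logic added to examples/simple
/// above; the exact signatures are assumed from this diff.
fn device_params(
    devices: Option<&[usize]>,
    main_gpu: Option<i32>,
) -> anyhow::Result<LlamaModelParams> {
    // Offload all layers to the GPU, as the example does by default.
    let params = LlamaModelParams::default().with_n_gpu_layers(1000);
    Ok(match (devices, main_gpu) {
        // Explicit device list: multi-GPU across exactly these indices.
        (Some(devices), _) => params.with_devices(devices)?,
        // Main GPU only: disable layer splitting so everything stays on one device.
        (None, Some(main_gpu)) => params
            .with_main_gpu(main_gpu)
            .with_split_mode(LlamaSplitMode::None),
        // No selection: keep llama.cpp's defaults.
        (None, None) => params,
    })
}
```

As in the example above, `--devices` quietly overrides `--main-gpu` apart from the printed warning.
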
103 changes: 103 additions & 0 deletions llama-cpp-2/src/lib.rs
@@ -66,6 +72,12 @@ pub enum LLamaCppError {
#[error(transparent)]
EmbeddingError(#[from] EmbeddingsError),
// See [`LlamaSamplerError`]
/// Backend device not found
#[error("Backend device {0} not found")]
BackendDeviceNotFound(usize),
/// Max devices exceeded
#[error("Maximum number of devices exceeded (max is {0})")]
MaxDevicesExceeded(usize),
}

/// There was an error while getting the chat template from a model.
@@ -349,6 +355,103 @@ pub fn llama_supports_mlock() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Backend device type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
/// CPU device
Cpu,
/// ACCEL device
Accelerator,
/// GPU device
Gpu,
/// iGPU device
IntegratedGpu,
/// Unknown device type
Unknown,
}

/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
/// The index of the device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
pub index: usize,
/// The name of the device (e.g. "Vulkan0")
pub name: String,
/// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
pub description: String,
/// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
pub backend: String,
/// Total memory of the device in bytes
pub memory_total: usize,
/// Free memory of the device in bytes
pub memory_free: usize,
/// Device type
pub device_type: LlamaBackendDeviceType,
}

/// List ggml backend devices
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
let mut devices = Vec::new();
for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
unsafe {
let dev = llama_cpp_sys_2::ggml_backend_dev_get(i);
let mut props = std::mem::zeroed();
llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
let name = props.name;
let name = if name.is_null() {
String::new()
} else {
std::ffi::CStr::from_ptr(name).to_string_lossy().to_string()
};
let description = props.description;
let description = if description.is_null() {
String::new()
} else {
std::ffi::CStr::from_ptr(description)
.to_string_lossy()
.to_string()
};
let backend = llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev);
let backend_name = llama_cpp_sys_2::ggml_backend_reg_name(backend);
let backend = if backend_name.is_null() {
String::new()
} else {
std::ffi::CStr::from_ptr(backend_name)
.to_string_lossy()
.to_string()
};
let memory_total = props.memory_total;
let memory_free = props.memory_free;
let device_type = match props.type_ {
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => {
LlamaBackendDeviceType::Accelerator
}
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => {
LlamaBackendDeviceType::IntegratedGpu
}
_ => LlamaBackendDeviceType::Unknown,
};
devices.push(LlamaBackendDevice {
index: i,
name,
description,
backend,
memory_total,
memory_free,
device_type,
});
}
}
devices
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
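
The listing API added to `llama-cpp-2/src/lib.rs` pairs naturally with the new device selection. Below is a minimal sketch, under the same assumptions as above (the types and functions introduced in this diff, `anyhow` for errors), that enumerates the ggml backend devices after backend initialization, keeps only discrete GPUs, and passes their indices to `with_devices`; an out-of-range index would surface as `LLamaCppError::BackendDeviceNotFound`.

```rust
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::{list_llama_ggml_backend_devices, LlamaBackendDeviceType};

fn main() -> anyhow::Result<()> {
    // Backends must be initialized before ggml can report its devices,
    // matching the ordering used in examples/simple above.
    let _backend = LlamaBackend::init()?;

    // Keep only discrete GPUs; `index` is the value `with_devices` expects.
    let gpu_indices: Vec<usize> = list_llama_ggml_backend_devices()
        .into_iter()
        .filter(|dev| dev.device_type == LlamaBackendDeviceType::Gpu)
        .map(|dev| dev.index)
        .collect();

    // Offload all layers, restricted to the selected GPUs.
    let _model_params = LlamaModelParams::default()
        .with_n_gpu_layers(1000)
        .with_devices(&gpu_indices)?;

    Ok(())
}
```
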