57 changes: 55 additions & 2 deletions examples/simple/src/main.rs
@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::llama_batch::LlamaBatch;
use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
use llama_cpp_2::model::LlamaModel;
use llama_cpp_2::model::{AddBos, Special};
use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
#[cfg(any(feature = "cuda", feature = "vulkan"))]
#[clap(long)]
disable_gpu: bool,
/// Set main GPU device index (default: 0)
///
/// Setting this option disables multi-GPU.
#[arg(
long,
help = "Set main GPU device index (default: 0). Disables multi-GPU."
)]
main_gpu: Option<i32>,
/// Set devices to use by index
///
/// This option overrides `main-gpu` and enables multi-GPU.
#[arg(
long,
value_delimiter = ',',
help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
)]
devices: Option<Vec<usize>>,
#[cfg(any(feature = "cuda", feature = "vulkan"))]
#[arg(long, help = "Keep MoE layers on CPU")]
cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
ctx_size: Option<NonZeroU32>,
#[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
verbose: bool,
#[arg(long, help = "list backend devices")]
list_devices: bool,
}

/// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
file,
#[cfg(any(feature = "cuda", feature = "vulkan"))]
disable_gpu,
main_gpu,
devices,
#[cfg(any(feature = "cuda", feature = "vulkan"))]
cmoe,
key_value_overrides,
@@ -140,6 +161,7 @@
threads_batch,
ctx_size,
verbose,
list_devices,
} = Args::parse();

if verbose {
@@ -151,8 +173,26 @@
// init LLM
let backend = LlamaBackend::init()?;

if list_devices {
let devices = llama_cpp_2::list_llama_ggml_backend_devices();
for (i, dev) in devices.iter().enumerate() {
println!("Device {i:>2}: {}", dev.name);
println!(" Description: {}", dev.description);
println!(" Device Type: {:?}", dev.device_type);
println!(" Backend: {}", dev.backend);
println!(
" Memory total: {} MiB",
dev.memory_total / 1024 / 1024
);
println!(
" Memory free: {} MiB",
dev.memory_free / 1024 / 1024
);
}
}

// offload all layers to the gpu
let model_params = {
let mut model_params = {
#[cfg(any(feature = "cuda", feature = "vulkan"))]
if !disable_gpu {
LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@
LlamaModelParams::default()
};

if let Some(devices) = devices {
model_params = model_params
.with_devices(&devices)
.with_context(|| "invalid device index in --devices")?;
if main_gpu.is_some() {
eprintln!("warning: --devices overrides --main-gpu");
}
} else if let Some(main_gpu) = main_gpu {
model_params = model_params.with_main_gpu(main_gpu);
// Enable single GPU mode
model_params = model_params.with_split_mode(LlamaSplitMode::None);
}

let prompt = if let Some(str) = prompt {
if file.is_some() {
bail!("either prompt or file must be specified, but not both")
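
For reference, the two new flags combine as follows: an explicit `--devices` list takes precedence and enables multi-GPU, while `--main-gpu` alone pins the model to a single device. The sketch below is a minimal illustration of that mapping outside the example binary; it assumes the builder methods shown in this diff (`with_devices` returning a `Result` whose error converts into `anyhow::Error`, infallible `with_main_gpu`/`with_split_mode`), `anyhow` as the error crate, and a GPU-enabled build, matching the example's dependencies.

```rust
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};

/// Map the CLI-level selection onto model parameters. An explicit device
/// list wins over `main_gpu`, mirroring the logic added to examples/simple
/// above; the exact signatures are assumed from this diff.
fn device_params(
    devices: Option<&[usize]>,
    main_gpu: Option<i32>,
) -> anyhow::Result<LlamaModelParams> {
    // Offload all layers to the GPU, as the example does by default.
    let params = LlamaModelParams::default().with_n_gpu_layers(1000);
    Ok(match (devices, main_gpu) {
        // Explicit device list: multi-GPU across exactly these indices.
        (Some(devices), _) => params.with_devices(devices)?,
        // Main GPU only: disable layer splitting so everything stays on one device.
        (None, Some(main_gpu)) => params
            .with_main_gpu(main_gpu)
            .with_split_mode(LlamaSplitMode::None),
        // No selection: keep llama.cpp's defaults.
        (None, None) => params,
    })
}
```

As in the example above, `--devices` quietly overrides `--main-gpu` apart from the printed warning.
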
103 changes: 103 additions & 0 deletions llama-cpp-2/src/lib.rs
@@ -66,6 +72,12 @@ pub enum LLamaCppError {
#[error(transparent)]
EmbeddingError(#[from] EmbeddingsError),
// See [`LlamaSamplerError`]
/// Backend device not found
#[error("Backend device {0} not found")]
BackendDeviceNotFound(usize),
/// Max devices exceeded
#[error("Maximum number of devices exceeded (max is {0})")]
MaxDevicesExceeded(usize),
}

/// There was an error while getting the chat template from a model.
@@ -349,6 +355,103 @@ pub fn llama_supports_mlock() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}

/// Backend device type
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaBackendDeviceType {
/// CPU device
Cpu,
/// ACCEL device
Accelerator,
/// GPU device
Gpu,
/// iGPU device
IntegratedGpu,
/// Unknown device type
Unknown,
}

/// A ggml backend device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
#[derive(Debug, Clone)]
pub struct LlamaBackendDevice {
/// The index of the device
///
/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
pub index: usize,
/// The name of the device (e.g. "Vulkan0")
pub name: String,
/// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
pub description: String,
/// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
pub backend: String,
/// Total memory of the device in bytes
pub memory_total: usize,
/// Free memory of the device in bytes
pub memory_free: usize,
/// Device type
pub device_type: LlamaBackendDeviceType,
}

/// List ggml backend devices
#[must_use]
pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
let mut devices = Vec::new();
for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
unsafe {
let dev = llama_cpp_sys_2::ggml_backend_dev_get(i);
let mut props = std::mem::zeroed();
llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
let name = props.name;
let name = if name.is_null() {
String::new()
} else {
std::ffi::CStr::from_ptr(name).to_string_lossy().to_string()
};
let description = props.description;
let description = if description.is_null() {
String::new()
} else {
std::ffi::CStr::from_ptr(description)
.to_string_lossy()
.to_string()
};
let backend = llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev);
let backend_name = llama_cpp_sys_2::ggml_backend_reg_name(backend);
let backend = if backend_name.is_null() {
String::new()
} else {
std::ffi::CStr::from_ptr(backend_name)
.to_string_lossy()
.to_string()
};
let memory_total = props.memory_total;
let memory_free = props.memory_free;
let device_type = match props.type_ {
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => {
LlamaBackendDeviceType::Accelerator
}
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => {
LlamaBackendDeviceType::IntegratedGpu
}
_ => LlamaBackendDeviceType::Unknown,
};
devices.push(LlamaBackendDevice {
index: i,
name,
description,
backend,
memory_total,
memory_free,
device_type,
});
}
}
devices
}

/// Options to configure how llama.cpp logs are intercepted.
#[derive(Default, Debug, Clone)]
pub struct LogOptions {
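
The listing API added to `llama-cpp-2/src/lib.rs` pairs naturally with the new device selection. Below is a minimal sketch, under the same assumptions as above (the types and functions introduced in this diff, `anyhow` for errors), that enumerates the ggml backend devices after backend initialization, keeps only discrete GPUs, and passes their indices to `with_devices`; an out-of-range index would surface as `LLamaCppError::BackendDeviceNotFound`.

```rust
use llama_cpp_2::llama_backend::LlamaBackend;
use llama_cpp_2::model::params::LlamaModelParams;
use llama_cpp_2::{list_llama_ggml_backend_devices, LlamaBackendDeviceType};

fn main() -> anyhow::Result<()> {
    // Backends must be initialized before ggml can report its devices,
    // matching the ordering used in examples/simple above.
    let _backend = LlamaBackend::init()?;

    // Keep only discrete GPUs; `index` is the value `with_devices` expects.
    let gpu_indices: Vec<usize> = list_llama_ggml_backend_devices()
        .into_iter()
        .filter(|dev| dev.device_type == LlamaBackendDeviceType::Gpu)
        .map(|dev| dev.index)
        .collect();

    // Offload all layers, restricted to the selected GPUs.
    let _model_params = LlamaModelParams::default()
        .with_n_gpu_layers(1000)
        .with_devices(&gpu_indices)?;

    Ok(())
}
```
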