Skip to content

Commit a56b1b0

Browse files
feat: enhance logging and improve device allocation handling in NVIDIA plugin
1 parent 5b638dd commit a56b1b0

File tree

3 files changed

+30
-9
lines changed

3 files changed

+30
-9
lines changed

pkg/nvidia-plugin/pkg/plugin/register.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,10 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo {
141141
if plugin.schedulerConfig.DeviceMemoryScaling != 1 {
142142
registeredMem = int32(float64(registeredMem) * plugin.schedulerConfig.DeviceMemoryScaling)
143143
}
144-
klog.Infoln("MemoryScaling=", plugin.schedulerConfig.DeviceMemoryScaling, "registeredMem=", registeredMem)
144+
klog.V(3).InfoS("Applied memory scaling",
145+
"scalingFactor", plugin.schedulerConfig.DeviceMemoryScaling,
146+
"originalMemoryMB", memoryTotal/1024/1024,
147+
"registeredMemoryMB", registeredMem)
145148
health := true
146149
for _, val := range devs {
147150
if strings.Compare(val.ID, UUID) == 0 {
@@ -170,7 +173,12 @@ func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*util.DeviceInfo {
170173
Mode: plugin.operatingMode,
171174
Health: health,
172175
})
173-
klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredMem, Model, numa)
176+
klog.V(4).InfoS("Registered NVIDIA device",
177+
"uuid", UUID,
178+
"deviceIndex", idx,
179+
"memoryMB", registeredMem,
180+
"model", Model,
181+
"numaNode", numa)
174182
}
175183
return &res
176184
}

pkg/nvidia-plugin/pkg/plugin/server.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,8 +410,8 @@ func (plugin *NvidiaDevicePlugin) GetPreferredAllocation(ctx context.Context, r
410410

411411
// Allocate returns the list of devices.
412412
func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
413+
klog.InfoS("Allocate", "request", reqs)
413414
responses := pluginapi.AllocateResponse{}
414-
415415
nodeName := os.Getenv(util.NodeNameEnvName)
416416
current, err := util.GetPendingPod(ctx, nodeName)
417417
if err != nil {
@@ -423,7 +423,7 @@ func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.
423423
return nil, fmt.Errorf("invalid allocation request for %q: %w", plugin.rm.Resource(), err)
424424
}
425425
currentCtr, devreq, err := GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current)
426-
klog.Infoln("deviceAllocateFromAnnotation=", devreq)
426+
klog.Infof("allocate req:%v, current:%v, devreq:%v", req, current, devreq)
427427
if err != nil {
428428
device.PodAllocationFailed(nodeName, current, NodeLockNvidia)
429429
return &responses, err
@@ -659,7 +659,7 @@ func (plugin *NvidiaDevicePlugin) deviceIDsFromAnnotatedDeviceIDs(ids []string)
659659
}
660660

661661
func (plugin *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
662-
return plugin.rm.Devices().GetPluginDevices()
662+
return plugin.rm.Devices().GetPluginDevices(plugin.schedulerConfig.DeviceSplitCount)
663663
}
664664

665665
// updateResponseForDeviceListEnvVar sets the environment variable for the requested devices.

pkg/nvidia-plugin/pkg/rm/devices.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,24 @@ func (ds Devices) GetUUIDs() []string {
175175
}
176176

177177
// GetPluginDevices returns the plugin Devices from all devices in the Devices
178-
func (ds Devices) GetPluginDevices() []*pluginapi.Device {
178+
func (ds Devices) GetPluginDevices(count uint) []*pluginapi.Device {
179179
var res []*pluginapi.Device
180-
for _, device := range ds {
181-
d := device
182-
res = append(res, &d.Device)
180+
if !strings.Contains(ds.GetIDs()[0], "MIG") {
181+
for _, dev := range ds {
182+
for i := uint(0); i < count; i++ {
183+
id := fmt.Sprintf("%v-%v", dev.ID, i)
184+
res = append(res, &pluginapi.Device{
185+
ID: id,
186+
Health: dev.Health,
187+
Topology: nil,
188+
})
189+
}
190+
}
191+
} else {
192+
for _, device := range ds {
193+
d := device
194+
res = append(res, &d.Device)
195+
}
183196
}
184197
return res
185198
}

0 commit comments

Comments
 (0)