Skip to content

Commit e5e7a2e

Browse files
authored
add fallback mechanism in case allocator init fails (#125)
We currently use the best-effort allocation policy as the default allocation strategy. If the allocator initialization fails for any reason, we need to fall back to the default kubelet/Kubernetes allocation behaviour.
1 parent 92134bf commit e5e7a2e

File tree

2 files changed

+59
-51
lines changed

2 files changed

+59
-51
lines changed

internal/pkg/allocator/besteffort_policy.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ func (b *BestEffortPolicy) getDevicesFromIds(ids []string) []*Device {
6969
// Init initializes pair wise weights of all devices and stores in-memory
7070
func (b *BestEffortPolicy) Init(devs []*Device, topoDir string) error {
7171
err := fetchAllPairWeights(devs, b.p2pWeights, topoDir)
72+
if len(b.p2pWeights) == 0 {
73+
return fmt.Errorf("Besteffort Policy init failed to initialize p2pWeights")
74+
}
7275
if err == nil {
7376
b.devices = devs
7477
for idx := range devs {

internal/pkg/plugin/plugin.go

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@ import (
3939

4040
// Plugin is identical to DevicePluginServer interface of device plugin API.
4141
type AMDGPUPlugin struct {
42-
AMDGPUs map[string]map[string]interface{}
43-
Heartbeat chan bool
44-
signal chan os.Signal
45-
Resource string
46-
devAllocator allocator.Policy
42+
AMDGPUs map[string]map[string]interface{}
43+
Heartbeat chan bool
44+
signal chan os.Signal
45+
Resource string
46+
devAllocator allocator.Policy
47+
allocatorInitError bool
4748
}
4849

4950
type AMDGPUPluginOption func(*AMDGPUPlugin)
@@ -83,7 +84,8 @@ func (p *AMDGPUPlugin) Start() error {
8384
signal.Notify(p.signal, syscall.SIGINT, syscall.SIGQUIT, syscall.SIGTERM)
8485
err := p.devAllocator.Init(getDevices(), "")
8586
if err != nil {
86-
glog.Fatalf("allocator init failed with error %v. Exiting...", err)
87+
glog.Errorf("allocator init failed. Falling back to kubelet default allocation. Error %v", err)
88+
p.allocatorInitError = true
8789
}
8890
return nil
8991
}
@@ -157,55 +159,58 @@ func countGPUDevFromTopology(topoRootParam ...string) int {
157159
}
158160

159161
func simpleHealthCheck() bool {
160-
entries, err := filepath.Glob("/sys/class/kfd/kfd/topology/nodes/*/properties")
161-
if err != nil {
162-
glog.Errorf("Error finding properties files: %v", err)
163-
return false
164-
}
165-
166-
for _, propFile := range entries {
167-
f, err := os.Open(propFile)
168-
if err != nil {
169-
glog.Errorf("Error opening %s: %v", propFile, err)
170-
continue
171-
}
172-
defer f.Close()
173-
174-
var cpuCores, gfxVersion int
175-
scanner := bufio.NewScanner(f)
176-
for scanner.Scan() {
177-
line := scanner.Text()
178-
if strings.HasPrefix(line, "cpu_cores_count") {
179-
parts := strings.Fields(line)
180-
if len(parts) == 2 {
181-
cpuCores, _ = strconv.Atoi(parts[1])
182-
}
183-
} else if strings.HasPrefix(line, "gfx_target_version") {
184-
parts := strings.Fields(line)
185-
if len(parts) == 2 {
186-
gfxVersion, _ = strconv.Atoi(parts[1])
187-
}
188-
}
189-
}
190-
191-
if err := scanner.Err(); err != nil {
192-
glog.Warningf("Error scanning %s: %v", propFile, err)
193-
continue
194-
}
195-
196-
if cpuCores == 0 && gfxVersion > 0 {
197-
// Found a GPU
198-
return true
199-
}
200-
}
201-
202-
glog.Warning("No GPU nodes found via properties")
203-
return false
162+
entries, err := filepath.Glob("/sys/class/kfd/kfd/topology/nodes/*/properties")
163+
if err != nil {
164+
glog.Errorf("Error finding properties files: %v", err)
165+
return false
166+
}
167+
168+
for _, propFile := range entries {
169+
f, err := os.Open(propFile)
170+
if err != nil {
171+
glog.Errorf("Error opening %s: %v", propFile, err)
172+
continue
173+
}
174+
defer f.Close()
175+
176+
var cpuCores, gfxVersion int
177+
scanner := bufio.NewScanner(f)
178+
for scanner.Scan() {
179+
line := scanner.Text()
180+
if strings.HasPrefix(line, "cpu_cores_count") {
181+
parts := strings.Fields(line)
182+
if len(parts) == 2 {
183+
cpuCores, _ = strconv.Atoi(parts[1])
184+
}
185+
} else if strings.HasPrefix(line, "gfx_target_version") {
186+
parts := strings.Fields(line)
187+
if len(parts) == 2 {
188+
gfxVersion, _ = strconv.Atoi(parts[1])
189+
}
190+
}
191+
}
192+
193+
if err := scanner.Err(); err != nil {
194+
glog.Warningf("Error scanning %s: %v", propFile, err)
195+
continue
196+
}
197+
198+
if cpuCores == 0 && gfxVersion > 0 {
199+
// Found a GPU
200+
return true
201+
}
202+
}
203+
204+
glog.Warning("No GPU nodes found via properties")
205+
return false
204206
}
205207

206208
// GetDevicePluginOptions returns options to be communicated with Device
207209
// Manager
208210
func (p *AMDGPUPlugin) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
211+
if p.allocatorInitError {
212+
return &pluginapi.DevicePluginOptions{}, nil
213+
}
209214
return &pluginapi.DevicePluginOptions{
210215
GetPreferredAllocationAvailable: true,
211216
}, nil
@@ -430,4 +435,4 @@ func (l *AMDGPULister) NewPlugin(resourceLastName string) dpm.PluginInterface {
430435
WithAllocator(allocator.NewBestEffortPolicy()),
431436
}
432437
return NewAMDGPUPlugin(options...)
433-
}
438+
}

0 commit comments

Comments
 (0)