@@ -39,11 +39,12 @@ import (
3939
4040// Plugin is identical to DevicePluginServer interface of device plugin API.
4141type AMDGPUPlugin struct {
42- AMDGPUs map [string ]map [string ]interface {}
43- Heartbeat chan bool
44- signal chan os.Signal
45- Resource string
46- devAllocator allocator.Policy
42+ AMDGPUs map [string ]map [string ]interface {}
43+ Heartbeat chan bool
44+ signal chan os.Signal
45+ Resource string
46+ devAllocator allocator.Policy
47+ allocatorInitError bool
4748}
4849
4950type AMDGPUPluginOption func (* AMDGPUPlugin )
@@ -83,7 +84,8 @@ func (p *AMDGPUPlugin) Start() error {
8384 signal .Notify (p .signal , syscall .SIGINT , syscall .SIGQUIT , syscall .SIGTERM )
8485 err := p .devAllocator .Init (getDevices (), "" )
8586 if err != nil {
86- glog .Fatalf ("allocator init failed with error %v. Exiting..." , err )
87+ glog .Errorf ("allocator init failed. Falling back to kubelet default allocation. Error %v" , err )
88+ p .allocatorInitError = true
8789 }
8890 return nil
8991}
@@ -157,55 +159,58 @@ func countGPUDevFromTopology(topoRootParam ...string) int {
157159}
158160
159161func simpleHealthCheck () bool {
160- entries , err := filepath .Glob ("/sys/class/kfd/kfd/topology/nodes/*/properties" )
161- if err != nil {
162- glog .Errorf ("Error finding properties files: %v" , err )
163- return false
164- }
165-
166- for _ , propFile := range entries {
167- f , err := os .Open (propFile )
168- if err != nil {
169- glog .Errorf ("Error opening %s: %v" , propFile , err )
170- continue
171- }
172- defer f .Close ()
173-
174- var cpuCores , gfxVersion int
175- scanner := bufio .NewScanner (f )
176- for scanner .Scan () {
177- line := scanner .Text ()
178- if strings .HasPrefix (line , "cpu_cores_count" ) {
179- parts := strings .Fields (line )
180- if len (parts ) == 2 {
181- cpuCores , _ = strconv .Atoi (parts [1 ])
182- }
183- } else if strings .HasPrefix (line , "gfx_target_version" ) {
184- parts := strings .Fields (line )
185- if len (parts ) == 2 {
186- gfxVersion , _ = strconv .Atoi (parts [1 ])
187- }
188- }
189- }
190-
191- if err := scanner .Err (); err != nil {
192- glog .Warningf ("Error scanning %s: %v" , propFile , err )
193- continue
194- }
195-
196- if cpuCores == 0 && gfxVersion > 0 {
197- // Found a GPU
198- return true
199- }
200- }
201-
202- glog .Warning ("No GPU nodes found via properties" )
203- return false
162+ entries , err := filepath .Glob ("/sys/class/kfd/kfd/topology/nodes/*/properties" )
163+ if err != nil {
164+ glog .Errorf ("Error finding properties files: %v" , err )
165+ return false
166+ }
167+
168+ for _ , propFile := range entries {
169+ f , err := os .Open (propFile )
170+ if err != nil {
171+ glog .Errorf ("Error opening %s: %v" , propFile , err )
172+ continue
173+ }
174+ defer f .Close ()
175+
176+ var cpuCores , gfxVersion int
177+ scanner := bufio .NewScanner (f )
178+ for scanner .Scan () {
179+ line := scanner .Text ()
180+ if strings .HasPrefix (line , "cpu_cores_count" ) {
181+ parts := strings .Fields (line )
182+ if len (parts ) == 2 {
183+ cpuCores , _ = strconv .Atoi (parts [1 ])
184+ }
185+ } else if strings .HasPrefix (line , "gfx_target_version" ) {
186+ parts := strings .Fields (line )
187+ if len (parts ) == 2 {
188+ gfxVersion , _ = strconv .Atoi (parts [1 ])
189+ }
190+ }
191+ }
192+
193+ if err := scanner .Err (); err != nil {
194+ glog .Warningf ("Error scanning %s: %v" , propFile , err )
195+ continue
196+ }
197+
198+ if cpuCores == 0 && gfxVersion > 0 {
199+ // Found a GPU
200+ return true
201+ }
202+ }
203+
204+ glog .Warning ("No GPU nodes found via properties" )
205+ return false
204206}
205207
206208// GetDevicePluginOptions returns options to be communicated with Device
207209// Manager
208210func (p * AMDGPUPlugin ) GetDevicePluginOptions (ctx context.Context , e * pluginapi.Empty ) (* pluginapi.DevicePluginOptions , error ) {
211+ if p .allocatorInitError {
212+ return & pluginapi.DevicePluginOptions {}, nil
213+ }
209214 return & pluginapi.DevicePluginOptions {
210215 GetPreferredAllocationAvailable : true ,
211216 }, nil
@@ -430,4 +435,4 @@ func (l *AMDGPULister) NewPlugin(resourceLastName string) dpm.PluginInterface {
430435 WithAllocator (allocator .NewBestEffortPolicy ()),
431436 }
432437 return NewAMDGPUPlugin (options ... )
433- }
438+ }
0 commit comments