Skip to content

Commit 2ccf412

Browse files
authored
Support AWS-neuron device and device-core allocation (#1238)
Implement aws-neuron device sharing and topology-awareness.

Signed-off-by: limengxuan <mengxuan.li@dynamia.ai>
1 parent 3df4666 commit 2ccf412

File tree

12 files changed

+1501
-3
lines changed

12 files changed

+1501
-3
lines changed

charts/hami/templates/scheduler/configmap.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ data:
5757
},
5858
{{- end }}
5959
{{- end }}
60+
{{- range .Values.devices.awsneuron.customresources }}
61+
{
62+
"name": "{{ . }}",
63+
"ignoredByScheduler": true
64+
},
65+
{{- end }}
6066
{
6167
"name": "{{ .Values.resourceName }}",
6268
"ignoredByScheduler": true

charts/hami/templates/scheduler/configmapnew.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,4 +82,8 @@ data:
8282
ignoredByScheduler: true
8383
{{- end }}
8484
{{- end }}
85+
{{- range .Values.devices.awsneuron.customresources }}
86+
- name: {{ . }}
87+
ignoredByScheduler: true
88+
{{- end }}
8589
{{- end }}

charts/hami/templates/scheduler/device-configmap.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ data:
111111
resourceCoreName: {{ .Values.iluvatarResourceCore }}
112112
kunlun:
113113
resourceCountName: {{ .Values.kunlunResourceName }}
114+
awsneuron:
115+
resourceCountName: "aws.amazon.com/neuron"
116+
resourceCoreName: "aws.amazon.com/neuroncore"
114117
vnpus:
115118
- chipName: 910B
116119
commonWord: Ascend910A

charts/hami/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,10 @@ devicePlugin:
335335
# memory: 100Mi
336336

337337
devices:
338+
awsneuron:
339+
customresources:
340+
- aws.amazon.com/neuron
341+
- aws.amazon.com/neuroncore
338342
kunlun:
339343
enabled: true
340344
customresources:

cmd/scheduler/metrics.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
221221
ctrvGPUdeviceAllocatedMemoryDesc,
222222
prometheus.GaugeValue,
223223
float64(ctrdevval.Usedmem)*float64(1024)*float64(1024),
224-
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID, fmt.Sprint(ctrdevval.Usedcores))
224+
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
225225
ch <- prometheus.MustNewConstMetric(
226226
ctrvGPUdeviceAllocatedCoreDesc,
227227
prometheus.GaugeValue,
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: npod
5+
spec:
6+
restartPolicy: Never
7+
containers:
8+
- name: npod
9+
command: ["sleep","infinity"]
10+
image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04
11+
resources:
12+
limits:
13+
cpu: "4"
14+
memory: 4Gi
15+
aws.amazon.com/neuroncore: 1
16+
requests:
17+
cpu: "1"
18+
memory: 1Gi
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: nuropod
5+
spec:
6+
restartPolicy: Never
7+
containers:
8+
- name: nuropod
9+
command: ["sleep","infinity"]
10+
image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04
11+
resources:
12+
limits:
13+
cpu: "4"
14+
memory: 4Gi
15+
aws.amazon.com/neuron: 1
16+
requests:
17+
cpu: "1"
18+
memory: 1Gi
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: npod
5+
spec:
6+
restartPolicy: Never
7+
containers:
8+
- name: npod1
9+
command: ["sleep","infinity"]
10+
image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04
11+
resources:
12+
limits:
13+
cpu: "4"
14+
memory: 4Gi
15+
aws.amazon.com/neuroncore: 1
16+
requests:
17+
cpu: "1"
18+
- name: npod
19+
command: ["sleep","infinity"]
20+
image: public.ecr.aws/neuron/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.20.2-ubuntu20.04
21+
resources:
22+
limits:
23+
cpu: "4"
24+
memory: 4Gi
25+
aws.amazon.com/neuroncore: 1
26+
requests:
27+
cpu: "1"
28+
memory: 1Gi

0 commit comments

Comments
 (0)