-
Notifications
You must be signed in to change notification settings - Fork 139
/
elastic_job.yaml
34 lines (34 loc) · 1.08 KB
/
elastic_job.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
---
# ElasticJob "torch-mnist" in namespace "dlrover": four worker replicas,
# each starting the MNIST CNN training script through dlrover-run.
apiVersion: elastic.iml.github.io/v1alpha1
kind: ElasticJob
metadata:
  name: torch-mnist
  namespace: dlrover
spec:
  distributionStrategy: AllreduceStrategy
  optimizeMode: single-job
  replicaSpecs:
    worker:
      replicas: 4
      # Pod template used for every worker replica.
      template:
        spec:
          restartPolicy: Always
          containers:
            - name: main
              # yamllint disable-line rule:line-length
              image: registry.cn-hangzhou.aliyuncs.com/intell-ai/dlrover:torch201-mnist
              imagePullPolicy: Always
              command:
                - /bin/bash
                - -c
                # Folded scalar: the lines below are joined with single
                # spaces into one command string handed to `bash -c`.
                # $WORKER_NUM is expanded by bash inside the container.
                # NOTE(review): presumably injected by the operator; confirm.
                - >-
                  dlrover-run --network-check --nnodes=$WORKER_NUM
                  --nproc_per_node=2 --max_restarts=3
                  examples/pytorch/mnist/cnn_train.py --num_epochs 2
                  --training_data /data/mnist_png/training/
                  --validation_data /data/mnist_png/testing/
              # Requests are set equal to limits (1 CPU, 2Gi memory).
              resources:
                limits:
                  cpu: "1"
                  memory: 2Gi
                requests:
                  cpu: "1"
                  memory: 2Gi