---
# ElasticJob "deepctr-auto-scale" (namespace "dlrover").
#
# Declares a ParameterServerStrategy training job with three replica groups
# (ps, worker, evaluator) that all run the same image and the same training
# entry-point command, and all mount the "pvc-nas" PersistentVolumeClaim at
# /nas. A fourth group, "dlrover-master", only overrides pod-level settings.
apiVersion: elastic.iml.github.io/v1alpha1
kind: ElasticJob
metadata:
  name: deepctr-auto-scale
  namespace: dlrover
spec:
  distributionStrategy: ParameterServerStrategy
  # Job-wide resource ceiling. Values are quoted so they stay strings.
  resourceLimits:
    cpu: "15"
    memory: "30000Mi"
  replicaSpecs:
    # NOTE(review): autoScale presumably lets the controller choose the
    # replica count for a group -- confirm against the ElasticJob CRD.
    ps:
      autoScale: true
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: main
              # yamllint disable-line rule:line-length
              image: registry.cn-hangzhou.aliyuncs.com/intell-ai/dlrover:deeprec_criteo_v1
              imagePullPolicy: Always
              command:
                - /bin/bash
                - -c
                # Folded scalar: the lines below are joined with single
                # spaces into one shell command line, no trailing newline.
                - >-
                  cd ./examples/tensorflow/criteo_deeprec
                  && python -m dlrover.trainer.entry.local_entry
                  --platform=Kubernetes --conf=train_conf.TrainConf
                  --enable_auto_scaling=True
              volumeMounts:
                - name: pvc-nas
                  mountPath: /nas
          volumes:
            - name: pvc-nas
              persistentVolumeClaim:
                claimName: pvc-nas
    worker:
      autoScale: true
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: main
              # yamllint disable-line rule:line-length
              image: registry.cn-hangzhou.aliyuncs.com/intell-ai/dlrover:deeprec_criteo_v1
              imagePullPolicy: Always
              command:
                - /bin/bash
                - -c
                # Folded scalar: the lines below are joined with single
                # spaces into one shell command line, no trailing newline.
                - >-
                  cd ./examples/tensorflow/criteo_deeprec
                  && python -m dlrover.trainer.entry.local_entry
                  --platform=Kubernetes --conf=train_conf.TrainConf
                  --enable_auto_scaling=True
              volumeMounts:
                - name: pvc-nas
                  mountPath: /nas
          volumes:
            - name: pvc-nas
              persistentVolumeClaim:
                claimName: pvc-nas
    evaluator:
      autoScale: true
      # The only group in this manifest with an explicit replica count.
      replicas: 1
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: main
              # yamllint disable-line rule:line-length
              image: registry.cn-hangzhou.aliyuncs.com/intell-ai/dlrover:deeprec_criteo_v1
              imagePullPolicy: Always
              command:
                - /bin/bash
                - -c
                # Folded scalar: the lines below are joined with single
                # spaces into one shell command line, no trailing newline.
                - >-
                  cd ./examples/tensorflow/criteo_deeprec
                  && python -m dlrover.trainer.entry.local_entry
                  --platform=Kubernetes --conf=train_conf.TrainConf
                  --enable_auto_scaling=True
              volumeMounts:
                - name: pvc-nas
                  mountPath: /nas
          volumes:
            - name: pvc-nas
              persistentVolumeClaim:
                claimName: pvc-nas
    # NOTE(review): no `image` is visible for this container -- either the
    # file is truncated here or the image is supplied elsewhere; confirm.
    dlrover-master:
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: main
              imagePullPolicy: Always