Skip to content

Commit

Permalink
Set master replicas in the replicas of a job
Browse files (browse the repository at this point in the history)
  • Loading branch information
workingloong committed Jan 29, 2023
1 parent 7ca4db2 commit 3a4c21e
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 13 deletions.
3 changes: 0 additions & 3 deletions dlrover/go/operator/api/v1alpha1/elasticjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ type ElasticJobSpec struct {

// Envs specifies environment variables for Pods of the job.
Envs map[string]*corev1.EnvVar `json:"envs,omitempty"`

// DlroverMaster specifies the specification of the DLRover master.
DlroverMaster *corev1.PodTemplateSpec `json:"dlroverMaster"`
}

// ReplicaSpec specifies the number and resources of replicas.
Expand Down
2 changes: 1 addition & 1 deletion dlrover/go/operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func main() {
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
flag.StringVar(&masterImage, "master-image", "registry.cn-hangzhou.aliyuncs.com/intell-ai/dlrover:latest ",
flag.StringVar(&masterImage, "master-image", "registry.cn-hangzhou.aliyuncs.com/intell-ai/dlrover:latest",
"The image to launch a dlrover master Pod of an ElasticJob.")
opts := zap.Options{
Development: true,
Expand Down
14 changes: 10 additions & 4 deletions dlrover/go/operator/pkg/controllers/master/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ func (m *Manager) newJobMaster(
job *elasticv1alpha1.ElasticJob, replicaIndex int,
) *corev1.Pod {
masterName := newJobMasterName(job.Name)
pod := common.NewPod(job, job.Spec.DlroverMaster, masterName)
pod := common.NewPod(
job, &job.Spec.ReplicaSpecs[ReplicaTypeTrainerMaster].Template, masterName,
)
pod.Labels[common.LabelReplicaTypeKey] = string(ReplicaTypeTrainerMaster)
pod.Labels[common.LabelReplicaIndexKey] = fmt.Sprintf("%d", replicaIndex)
return pod
Expand Down Expand Up @@ -238,14 +240,18 @@ func NewMasterTemplateToJob(job *elasticv1alpha1.ElasticJob, masterImage string)
RestartPolicy: corev1.RestartPolicyNever,
},
}
if job.Spec.DlroverMaster != nil {
mainContainer := job.Spec.DlroverMaster.Spec.Containers[0]
if _, ok := job.Spec.ReplicaSpecs[ReplicaTypeTrainerMaster]; ok {
mainContainer := job.Spec.ReplicaSpecs[ReplicaTypeTrainerMaster].ReplicaSpec.Template.Spec.Containers[0]
if mainContainer.Image != "" {
podTemplate.Spec.Containers[0].Image = mainContainer.Image
}
if mainContainer.ImagePullPolicy != "" {
podTemplate.Spec.Containers[0].ImagePullPolicy = mainContainer.ImagePullPolicy
}
}
job.Spec.DlroverMaster = podTemplate
job.Spec.ReplicaSpecs[ReplicaTypeTrainerMaster] = &elasticv1alpha1.ReplicaSpec{
ReplicaSpec: commonv1.ReplicaSpec{
Template: *podTemplate,
},
}
}
16 changes: 12 additions & 4 deletions dlrover/go/operator/pkg/controllers/master/master_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package master

import (
elasticv1alpha1 "github.com/intelligent-machine-learning/easydl/dlrover/go/operator/api/v1alpha1"
commonv1 "github.com/intelligent-machine-learning/easydl/dlrover/go/operator/pkg/common/api/v1"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -31,6 +32,7 @@ func TestCreateMasterPod(t *testing.T) {
Labels: map[string]string{},
},
}
job.Spec.ReplicaSpecs = make(map[commonv1.ReplicaType]*elasticv1alpha1.ReplicaSpec)
NewMasterTemplateToJob(job, "dlrover-master:test")
manager := &Manager{}
pod := manager.newJobMaster(job, initMasterIndex)
Expand All @@ -56,12 +58,18 @@ func TestCreateMasterPodWithImage(t *testing.T) {
Image: "dlrover-master:test-v0",
ImagePullPolicy: "Always",
}
job.Spec.DlroverMaster = &corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{container},
RestartPolicy: corev1.RestartPolicyNever,
job.Spec.ReplicaSpecs = make(map[commonv1.ReplicaType]*elasticv1alpha1.ReplicaSpec)
job.Spec.ReplicaSpecs[ReplicaTypeTrainerMaster] = &elasticv1alpha1.ReplicaSpec{
ReplicaSpec: commonv1.ReplicaSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{container},
RestartPolicy: corev1.RestartPolicyNever,
},
},
},
}

NewMasterTemplateToJob(job, "dlrover-master:test")
manager := &Manager{}
pod := manager.newJobMaster(job, initMasterIndex)
Expand Down
2 changes: 1 addition & 1 deletion scripts/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ make -f dlrover/Makefile
# Create dlrover package
echo "Building the wheel for dlrover."
rm -rf ./build/lib
python setup.py --quiet bdist_wheel --dist-dir ./build
python setup.py --quiet bdist_wheel

0 comments on commit 3a4c21e

Please sign in to comment.