@@ -1277,11 +1277,11 @@ func (c *MPIJobController) getConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.Config
12771277// one if it doesn't exist.
12781278func (c * MPIJobController ) getOrCreateConfigMap (mpiJob * kubeflow.MPIJob ) (* corev1.ConfigMap , error ) {
12791279 klog .Infof ("create config called for %s" , getJobKey (mpiJob ))
1280- newCM := newConfigMap (mpiJob , c .workerReplicas (mpiJob ))
12811280 podList , err := c .getRunningWorkerPods (mpiJob )
12821281 if err != nil {
12831282 return nil , err
12841283 }
1284+ newCM := newConfigMap (mpiJob , c .workerReplicas (mpiJob ), podList )
12851285 updateDiscoverHostsInConfigMap (newCM , mpiJob , podList )
12861286
12871287 cm , err := c .configMapLister .ConfigMaps (mpiJob .Namespace ).Get (mpiJob .Name + configSuffix )
@@ -1935,7 +1935,7 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
19351935// newConfigMap creates a new ConfigMap containing configurations for an MPIJob
19361936// resource. It also sets the appropriate OwnerReferences on the resource so
19371937// handleObject can discover the MPIJob resource that 'owns' it.
1938- func newConfigMap (mpiJob * kubeflow.MPIJob , workerReplicas int32 ) * corev1.ConfigMap {
1938+ func newConfigMap (mpiJob * kubeflow.MPIJob , workerReplicas int32 , workerPods [] * corev1. Pod ) * corev1.ConfigMap {
19391939 var buffer bytes.Buffer
19401940 slots := ptr .Deref (mpiJob .Spec .SlotsPerWorker , 1 )
19411941 // note that pod.spec.dnsConfig also affects the svc resolution
@@ -1955,8 +1955,22 @@ func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigM
19551955 for i := 0 ; i < int (* mpiJob .Spec .MPIReplicaSpecs [kubeflow .MPIReplicaTypeWorker ].MaxReplicas ); i ++ {
19561956 name := workerName (mpiJob , i )
19571957
1958- //buffer.WriteString(fmt.Sprintf("host %s.%s ++cpus %d\n", name, mpiJob.Name, slots))
1959- buffer .WriteString (fmt .Sprintf ("%s.%s.%s.svc slots=%d\n " , name , mpiJob .Name , mpiJob .Namespace , slots ))
1958+ // Find the corresponding pod for this worker
1959+ var podIP string
1960+ for _ , pod := range workerPods {
1961+ if pod .Name == name && pod .Status .PodIP != "" {
1962+ podIP = pod .Status .PodIP
1963+ break
1964+ }
1965+ }
1966+
1967+ // Use IP address if available, otherwise fall back to DNS name
1968+ if podIP != "" {
1969+ buffer .WriteString (fmt .Sprintf ("%s slots=%d\n " , podIP , slots ))
1970+ } else {
1971+ // Fallback to DNS name if IP is not available
1972+ buffer .WriteString (fmt .Sprintf ("%s.%s.%s.svc slots=%d\n " , name , mpiJob .Name , mpiJob .Namespace , slots ))
1973+ }
19601974 /*switch mpiJob.Spec.MPIImplementation {
19611975 case kubeflow.MPIImplementationOpenMPI:
19621976 buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
0 commit comments