@@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
     opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
                                    proc);

+    /* by default we consider the process to be remote */
+    proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
     *procp = proc;

     return OMPI_SUCCESS;
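Note on the default set above: after this change every freshly allocated proc starts out flagged as remote, and only a later successful locality lookup upgrades it. Below is a minimal standalone sketch of that allocate-then-test pattern; the sketch_* names and the flag values are illustrative assumptions, not the real OPAL definitions.

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative stand-ins for the OPAL locality flags; values are assumed */
    #define SKETCH_PROC_NON_LOCAL 0x0000   /* no locality bits set */
    #define SKETCH_PROC_ON_NODE   0x0200   /* assumed "same node" bit */

    typedef struct { uint16_t proc_flags; } sketch_proc_t;

    static void sketch_allocate (sketch_proc_t *p)
    {
        /* remote by default; a later locality lookup may overwrite this */
        p->proc_flags = SKETCH_PROC_NON_LOCAL;
    }

    int main (void)
    {
        sketch_proc_t peer;
        sketch_allocate (&peer);
        printf ("peer on local node: %s\n",
                (peer.proc_flags & SKETCH_PROC_ON_NODE) ? "yes" : "no");
        return 0;
    }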
@@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
  */
 int ompi_proc_complete_init_single (ompi_proc_t *proc)
 {
-    uint16_t u16, *u16ptr;
     int ret;

-    u16ptr = &u16;
-
     if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
         (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
         /* nothing else to do */
         return OMPI_SUCCESS;
     }

-    /* get the locality information - all RTEs are required
-     * to provide this information at startup */
-    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
-    if (OPAL_SUCCESS != ret) {
-        proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
-    } else {
-        proc->super.proc_flags = u16;
-    }
-
     /* we can retrieve the hostname at no cost because it
      * was provided at startup - but make it optional so
      * we don't chase after it if some system doesn't
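The block removed here followed a lookup-with-fallback idiom: try an optional modex query for locality and keep the remote default when nothing is returned. After this change that idiom lives in ompi_proc_complete_init (see the last hunk). A small sketch of the idiom, assuming a hypothetical sketch_lookup_locality() in place of OPAL_MODEX_RECV_VALUE_OPTIONAL:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SKETCH_PROC_NON_LOCAL 0x0000   /* assumed "remote" default */

    /* hypothetical stand-in for the optional modex query; returns false
     * when the runtime has no locality entry for the given rank */
    static bool sketch_lookup_locality (uint32_t vpid, uint16_t *out)
    {
        (void) vpid;
        (void) out;
        return false;
    }

    static uint16_t sketch_resolve_flags (uint32_t vpid)
    {
        uint16_t u16;
        uint16_t flags = SKETCH_PROC_NON_LOCAL;   /* allocation-time default */
        if (sketch_lookup_locality (vpid, &u16)) {
            flags = u16;                          /* overwrite only on success */
        }
        return flags;
    }

    int main (void)
    {
        printf ("flags for rank 3: 0x%04x\n", sketch_resolve_flags (3));
        return 0;
    }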
@@ -287,20 +277,6 @@ int ompi_proc_init(void)
     }
 #endif

-    if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
-        /* create proc structures and find self */
-        for (ompi_vpid_t i = 0; i < ompi_process_info.num_procs; ++i) {
-            if (i == OMPI_PROC_MY_NAME->vpid) {
-                continue;
-            }
-
-            ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
-            if (OMPI_SUCCESS != ret) {
-                return ret;
-            }
-        }
-    }
-
     return OMPI_SUCCESS;
 }

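For context, the loop removed here implemented the eager side of the add-procs cutoff: below the cutoff every rank got a proc structure during ompi_proc_init, otherwise creation was deferred. A rough sketch of that decision, with hypothetical parameter names standing in for ompi_process_info.num_procs, ompi_add_procs_cutoff, OMPI_PROC_MY_NAME->vpid and ompi_proc_allocate:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t sketch_vpid_t;

    /* Sketch of the eager-vs-lazy decision this hunk relocates: below the
     * cutoff every rank gets a proc structure up front, otherwise structures
     * are created lazily on first lookup. */
    static int sketch_maybe_create_all (sketch_vpid_t num_procs, sketch_vpid_t cutoff,
                                        sketch_vpid_t my_vpid,
                                        int (*create)(sketch_vpid_t vpid))
    {
        if (num_procs >= cutoff) {
            return 0;                    /* large job: defer to lazy creation */
        }
        for (sketch_vpid_t i = 0; i < num_procs; ++i) {
            if (i == my_vpid) {
                continue;                /* the local proc already exists */
            }
            int rc = create (i);
            if (0 != rc) {
                return rc;
            }
        }
        return 0;
    }

    static int sketch_create (sketch_vpid_t vpid)
    {
        printf ("creating proc for rank %u\n", (unsigned) vpid);
        return 0;
    }

    int main (void)
    {
        /* 4 ranks, cutoff 1024, we are rank 1: ranks 0, 2 and 3 get created */
        return sketch_maybe_create_all (4, 1024, 1, sketch_create);
    }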
@@ -329,47 +305,77 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
  */
 int ompi_proc_complete_init (void)
 {
+    opal_process_name_t wildcard_rank;
     ompi_proc_t *proc;
     int ret, errcode = OMPI_SUCCESS;
+    char *val;

     opal_mutex_lock (&ompi_proc_lock);

+    /* Add all local peers first */
+    wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
+    wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
+    /* retrieve the local peers */
+    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
+                          &wildcard_rank, &val, OPAL_STRING);
+    if (OPAL_SUCCESS == ret && NULL != val) {
+        char **peers = opal_argv_split (val, ',');
+        int i;
+        free (val);
+        for (i = 0; NULL != peers[i]; i++) {
+            ompi_vpid_t local_rank = strtoul (peers[i], NULL, 10);
+            uint16_t u16, *u16ptr = &u16;
+            if (OMPI_PROC_MY_NAME->vpid == local_rank) {
+                continue;
+            }
+            ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc);
+            if (OMPI_SUCCESS != ret) {
+                return ret;
+            }
+            /* get the locality information - all RTEs are required
+             * to provide this information at startup */
+            OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
+            if (OPAL_SUCCESS == ret) {
+                proc->super.proc_flags = u16;
+            }
+        }
+        opal_argv_free (peers);
+    }
+
+    /* Complete initialization of node-local procs */
     OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
         ret = ompi_proc_complete_init_single (proc);
         if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
             errcode = ret;
             break;
         }
     }
-    opal_mutex_unlock (&ompi_proc_lock);

-    if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
-        char *val = NULL;
-        opal_process_name_t wildcard_rank;
-        wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
-        wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
-        /* retrieve the local peers */
-        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
-                              &wildcard_rank, &val, OPAL_STRING);
-        if (OPAL_SUCCESS == ret && NULL != val) {
-            char **peers = opal_argv_split (val, ',');
-            int i;
-            free (val);
-            for (i = 0; NULL != peers[i]; i++) {
-                ompi_vpid_t local_rank = strtoul (peers[i], NULL, 10);
-                opal_process_name_t proc_name = {.vpid = local_rank, .jobid = OMPI_PROC_MY_NAME->jobid};
-
-                if (OMPI_PROC_MY_NAME->vpid == local_rank) {
-                    continue;
-                }
-                (void) ompi_proc_for_name (proc_name);
-            }
-            opal_argv_free (peers);
+    /* if the cutoff is larger than the number of procs - add all processes.
+     * NOTE that local procs will be automatically skipped as they
+     * are already in the hash table
+     */
+    if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
+        /* since ompi_proc_for_name locks internally -
+         * we need to release the lock here
+         */
+        opal_mutex_unlock (&ompi_proc_lock);
+
+        for (ompi_vpid_t i = 0; i < ompi_process_info.num_procs; ++i) {
+            opal_process_name_t proc_name;
+            proc_name.jobid = OMPI_PROC_MY_NAME->jobid;
+            proc_name.vpid = i;
+            (void) ompi_proc_for_name (proc_name);
         }
+
+        /* re-acquire the lock for the next step - sort */
+        opal_mutex_lock (&ompi_proc_lock);
     }

     opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);

+    opal_mutex_unlock (&ompi_proc_lock);
+
     return errcode;
 }

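The new code above receives OPAL_PMIX_LOCAL_PEERS as a comma-separated list of vpids and walks it, skipping its own rank. A self-contained sketch of just that parsing step, using strtok_r/strtoul from the C library instead of opal_argv_split so it builds outside the Open MPI tree; sketch_walk_local_peers is a hypothetical helper, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Parse a comma-separated rank list such as an OPAL_PMIX_LOCAL_PEERS
     * value ("0,2,5"), skipping our own rank. */
    static void sketch_walk_local_peers (const char *peers_csv, uint32_t my_vpid)
    {
        char *copy = strdup (peers_csv);
        char *save = NULL;
        for (char *tok = strtok_r (copy, ",", &save); NULL != tok;
             tok = strtok_r (NULL, ",", &save)) {
            uint32_t vpid = (uint32_t) strtoul (tok, NULL, 10);
            if (vpid == my_vpid) {
                continue;                 /* do not re-create ourselves */
            }
            printf ("would allocate proc for local rank %u\n", vpid);
        }
        free (copy);
    }

    int main (void)
    {
        /* with peers "0,2,5" and my_vpid 2, ranks 0 and 5 are reported */
        sketch_walk_local_peers ("0,2,5", 2);
        return 0;
    }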