Skip to content

Commit 45b46dc

Browse files
author
Ralph Castain
authored
Merge pull request #3181 from artpol84/add_proc_fix_2/master
ompi: Avoid unnecessary PMIx lookups when adding procs (step 2).
2 parents 5219054 + 1f7a3a2 commit 45b46dc

File tree

1 file changed

+55
-49
lines changed

1 file changed

+55
-49
lines changed

ompi/proc/proc.c

Lines changed: 55 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
116116
opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
117117
proc);
118118

119+
/* by default we consider process to be remote */
120+
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
119121
*procp = proc;
120122

121123
return OMPI_SUCCESS;
@@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
133135
*/
134136
int ompi_proc_complete_init_single (ompi_proc_t *proc)
135137
{
136-
uint16_t u16, *u16ptr;
137138
int ret;
138139

139-
u16ptr = &u16;
140-
141140
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
142141
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
143142
/* nothing else to do */
144143
return OMPI_SUCCESS;
145144
}
146145

147-
/* get the locality information - all RTEs are required
148-
* to provide this information at startup */
149-
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
150-
if (OPAL_SUCCESS != ret) {
151-
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
152-
} else {
153-
proc->super.proc_flags = u16;
154-
}
155-
156146
/* we can retrieve the hostname at no cost because it
157147
* was provided at startup - but make it optional so
158148
* we don't chase after it if some system doesn't
@@ -287,20 +277,6 @@ int ompi_proc_init(void)
287277
}
288278
#endif
289279

290-
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
291-
/* create proc structures and find self */
292-
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
293-
if (i == OMPI_PROC_MY_NAME->vpid) {
294-
continue;
295-
}
296-
297-
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
298-
if (OMPI_SUCCESS != ret) {
299-
return ret;
300-
}
301-
}
302-
}
303-
304280
return OMPI_SUCCESS;
305281
}
306282

@@ -329,47 +305,77 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
329305
*/
330306
int ompi_proc_complete_init(void)
331307
{
308+
opal_process_name_t wildcard_rank;
332309
ompi_proc_t *proc;
333310
int ret, errcode = OMPI_SUCCESS;
311+
char *val;
334312

335313
opal_mutex_lock (&ompi_proc_lock);
336314

315+
/* Add all local peers first */
316+
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
317+
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
318+
/* retrieve the local peers */
319+
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
320+
&wildcard_rank, &val, OPAL_STRING);
321+
if (OPAL_SUCCESS == ret && NULL != val) {
322+
char **peers = opal_argv_split(val, ',');
323+
int i;
324+
free(val);
325+
for (i=0; NULL != peers[i]; i++) {
326+
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
327+
uint16_t u16, *u16ptr = &u16;
328+
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
329+
continue;
330+
}
331+
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc);
332+
if (OMPI_SUCCESS != ret) {
333+
return ret;
334+
}
335+
/* get the locality information - all RTEs are required
336+
* to provide this information at startup */
337+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
338+
if (OPAL_SUCCESS == ret) {
339+
proc->super.proc_flags = u16;
340+
}
341+
}
342+
opal_argv_free(peers);
343+
}
344+
345+
/* Complete initialization of node-local procs */
337346
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
338347
ret = ompi_proc_complete_init_single (proc);
339348
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
340349
errcode = ret;
341350
break;
342351
}
343352
}
344-
opal_mutex_unlock (&ompi_proc_lock);
345353

346-
if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
347-
char *val = NULL;
348-
opal_process_name_t wildcard_rank;
349-
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
350-
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
351-
/* retrieve the local peers */
352-
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
353-
&wildcard_rank, &val, OPAL_STRING);
354-
if (OPAL_SUCCESS == ret && NULL != val) {
355-
char **peers = opal_argv_split(val, ',');
356-
int i;
357-
free(val);
358-
for (i=0; NULL != peers[i]; i++) {
359-
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
360-
opal_process_name_t proc_name = {.vpid = local_rank, .jobid = OMPI_PROC_MY_NAME->jobid};
361-
362-
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
363-
continue;
364-
}
365-
(void) ompi_proc_for_name (proc_name);
366-
}
367-
opal_argv_free(peers);
354+
/* if cutoff is larger than # of procs - add all processes
355+
* NOTE that local procs will be automatically skipped as they
356+
* are already in the hash table
357+
*/
358+
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
359+
/* sinse ompi_proc_for_name is locking internally -
360+
* we need to release lock here
361+
*/
362+
opal_mutex_unlock (&ompi_proc_lock);
363+
364+
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
365+
opal_process_name_t proc_name;
366+
proc_name.jobid = OMPI_PROC_MY_NAME->jobid;
367+
proc_name.vpid = i;
368+
(void) ompi_proc_for_name (proc_name);
368369
}
370+
371+
/* acquire lock back for the next step - sort */
372+
opal_mutex_lock (&ompi_proc_lock);
369373
}
370374

371375
opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);
372376

377+
opal_mutex_unlock (&ompi_proc_lock);
378+
373379
return errcode;
374380
}
375381

0 commit comments

Comments
 (0)