
Commit 5a1dbce

topo/treematch: fix topo_treematch_dist_graph_create
Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent aee9b9b commit 5a1dbce

File tree

1 file changed: +68 -82 lines changed


ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c

Lines changed: 68 additions & 82 deletions
@@ -54,7 +54,16 @@
 #define MY_STRING_SIZE 64
 /*#define __DEBUG__ 1 */
 
-
+/**
+ * This function performs an allreduce across all processes to detect oversubscription.
+ * On each node, local_procs is a different array that contains only the local
+ * processes. The first local process computes the node's oversubscription and
+ * contributes this value to the operation, while every other process on the
+ * node contributes 0.
+ * Doing an allreduce might be overkill for this situation, but it should remain
+ * more scalable than a star reduction between the roots of each node
+ * (nodes_roots) followed by a bcast to all processes.
+ */
 static int check_oversubscribing(int rank,
                                  int num_nodes,
                                  int num_objs_in_node,
@@ -63,52 +72,16 @@ static int check_oversubscribing(int rank,
                                  int *local_procs,
                                  ompi_communicator_t *comm_old)
 {
-    int oversubscribed = 0;
-    int local_oversub = 0;
-    int err;
+    int oversubscribed = 0, local_oversub = 0, err;
 
+    /* Only a single process per node, the local root, computes the oversubscription condition */
     if (rank == local_procs[0])
         if(num_objs_in_node < num_procs_in_node)
             local_oversub = 1;
 
-    if (rank == 0) {
-        MPI_Request *reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
-        int *oversub = (int *)calloc(num_nodes, sizeof(int));
-        int i;
-
-        oversub[0] = local_oversub;
-        for(i = 1; i < num_nodes; i++)
-            if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&oversub[i], 1, MPI_INT,
-                                                           nodes_roots[i], 111, comm_old, &reqs[i-1])))) {
-                /* NTH: more needs to be done to correctly clean up here */
-                free (reqs);
-                free (oversub);
-                return err;
-            }
-
-        if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes-1,
-                                                          reqs, MPI_STATUSES_IGNORE))) {
-            /* NTH: more needs to be done to correctly clean up here */
-            free (reqs);
-            free (oversub);
-            return err;
-        }
-
-        for(i = 0; i < num_nodes; i++)
-            oversubscribed += oversub[i];
-
-        free(oversub);
-        free(reqs);
-    } else {
-        if (rank == local_procs[0])
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&local_oversub, 1, MPI_INT, 0,
-                                                         111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
-                return err;
-    }
 
-    if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_bcast(&oversubscribed, 1,
-                                                            MPI_INT, 0, comm_old,
-                                                            comm_old->c_coll->coll_bcast_module)))
+    if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
+                                                               MPI_SUM, comm_old, comm_old->c_coll.coll_allreduce_module)))
         return err;
 
     return oversubscribed;
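
The change above replaces a star pattern (rank 0 posts num_nodes-1 irecvs, each node root sends its verdict, then a bcast redistributes the result) with a single allreduce in which each local root contributes 1 when its node is oversubscribed and every other rank contributes 0. The following standalone sketch (not part of the commit) reproduces that pattern with plain MPI calls; OBJS_PER_NODE is a hypothetical stand-in for the object count the real code obtains from hwloc, and MPI_COMM_TYPE_SHARED is used here as a per-node proxy.

    /* Minimal sketch of the allreduce-based oversubscription check. */
    #include <mpi.h>
    #include <stdio.h>

    #define OBJS_PER_NODE 4   /* hypothetical: slots available on each node */

    int main(int argc, char *argv[])
    {
        int rank, local_rank, local_size;
        int local_oversub = 0, oversubscribed = 0;
        MPI_Comm node_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* One communicator per node, to identify the local root and node size */
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                            MPI_INFO_NULL, &node_comm);
        MPI_Comm_rank(node_comm, &local_rank);
        MPI_Comm_size(node_comm, &local_size);

        /* Only the local root contributes the node's verdict; others add 0 */
        if (0 == local_rank && local_size > OBJS_PER_NODE)
            local_oversub = 1;

        /* One collective replaces the old irecv/send star plus bcast */
        MPI_Allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
                      MPI_SUM, MPI_COMM_WORLD);

        if (0 == rank)
            printf("oversubscribed nodes: %d\n", oversubscribed);

        MPI_Comm_free(&node_comm);
        MPI_Finalize();
        return 0;
    }

Beyond removing the serial receive loop at rank 0, the allreduce lets the MPI library pick a topology-aware reduction algorithm, which is the scalability argument made in the new comment.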
@@ -162,7 +135,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     int num_procs_in_node = 0;
     int rank, size;
     int hwloc_err;
-    int oversubscribing_objs = 0;
+    int oversubscribing_objs = 0, oversubscribed_pus = 0;
     int i, j, idx;
     uint32_t val, *pval;
 
@@ -268,8 +241,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     hwloc_get_cpubind(opal_hwloc_topology,set,0);
     num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
 
-    if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){
-        /* processes are not bound on the machine */
+    /**
+     * In all situations (including heterogeneous environments) all processes must execute
+     * all the calls that involve collective communications, so we have to lay out the logic
+     * accordingly.
+     */
+    if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
 #ifdef __DEBUG__
         if (0 == rank)
             fprintf(stdout,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n");
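
The new block comment states the MPI correctness rule that drives this restructuring: every rank of the communicator must reach every collective call, so branches may select a collective's arguments but must not decide whether the collective runs at all. Below is a standalone sketch (not OMPI code) of the hazard, using a hypothetical even/odd split in place of the bound/unbound distinction.

    /* Sketch: collectives must be executed by all ranks of the communicator. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        int rank, contribution, total;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* WRONG shape (would hang): only even ranks reach the collective.
         *
         *   if (rank % 2 == 0)
         *       MPI_Allreduce(&contribution, &total, 1, MPI_INT,
         *                     MPI_SUM, MPI_COMM_WORLD);
         */

        /* RIGHT shape, as in the patch: the branch only picks the data,
         * and every rank then executes the same collective */
        contribution = (rank % 2 == 0) ? 1 : 0;
        MPI_Allreduce(&contribution, &total, 1, MPI_INT,
                      MPI_SUM, MPI_COMM_WORLD);

        if (0 == rank)
            printf("even ranks counted: %d\n", total);

        MPI_Finalize();
        return 0;
    }

This is exactly why both the bound and unbound paths in the hunk below call check_oversubscribing, which now ends in an allreduce, instead of letting one path skip it.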
@@ -284,60 +261,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         oversubscribing_objs = check_oversubscribing(rank,num_nodes,
                                                      num_objs_in_node,num_procs_in_node,
                                                      nodes_roots,local_procs,comm_old);
-        if(oversubscribing_objs) {
+    } else { /* the processes are already bound */
+        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set);
+        obj_rank = object->logical_index;
+        effective_depth = object->depth;
+        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+
+        /* Check for oversubscribing */
+        oversubscribing_objs = check_oversubscribing(rank,num_nodes,
+                                                     num_objs_in_node,num_procs_in_node,
+                                                     nodes_roots,local_procs,comm_old);
+    }
+
+    if(oversubscribing_objs) {
+        if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
 #ifdef __DEBUG__
             fprintf(stdout,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n");
 #endif
-            int oversubscribed_pus = check_oversubscribing(rank,num_nodes,
-                                                           num_pus_in_node,num_procs_in_node,
-                                                           nodes_roots,local_procs,comm_old);
-            if (oversubscribed_pus){
-#ifdef __DEBUG__
-                fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
-#endif
-                FALLBACK();
-            } else {
+            oversubscribed_pus = check_oversubscribing(rank,num_nodes,
+                                                       num_pus_in_node,num_procs_in_node,
+                                                       nodes_roots,local_procs,comm_old);
+        } else {
+            /* Bound processes will participate with the same data as before */
+            oversubscribed_pus = check_oversubscribing(rank,num_nodes,
+                                                       num_objs_in_node,num_procs_in_node,
+                                                       nodes_roots,local_procs,comm_old);
+        }
+        if (!oversubscribed_pus) {
+            /* Update the data used to compute the correct binding */
+            if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
                 obj_rank = ompi_process_info.my_local_rank%num_pus_in_node;
                 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
                 num_objs_in_node = num_pus_in_node;
 #ifdef __DEBUG__
                 fprintf(stdout,"Process not bound : binding on PU#%i \n",obj_rank);
 #endif
             }
-        } else {
-            obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
-            effective_depth = depth;
-            object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
-            if( NULL == object) FALLBACK();
-
-            hwloc_bitmap_copy(set,object->cpuset);
-            hwloc_bitmap_singlify(set); /* we don't want the process to move */
-            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
-            if( -1 == hwloc_err) FALLBACK();
-#ifdef __DEBUG__
-            fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
-#endif
         }
-    } else { /* the processes are already bound */
-        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set);
-        obj_rank = object->logical_index;
-        effective_depth = object->depth;
-        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+    }
 
-        /* Check for oversubscribing */
-        oversubscribing_objs = check_oversubscribing(rank,num_nodes,
-                                                     num_objs_in_node,num_procs_in_node,
-                                                     nodes_roots,local_procs,comm_old);
-        if(oversubscribing_objs) {
+    if( !oversubscribing_objs && !oversubscribed_pus ) {
+        if( hwloc_bitmap_isincluded(root_obj->cpuset,set) ) { /* processes are not bound on the machine */
+            obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
+            effective_depth = depth;
+            object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
+            if( NULL == object) FALLBACK();
+
+            hwloc_bitmap_copy(set,object->cpuset);
+            hwloc_bitmap_singlify(set); /* we don't want the process to move */
+            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
+            if( -1 == hwloc_err) FALLBACK();
+#ifdef __DEBUG__
+            fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
+#endif
+        } else {
 #ifdef __DEBUG__
-            fprintf(stdout,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n");
+            fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank);
+            fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node);
 #endif
-            FALLBACK();
         }
+    } else {
 #ifdef __DEBUG__
-        fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank);
-        fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node);
+        fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
 #endif
+        FALLBACK();
     }
 
     reqs = (MPI_Request *)calloc(num_procs_in_node-1,sizeof(MPI_Request));
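
For reference, the relocated unbound-process path above binds each process to a single processing unit through hwloc: it copies the chosen object's cpuset, singlifies it down to one PU so the scheduler cannot migrate the process, then applies it with hwloc_set_cpubind. The standalone sketch below walks the same sequence against the public hwloc API; bind_local_rank and its local_rank parameter are hypothetical stand-ins for the surrounding OMPI state.

    /* Sketch: bind the calling process to one PU chosen by local rank. */
    #include <hwloc.h>
    #include <stdio.h>

    static int bind_local_rank(int local_rank)
    {
        hwloc_topology_t topo;
        hwloc_obj_t obj;
        hwloc_cpuset_t set;
        int depth, nobjs, err = -1;

        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);

        /* The deepest level holds the PUs, as in the patch's PU fallback */
        depth = hwloc_topology_get_depth(topo) - 1;
        nobjs = hwloc_get_nbobjs_by_depth(topo, depth);

        obj = hwloc_get_obj_by_depth(topo, depth, local_rank % nobjs);
        if (NULL != obj) {
            set = hwloc_bitmap_alloc();
            hwloc_bitmap_copy(set, obj->cpuset);
            hwloc_bitmap_singlify(set);      /* keep exactly one PU */
            err = hwloc_set_cpubind(topo, set, 0);
            hwloc_bitmap_free(set);
        }
        hwloc_topology_destroy(topo);
        return err;
    }

    int main(void)
    {
        if (0 != bind_local_rank(0))         /* hypothetical local rank 0 */
            fprintf(stderr, "binding failed or not supported\n");
        return 0;
    }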
@@ -492,7 +479,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         for(i = 1; i < num_nodes ; i++)
             displs[i] = displs[i-1] + objs_per_node[i-1];
 
-        memset(reqs,0,(num_nodes-1)*sizeof(MPI_Request));
         memcpy(obj_mapping,obj_to_rank_in_comm,objs_per_node[0]*sizeof(int));
         for(i = 1; i < num_nodes ; i++)
             if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displs[i], objs_per_node[i], MPI_INT,
