Commit 3166c44

topo/treematch: fix topo_treematch_distgraph_create

1 parent: 0e68a42

1 file changed: +68 additions, -82 deletions

ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c

@@ -53,7 +53,16 @@
 #define MY_STRING_SIZE 64
 /*#define __DEBUG__ 1 */
 
-
+/**
+ * This function is an allreduce between all processes to detect oversubscription.
+ * On each node, local_procs will be a different array that contains only the
+ * local processes. Thus, that process will compute the node's oversubscription and
+ * bring this value to the operation, while every other process on the node will
+ * contribute 0.
+ * Doing an AllReduce might be overkill for this situation, but it should remain
+ * more scalable than a star reduction (between the roots of each node (nodes_roots)),
+ * followed by a bcast to all processes.
+ */
 static int check_oversubscribing(int rank,
                                  int num_nodes,
                                  int num_objs_in_node,
@@ -62,52 +71,16 @@ static int check_oversubscribing(int rank,
                                  int *local_procs,
                                  ompi_communicator_t *comm_old)
 {
-    int oversubscribed = 0;
-    int local_oversub = 0;
-    int err;
+    int oversubscribed = 0, local_oversub = 0, err;
 
+    /* Only a single process per node, the local root, computes the oversubscription condition */
     if (rank == local_procs[0])
         if(num_objs_in_node < num_procs_in_node)
             local_oversub = 1;
 
-    if (rank == 0) {
-        MPI_Request *reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
-        int *oversub = (int *)calloc(num_nodes, sizeof(int));
-        int i;
-
-        oversub[0] = local_oversub;
-        for(i = 1; i < num_nodes; i++)
-            if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&oversub[i], 1, MPI_INT,
-                                                           nodes_roots[i], 111, comm_old, &reqs[i-1])))) {
-                /* NTH: more needs to be done to correctly clean up here */
-                free (reqs);
-                free (oversub);
-                return err;
-            }
-
-        if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes-1,
-                                                          reqs, MPI_STATUSES_IGNORE))) {
-            /* NTH: more needs to be done to correctly clean up here */
-            free (reqs);
-            free (oversub);
-            return err;
-        }
-
-        for(i = 0; i < num_nodes; i++)
-            oversubscribed += oversub[i];
-
-        free(oversub);
-        free(reqs);
-    } else {
-        if (rank == local_procs[0])
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&local_oversub, 1, MPI_INT, 0,
-                                                         111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
-                return err;
-    }
 
-    if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_bcast(&oversubscribed, 1,
-                                                           MPI_INT, 0, comm_old,
-                                                           comm_old->c_coll.coll_bcast_module)))
+    if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
+                                                               MPI_SUM, comm_old, comm_old->c_coll.coll_allreduce_module)))
         return err;
 
     return oversubscribed;
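The replacement collapses the old star (point-to-point irecv/send gathered at rank 0, then a bcast) into a single summed allreduce, exactly as the new comment describes: the local root of each node contributes its oversubscription flag and every other process contributes 0. Below is a minimal standalone sketch of the same pattern written against the public MPI API rather than Open MPI's internal c_coll framework; the detect_oversubscription helper, its is_local_root flag, and the per-node counts in main are illustrative assumptions, not part of the patch.

```c
#include <mpi.h>
#include <stdio.h>

/* One "local root" per node contributes 1 if its node is oversubscribed,
 * everyone else contributes 0; MPI_Allreduce with MPI_SUM then gives every
 * process the number of oversubscribed nodes. */
static int detect_oversubscription(int is_local_root,
                                   int num_objs_in_node,
                                   int num_procs_in_node,
                                   MPI_Comm comm)
{
    int local_oversub = 0, oversubscribed = 0;

    if (is_local_root && (num_objs_in_node < num_procs_in_node))
        local_oversub = 1;

    MPI_Allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
                  MPI_SUM, comm);
    return oversubscribed;  /* > 0 if any node is oversubscribed */
}

int main(int argc, char *argv[])
{
    int rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Pretend rank 0 is the only local root and its node offers
     * 4 objects for 8 processes: every rank learns the result. */
    int over = detect_oversubscription(0 == rank, 4, 8, MPI_COMM_WORLD);
    if (0 == rank)
        printf("oversubscribed nodes: %d\n", over);

    MPI_Finalize();
    return 0;
}
```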
@@ -161,7 +134,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     int num_procs_in_node = 0;
     int rank, size;
     int hwloc_err;
-    int oversubscribing_objs = 0;
+    int oversubscribing_objs = 0, oversubscribed_pus = 0;
     int i, j, idx;
     uint32_t val, *pval;
 
@@ -267,8 +240,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     hwloc_get_cpubind(opal_hwloc_topology,set,0);
     num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
 
-    if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){
-        /* processes are not bound on the machine */
+    /**
+     * In all situations (including heterogeneous environments) all processes must execute
+     * all the calls that involve collective communications, so we have to lay out the logic
+     * accordingly.
+     */
+    if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
 #ifdef __DEBUG__
         if (0 == rank)
             fprintf(stdout,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n");
@@ -283,60 +260,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         oversubscribing_objs = check_oversubscribing(rank,num_nodes,
                                                      num_objs_in_node,num_procs_in_node,
                                                      nodes_roots,local_procs,comm_old);
-        if(oversubscribing_objs) {
+    } else { /* the processes are already bound */
+        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set);
+        obj_rank = object->logical_index;
+        effective_depth = object->depth;
+        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+
+        /* Check for oversubscribing */
+        oversubscribing_objs = check_oversubscribing(rank,num_nodes,
+                                                     num_objs_in_node,num_procs_in_node,
+                                                     nodes_roots,local_procs,comm_old);
+    }
+
+    if(oversubscribing_objs) {
+        if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
 #ifdef __DEBUG__
             fprintf(stdout,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n");
 #endif
-            int oversubscribed_pus = check_oversubscribing(rank,num_nodes,
-                                                           num_pus_in_node,num_procs_in_node,
-                                                           nodes_roots,local_procs,comm_old);
-            if (oversubscribed_pus){
-#ifdef __DEBUG__
-                fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
-#endif
-                FALLBACK();
-            } else {
+            oversubscribed_pus = check_oversubscribing(rank,num_nodes,
+                                                       num_pus_in_node,num_procs_in_node,
+                                                       nodes_roots,local_procs,comm_old);
+        } else {
+            /* Bound processes will participate with the same data as before */
+            oversubscribed_pus = check_oversubscribing(rank,num_nodes,
+                                                       num_objs_in_node,num_procs_in_node,
+                                                       nodes_roots,local_procs,comm_old);
+        }
+        if (!oversubscribed_pus) {
+            /* Update the data used to compute the correct binding */
+            if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
                 obj_rank = ompi_process_info.my_local_rank%num_pus_in_node;
                 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
                 num_objs_in_node = num_pus_in_node;
 #ifdef __DEBUG__
                 fprintf(stdout,"Process not bound : binding on PU#%i \n",obj_rank);
 #endif
             }
-        } else {
-            obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
-            effective_depth = depth;
-            object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
-            if( NULL == object) FALLBACK();
-
-            hwloc_bitmap_copy(set,object->cpuset);
-            hwloc_bitmap_singlify(set); /* we don't want the process to move */
-            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
-            if( -1 == hwloc_err) FALLBACK();
-#ifdef __DEBUG__
-            fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
-#endif
         }
-    } else { /* the processes are already bound */
-        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set);
-        obj_rank = object->logical_index;
-        effective_depth = object->depth;
-        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+    }
 
-        /* Check for oversubscribing */
-        oversubscribing_objs = check_oversubscribing(rank,num_nodes,
-                                                     num_objs_in_node,num_procs_in_node,
-                                                     nodes_roots,local_procs,comm_old);
-        if(oversubscribing_objs) {
+    if( !oversubscribing_objs && !oversubscribed_pus ) {
+        if( hwloc_bitmap_isincluded(root_obj->cpuset,set) ) { /* processes are not bound on the machine */
+            obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
+            effective_depth = depth;
+            object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
+            if( NULL == object) FALLBACK();
+
+            hwloc_bitmap_copy(set,object->cpuset);
+            hwloc_bitmap_singlify(set); /* we don't want the process to move */
+            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
+            if( -1 == hwloc_err) FALLBACK();
+#ifdef __DEBUG__
+            fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
+#endif
+        } else {
 #ifdef __DEBUG__
-            fprintf(stdout,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n");
+            fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank);
+            fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node);
 #endif
-            FALLBACK();
         }
+    } else {
 #ifdef __DEBUG__
-        fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank);
-        fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node);
+        fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
 #endif
+        FALLBACK();
     }
 
     reqs = (MPI_Request *)calloc(num_procs_in_node-1,sizeof(MPI_Request));
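In the non-oversubscribed, unbound case the patch picks an object by local rank, copies its cpuset, singlifies it so the process cannot migrate between PUs, and applies the binding. A self-contained sketch of that copy/singlify/set_cpubind sequence, using HWLOC_OBJ_CORE and a hypothetical local_rank parameter in place of the patch's effective_depth/obj_rank bookkeeping:

```c
#include <hwloc.h>
#include <stdio.h>

/* Bind the calling process to one core chosen by an assumed local rank,
 * mirroring the copy/singlify/set_cpubind sequence in the patch. */
static int bind_to_core(hwloc_topology_t topo, int local_rank)
{
    int num_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
    if (num_cores <= 0)
        return -1;

    hwloc_obj_t obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE,
                                            local_rank % num_cores);
    if (NULL == obj)
        return -1;

    hwloc_bitmap_t set = hwloc_bitmap_alloc();
    hwloc_bitmap_copy(set, obj->cpuset);
    hwloc_bitmap_singlify(set);  /* keep a single PU: no migration */
    int err = hwloc_set_cpubind(topo, set, 0);
    hwloc_bitmap_free(set);
    return err;
}

int main(void)
{
    hwloc_topology_t topo;
    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    if (0 == bind_to_core(topo, 0))
        printf("bound to core 0\n");

    hwloc_topology_destroy(topo);
    return 0;
}
```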
@@ -491,7 +478,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         for(i = 1; i < num_nodes ; i++)
             displs[i] = displs[i-1] + objs_per_node[i-1];
 
-        memset(reqs,0,(num_nodes-1)*sizeof(MPI_Request));
         memcpy(obj_mapping,obj_to_rank_in_comm,objs_per_node[0]*sizeof(int));
         for(i = 1; i < num_nodes ; i++)
             if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displs[i], objs_per_node[i], MPI_INT,