5454#define MY_STRING_SIZE 64
5555/*#define __DEBUG__ 1 */
5656
57-
57+ /**
58+ * This function is a allreduce between all processes to detect for oversubscription.
59+ * On each node, the local_procs will be a different array, that contains only the
60+ * local processes. Thus, that process will compute the node oversubscription and will
61+ * bring this value to the operation, while every other process on the node will
62+ * contribute 0.
63+ * Doing an AllReduce might be an overkill for this situation, but it should remain
64+ * more scalable than a star reduction (between the roots of each node (nodes_roots),
65+ * followed by a bcast to all processes.
66+ */
5867static int check_oversubscribing (int rank ,
5968 int num_nodes ,
6069 int num_objs_in_node ,
@@ -63,52 +72,16 @@ static int check_oversubscribing(int rank,
6372 int * local_procs ,
6473 ompi_communicator_t * comm_old )
6574{
66- int oversubscribed = 0 ;
67- int local_oversub = 0 ;
68- int err ;
75+ int oversubscribed = 0 , local_oversub = 0 , err ;
6976
77+ /* Only a single process per node, the local root, compute the oversubscription condition */
7078 if (rank == local_procs [0 ])
7179 if (num_objs_in_node < num_procs_in_node )
7280 local_oversub = 1 ;
7381
74- if (rank == 0 ) {
75- MPI_Request * reqs = (MPI_Request * )calloc (num_nodes - 1 , sizeof (MPI_Request ));
76- int * oversub = (int * )calloc (num_nodes , sizeof (int ));
77- int i ;
78-
79- oversub [0 ] = local_oversub ;
80- for (i = 1 ; i < num_nodes ; i ++ )
81- if (OMPI_SUCCESS != ( err = MCA_PML_CALL (irecv (& oversub [i ], 1 , MPI_INT ,
82- nodes_roots [i ], 111 , comm_old , & reqs [i - 1 ])))) {
83- /* NTH: more needs to be done to correctly clean up here */
84- free (reqs );
85- free (oversub );
86- return err ;
87- }
88-
89- if (OMPI_SUCCESS != ( err = ompi_request_wait_all (num_nodes - 1 ,
90- reqs , MPI_STATUSES_IGNORE ))) {
91- /* NTH: more needs to be done to correctly clean up here */
92- free (reqs );
93- free (oversub );
94- return err ;
95- }
96-
97- for (i = 0 ; i < num_nodes ; i ++ )
98- oversubscribed += oversub [i ];
99-
100- free (oversub );
101- free (reqs );
102- } else {
103- if (rank == local_procs [0 ])
104- if (OMPI_SUCCESS != (err = MCA_PML_CALL (send (& local_oversub , 1 , MPI_INT , 0 ,
105- 111 , MCA_PML_BASE_SEND_STANDARD , comm_old ))))
106- return err ;
107- }
10882
109- if (OMPI_SUCCESS != (err = comm_old -> c_coll -> coll_bcast (& oversubscribed , 1 ,
110- MPI_INT , 0 , comm_old ,
111- comm_old -> c_coll -> coll_bcast_module )))
83+ if (OMPI_SUCCESS != (err = comm_old -> c_coll .coll_allreduce (& local_oversub , & oversubscribed , 1 , MPI_INT ,
84+ MPI_SUM , comm_old , comm_old -> c_coll .coll_allreduce_module )))
11285 return err ;
11386
11487 return oversubscribed ;
@@ -162,7 +135,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
162135 int num_procs_in_node = 0 ;
163136 int rank , size ;
164137 int hwloc_err ;
165- int oversubscribing_objs = 0 ;
138+ int oversubscribing_objs = 0 , oversubscribed_pus = 0 ;
166139 int i , j , idx ;
167140 uint32_t val , * pval ;
168141
@@ -268,8 +241,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
268241 hwloc_get_cpubind (opal_hwloc_topology ,set ,0 );
269242 num_pus_in_node = hwloc_get_nbobjs_by_type (opal_hwloc_topology , HWLOC_OBJ_PU );
270243
271- if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){
272- /* processes are not bound on the machine */
244+ /**
245+ * In all situations (including heterogeneous environments) all processes must execute
246+ * all the calls that involve collective communications, so we have to lay the logic
247+ * accordingly.
248+ */
249+ if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){ /* processes are not bound on the machine */
273250#ifdef __DEBUG__
274251 if (0 == rank )
275252 fprintf (stdout ,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n" );
@@ -284,60 +261,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
284261 oversubscribing_objs = check_oversubscribing (rank ,num_nodes ,
285262 num_objs_in_node ,num_procs_in_node ,
286263 nodes_roots ,local_procs ,comm_old );
287- if (oversubscribing_objs ) {
264+ } else { /* the processes are already bound */
265+ object = hwloc_get_obj_covering_cpuset (opal_hwloc_topology ,set );
266+ obj_rank = object -> logical_index ;
267+ effective_depth = object -> depth ;
268+ num_objs_in_node = hwloc_get_nbobjs_by_depth (opal_hwloc_topology , effective_depth );
269+
270+ /* Check for oversubscribing */
271+ oversubscribing_objs = check_oversubscribing (rank ,num_nodes ,
272+ num_objs_in_node ,num_procs_in_node ,
273+ nodes_roots ,local_procs ,comm_old );
274+ }
275+
276+ if (oversubscribing_objs ) {
277+ if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){ /* processes are not bound on the machine */
288278#ifdef __DEBUG__
289279 fprintf (stdout ,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n" );
290280#endif
291- int oversubscribed_pus = check_oversubscribing (rank ,num_nodes ,
292- num_pus_in_node ,num_procs_in_node ,
293- nodes_roots ,local_procs ,comm_old );
294- if (oversubscribed_pus ){
295- #ifdef __DEBUG__
296- fprintf (stdout ,"Oversubscribing PUs resources => Rank Reordering Impossible \n" );
297- #endif
298- FALLBACK ();
299- } else {
281+ oversubscribed_pus = check_oversubscribing (rank ,num_nodes ,
282+ num_pus_in_node ,num_procs_in_node ,
283+ nodes_roots ,local_procs ,comm_old );
284+ } else {
285+ /* Bound processes will participate with the same data as before */
286+ oversubscribed_pus = check_oversubscribing (rank ,num_nodes ,
287+ num_objs_in_node ,num_procs_in_node ,
288+ nodes_roots ,local_procs ,comm_old );
289+ }
290+ if (!oversubscribed_pus ) {
291+ /* Update the data used to compute the correct binding */
292+ if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){ /* processes are not bound on the machine */
300293 obj_rank = ompi_process_info .my_local_rank %num_pus_in_node ;
301294 effective_depth = hwloc_topology_get_depth (opal_hwloc_topology ) - 1 ;
302295 num_objs_in_node = num_pus_in_node ;
303296#ifdef __DEBUG__
304297 fprintf (stdout ,"Process not bound : binding on PU#%i \n" ,obj_rank );
305298#endif
306299 }
307- } else {
308- obj_rank = ompi_process_info .my_local_rank %num_objs_in_node ;
309- effective_depth = depth ;
310- object = hwloc_get_obj_by_depth (opal_hwloc_topology ,effective_depth ,obj_rank );
311- if ( NULL == object ) FALLBACK ();
312-
313- hwloc_bitmap_copy (set ,object -> cpuset );
314- hwloc_bitmap_singlify (set ); /* we don't want the process to move */
315- hwloc_err = hwloc_set_cpubind (opal_hwloc_topology ,set ,0 );
316- if ( -1 == hwloc_err ) FALLBACK ();
317- #ifdef __DEBUG__
318- fprintf (stdout ,"Process not bound : binding on OBJ#%i \n" ,obj_rank );
319- #endif
320300 }
321- } else { /* the processes are already bound */
322- object = hwloc_get_obj_covering_cpuset (opal_hwloc_topology ,set );
323- obj_rank = object -> logical_index ;
324- effective_depth = object -> depth ;
325- num_objs_in_node = hwloc_get_nbobjs_by_depth (opal_hwloc_topology , effective_depth );
301+ }
326302
327- /* Check for oversubscribing */
328- oversubscribing_objs = check_oversubscribing (rank ,num_nodes ,
329- num_objs_in_node ,num_procs_in_node ,
330- nodes_roots ,local_procs ,comm_old );
331- if (oversubscribing_objs ) {
303+ if ( !oversubscribing_objs && !oversubscribed_pus ) {
304+ if ( hwloc_bitmap_isincluded (root_obj -> cpuset ,set ) ) { /* processes are not bound on the machine */
305+ obj_rank = ompi_process_info .my_local_rank %num_objs_in_node ;
306+ effective_depth = depth ;
307+ object = hwloc_get_obj_by_depth (opal_hwloc_topology ,effective_depth ,obj_rank );
308+ if ( NULL == object ) FALLBACK ();
309+
310+ hwloc_bitmap_copy (set ,object -> cpuset );
311+ hwloc_bitmap_singlify (set ); /* we don't want the process to move */
312+ hwloc_err = hwloc_set_cpubind (opal_hwloc_topology ,set ,0 );
313+ if ( -1 == hwloc_err ) FALLBACK ();
314+ #ifdef __DEBUG__
315+ fprintf (stdout ,"Process not bound : binding on OBJ#%i \n" ,obj_rank );
316+ #endif
317+ } else {
332318#ifdef __DEBUG__
333- fprintf (stdout ,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n" );
319+ fprintf (stdout ,"Process %i bound on OBJ #%i \n" ,rank ,obj_rank );
320+ fprintf (stdout ,"=====> Num obj in node : %i | num pus in node : %i\n" ,num_objs_in_node ,num_pus_in_node );
334321#endif
335- FALLBACK ();
336322 }
323+ } else {
337324#ifdef __DEBUG__
338- fprintf (stdout ,"Process %i bound on OBJ #%i \n" ,rank ,obj_rank );
339- fprintf (stdout ,"=====> Num obj in node : %i | num pus in node : %i\n" ,num_objs_in_node ,num_pus_in_node );
325+ fprintf (stdout ,"Oversubscribing PUs resources => Rank Reordering Impossible \n" );
340326#endif
327+ FALLBACK ();
341328 }
342329
343330 reqs = (MPI_Request * )calloc (num_procs_in_node - 1 ,sizeof (MPI_Request ));
@@ -492,7 +479,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
492479 for (i = 1 ; i < num_nodes ; i ++ )
493480 displs [i ] = displs [i - 1 ] + objs_per_node [i - 1 ];
494481
495- memset (reqs ,0 ,(num_nodes - 1 )* sizeof (MPI_Request ));
496482 memcpy (obj_mapping ,obj_to_rank_in_comm ,objs_per_node [0 ]* sizeof (int ));
497483 for (i = 1 ; i < num_nodes ; i ++ )
498484 if (OMPI_SUCCESS != ( err = MCA_PML_CALL (irecv (obj_mapping + displs [i ], objs_per_node [i ], MPI_INT ,
0 commit comments