 #define MY_STRING_SIZE 64
 /*#define __DEBUG__ 1 */
 
-
+/**
+ * This function is an allreduce across all processes to detect oversubscription.
+ * On each node, local_procs is a different array that contains only the local
+ * processes. The local root computes the node's oversubscription and brings this
+ * value to the operation, while every other process on the node contributes 0.
+ * Doing an allreduce might be overkill for this situation, but it should remain
+ * more scalable than a star reduction between the roots of each node (nodes_roots)
+ * followed by a bcast to all processes.
+ */
 static int check_oversubscribing(int rank,
                                  int num_nodes,
                                  int num_objs_in_node,
@@ -62,52 +71,16 @@ static int check_oversubscribing(int rank,
                                  int *local_procs,
                                  ompi_communicator_t *comm_old)
 {
-    int oversubscribed = 0;
-    int local_oversub = 0;
-    int err;
+    int oversubscribed = 0, local_oversub = 0, err;
 
+    /* Only a single process per node, the local root, computes the oversubscription condition */
     if (rank == local_procs[0])
         if (num_objs_in_node < num_procs_in_node)
             local_oversub = 1;
 
-    if (rank == 0) {
-        MPI_Request *reqs = (MPI_Request *)calloc(num_nodes - 1, sizeof(MPI_Request));
-        int *oversub = (int *)calloc(num_nodes, sizeof(int));
-        int i;
-
-        oversub[0] = local_oversub;
-        for(i = 1; i < num_nodes; i++)
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(irecv(&oversub[i], 1, MPI_INT,
-                                                          nodes_roots[i], 111, comm_old, &reqs[i-1])))) {
-                /* NTH: more needs to be done to correctly clean up here */
-                free(reqs);
-                free(oversub);
-                return err;
-            }
-
-        if (OMPI_SUCCESS != (err = ompi_request_wait_all(num_nodes - 1,
-                                                         reqs, MPI_STATUSES_IGNORE))) {
-            /* NTH: more needs to be done to correctly clean up here */
-            free(reqs);
-            free(oversub);
-            return err;
-        }
-
-        for(i = 0; i < num_nodes; i++)
-            oversubscribed += oversub[i];
-
-        free(oversub);
-        free(reqs);
-    } else {
-        if (rank == local_procs[0])
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&local_oversub, 1, MPI_INT, 0,
-                                                         111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
-                return err;
-    }
 
-    if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_bcast(&oversubscribed, 1,
-                                                           MPI_INT, 0, comm_old,
-                                                           comm_old->c_coll.coll_bcast_module)))
+    if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
+                                                               MPI_SUM, comm_old, comm_old->c_coll.coll_allreduce_module)))
         return err;
 
     return oversubscribed;
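For readers less familiar with Open MPI's internal c_coll interface, the same detection pattern can be sketched with the public MPI API. The helper below is illustrative only (its name and the is_local_root/objs/procs parameters are not part of this commit): only the local root of each node contributes 1 when the node has fewer objects than processes, and the sum over all ranks gives the number of oversubscribed nodes.

#include <mpi.h>

/* Hedged sketch of the allreduce-based detection; all names are placeholders. */
static int node_oversubscription(MPI_Comm comm, int is_local_root,
                                 int objs, int procs)
{
    int mine = 0, total = 0;
    if (is_local_root && objs < procs)
        mine = 1;                 /* only the local root contributes */
    MPI_Allreduce(&mine, &total, 1, MPI_INT, MPI_SUM, comm);
    return total;                 /* number of oversubscribed nodes */
}

Because every rank enters the allreduce, the rank-0 irecv/send/bcast bookkeeping deleted above is no longer needed.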
@@ -161,7 +134,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     int num_procs_in_node = 0;
     int rank, size;
     int hwloc_err;
-    int oversubscribing_objs = 0;
+    int oversubscribing_objs = 0, oversubscribed_pus = 0;
     int i, j, idx;
     uint32_t val, *pval;
 
@@ -267,8 +240,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     hwloc_get_cpubind(opal_hwloc_topology, set, 0);
     num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
 
-    if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) {
-        /* processes are not bound on the machine */
+    /**
+     * In all situations (including heterogeneous environments) all processes must execute
+     * all the calls that involve collective communications, so we have to lay out the
+     * logic accordingly.
+     */
+    if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
 #ifdef __DEBUG__
         if (0 == rank)
             fprintf(stdout, ">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n");
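The comment added in this hunk states the constraint behind the restructuring that follows: a collective must be entered by every rank of the communicator, so only its inputs, never the call itself, may depend on whether the process is bound. A minimal illustrative shape of that rule (placeholder names, not code from this commit):

#include <mpi.h>

/* Hedged sketch: branch only to pick the local contribution, then let every rank
 * make the same collective call. Putting MPI_Allreduce inside a rank-dependent
 * branch would leave the other ranks blocked forever. */
static int reduce_branch_inputs(MPI_Comm comm, int is_bound,
                                int bound_input, int unbound_input)
{
    int contribution = is_bound ? bound_input : unbound_input;
    int sum = 0;
    MPI_Allreduce(&contribution, &sum, 1, MPI_INT, MPI_SUM, comm);
    return sum;
}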
@@ -283,60 +260,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         oversubscribing_objs = check_oversubscribing(rank, num_nodes,
                                                      num_objs_in_node, num_procs_in_node,
                                                      nodes_roots, local_procs, comm_old);
-        if (oversubscribing_objs) {
+    } else { /* the processes are already bound */
+        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
+        obj_rank = object->logical_index;
+        effective_depth = object->depth;
+        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+
+        /* Check for oversubscribing */
+        oversubscribing_objs = check_oversubscribing(rank, num_nodes,
+                                                     num_objs_in_node, num_procs_in_node,
+                                                     nodes_roots, local_procs, comm_old);
+    }
+
+    if (oversubscribing_objs) {
+        if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
 #ifdef __DEBUG__
             fprintf(stdout, "Oversubscribing OBJ/CORES resources => Trying to use PUs \n");
 #endif
-            int oversubscribed_pus = check_oversubscribing(rank, num_nodes,
-                                                           num_pus_in_node, num_procs_in_node,
-                                                           nodes_roots, local_procs, comm_old);
-            if (oversubscribed_pus) {
-#ifdef __DEBUG__
-                fprintf(stdout, "Oversubscribing PUs resources => Rank Reordering Impossible \n");
-#endif
-                FALLBACK();
-            } else {
+            oversubscribed_pus = check_oversubscribing(rank, num_nodes,
+                                                       num_pus_in_node, num_procs_in_node,
+                                                       nodes_roots, local_procs, comm_old);
+        } else {
+            /* Bound processes will participate with the same data as before */
+            oversubscribed_pus = check_oversubscribing(rank, num_nodes,
+                                                       num_objs_in_node, num_procs_in_node,
+                                                       nodes_roots, local_procs, comm_old);
+        }
+        if (!oversubscribed_pus) {
+            /* Update the data used to compute the correct binding */
+            if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
                 obj_rank = ompi_process_info.my_local_rank % num_pus_in_node;
                 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
                 num_objs_in_node = num_pus_in_node;
 #ifdef __DEBUG__
                 fprintf(stdout, "Process not bound : binding on PU#%i \n", obj_rank);
 #endif
             }
-        } else {
-            obj_rank = ompi_process_info.my_local_rank % num_objs_in_node;
-            effective_depth = depth;
-            object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
-            if (NULL == object) FALLBACK();
-
-            hwloc_bitmap_copy(set, object->cpuset);
-            hwloc_bitmap_singlify(set); /* we don't want the process to move */
-            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
-            if (-1 == hwloc_err) FALLBACK();
-#ifdef __DEBUG__
-            fprintf(stdout, "Process not bound : binding on OBJ#%i \n", obj_rank);
-#endif
         }
-    } else { /* the processes are already bound */
-        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
-        obj_rank = object->logical_index;
-        effective_depth = object->depth;
-        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+    }
 
-        /* Check for oversubscribing */
-        oversubscribing_objs = check_oversubscribing(rank, num_nodes,
-                                                     num_objs_in_node, num_procs_in_node,
-                                                     nodes_roots, local_procs, comm_old);
-        if (oversubscribing_objs) {
+    if (!oversubscribing_objs && !oversubscribed_pus) {
+        if (hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
+            obj_rank = ompi_process_info.my_local_rank % num_objs_in_node;
+            effective_depth = depth;
+            object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
+            if (NULL == object) FALLBACK();
+
+            hwloc_bitmap_copy(set, object->cpuset);
+            hwloc_bitmap_singlify(set); /* we don't want the process to move */
+            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
+            if (-1 == hwloc_err) FALLBACK();
+#ifdef __DEBUG__
+            fprintf(stdout, "Process not bound : binding on OBJ#%i \n", obj_rank);
+#endif
+        } else {
 #ifdef __DEBUG__
-            fprintf(stdout, "Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n");
+            fprintf(stdout, "Process %i bound on OBJ #%i \n", rank, obj_rank);
+            fprintf(stdout, "=====> Num obj in node : %i | num pus in node : %i\n", num_objs_in_node, num_pus_in_node);
 #endif
-            FALLBACK();
         }
+    } else {
 #ifdef __DEBUG__
-        fprintf(stdout, "Process %i bound on OBJ #%i \n", rank, obj_rank);
-        fprintf(stdout, "=====> Num obj in node : %i | num pus in node : %i\n", num_objs_in_node, num_pus_in_node);
+        fprintf(stdout, "Oversubscribing PUs resources => Rank Reordering Impossible \n");
 #endif
+        FALLBACK();
     }
 
     reqs = (MPI_Request *)calloc(num_procs_in_node - 1, sizeof(MPI_Request));
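The binding sequence itself (look up the object at a given depth, reduce its cpuset to a single PU, bind) is not new logic; the hunk above only moves it behind the combined !oversubscribing_objs && !oversubscribed_pus test. As a standalone reference, a sketch of that hwloc sequence, assuming an already loaded topology and using an illustrative helper name, could look like:

#include <hwloc.h>

/* Hedged sketch of the relocated binding step: pick the object at (depth, index),
 * singlify its cpuset so the process cannot migrate between PUs, then bind.
 * Returns 0 on success, -1 on failure. */
static int bind_to_object(hwloc_topology_t topo, int depth, unsigned index)
{
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topo, depth, index);
    if (NULL == obj)
        return -1;

    hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
    if (NULL == cpuset)
        return -1;

    hwloc_bitmap_copy(cpuset, obj->cpuset);
    hwloc_bitmap_singlify(cpuset);               /* keep exactly one PU */
    int rc = hwloc_set_cpubind(topo, cpuset, 0); /* 0: default binding flags */
    hwloc_bitmap_free(cpuset);
    return (0 == rc) ? 0 : -1;
}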
@@ -491,7 +478,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     for(i = 1; i < num_nodes; i++)
         displs[i] = displs[i - 1] + objs_per_node[i - 1];
 
-    memset(reqs, 0, (num_nodes - 1) * sizeof(MPI_Request));
     memcpy(obj_mapping, obj_to_rank_in_comm, objs_per_node[0] * sizeof(int));
     for(i = 1; i < num_nodes; i++)
         if (OMPI_SUCCESS != (err = MCA_PML_CALL(irecv(obj_mapping + displs[i], objs_per_node[i], MPI_INT,
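The hunk above drops a memset of the request array and keeps the point-to-point gather of per-node object mappings: a prefix-sum displacement table, then an irecv per remaining node. The displacement rule is the same one MPI_Gatherv expects for contiguous blocks; a minimal illustrative helper (not part of the commit) would be:

/* Hedged sketch: each node's block of object-to-rank entries starts right after
 * the previous node's block. */
static void build_displacements(const int *objs_per_node, int num_nodes, int *displs)
{
    displs[0] = 0;
    for (int i = 1; i < num_nodes; i++)
        displs[i] = displs[i - 1] + objs_per_node[i - 1];
}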