1919 * Copyright (c) 2017 Ian Bradley Morgan and Anthony Skjellum. All
2020 * rights reserved.
2121 * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
22+ * Copyright (c) 2020 Bull SAS. All rights reserved.
2223 * $COPYRIGHT$
2324 *
2425 * Additional copyrights may follow
3435#include "mpi.h"
3536#include "ompi/mca/coll/coll.h"
3637#include "ompi/communicator/communicator.h"
38+ #include "ompi/mca/coll/base/coll_base_dynamic_file.h"
39+ #include "opal/util/show_help.h"
3740
3841/*
3942 * Public string showing the coll ompi_libnbc component version number
@@ -44,61 +47,6 @@ const char *mca_coll_libnbc_component_version_string =
4447
4548static int libnbc_priority = 10 ;
4649static bool libnbc_in_progress = false; /* protect from recursive calls */
47- bool libnbc_ibcast_skip_dt_decision = true;
48-
49- int libnbc_iallgather_algorithm = 0 ; /* iallgather user forced algorithm */
50- static mca_base_var_enum_value_t iallgather_algorithms [] = {
51- {0 , "ignore" },
52- {1 , "linear" },
53- {2 , "recursive_doubling" },
54- {0 , NULL }
55- };
56-
57- int libnbc_iallreduce_algorithm = 0 ; /* iallreduce user forced algorithm */
58- static mca_base_var_enum_value_t iallreduce_algorithms [] = {
59- {0 , "ignore" },
60- {1 , "ring" },
61- {2 , "binomial" },
62- {3 , "rabenseifner" },
63- {4 , "recursive_doubling" },
64- {0 , NULL }
65- };
66-
67- int libnbc_ibcast_algorithm = 0 ; /* ibcast user forced algorithm */
68- int libnbc_ibcast_knomial_radix = 4 ;
69- static mca_base_var_enum_value_t ibcast_algorithms [] = {
70- {0 , "ignore" },
71- {1 , "linear" },
72- {2 , "binomial" },
73- {3 , "chain" },
74- {4 , "knomial" },
75- {0 , NULL }
76- };
77-
78- int libnbc_iexscan_algorithm = 0 ; /* iexscan user forced algorithm */
79- static mca_base_var_enum_value_t iexscan_algorithms [] = {
80- {0 , "ignore" },
81- {1 , "linear" },
82- {2 , "recursive_doubling" },
83- {0 , NULL }
84- };
85-
86- int libnbc_ireduce_algorithm = 0 ; /* ireduce user forced algorithm */
87- static mca_base_var_enum_value_t ireduce_algorithms [] = {
88- {0 , "ignore" },
89- {1 , "chain" },
90- {2 , "binomial" },
91- {3 , "rabenseifner" },
92- {0 , NULL }
93- };
94-
95- int libnbc_iscan_algorithm = 0 ; /* iscan user forced algorithm */
96- static mca_base_var_enum_value_t iscan_algorithms [] = {
97- {0 , "ignore" },
98- {1 , "linear" },
99- {2 , "recursive_doubling" },
100- {0 , NULL }
101- };
10250
10351static int libnbc_open (void );
10452static int libnbc_close (void );
@@ -145,6 +93,38 @@ static int
14593libnbc_open (void )
14694{
14795 int ret ;
96+ if (mca_coll_libnbc_component .dynamic_rules_verbose > 0 ) {
97+ mca_coll_libnbc_component .stream = opal_output_open (NULL );
98+ opal_output_set_verbosity (mca_coll_libnbc_component .stream , mca_coll_libnbc_component .dynamic_rules_verbose );
99+ } else {
100+ mca_coll_libnbc_component .stream = -1 ;
101+ }
102+ if (mca_coll_libnbc_component .dynamic_rules_filename ) {
103+ int rc ;
104+ opal_output_verbose (10 , mca_coll_libnbc_component .stream ,
105+ "coll:libnbc:component_open Reading collective rules file [%s] which format is %d" ,
106+ mca_coll_libnbc_component .dynamic_rules_filename ,
107+ mca_coll_libnbc_component .dynamic_rules_fileformat );
108+ rc = ompi_coll_base_read_rules_config_file ( mca_coll_libnbc_component .dynamic_rules_filename ,
109+ mca_coll_libnbc_component .dynamic_rules_fileformat ,
110+ & (mca_coll_libnbc_component .all_base_rules ), COLLCOUNT );
111+ if ( rc >= 0 ) {
112+ opal_output_verbose (10 , mca_coll_libnbc_component .stream ,"coll:libnbc:module_open Read %d valid rules\n" , rc );
113+ if (ompi_coll_base_framework .framework_verbose >= 50 ) {
114+ ompi_coll_base_dump_all_rules (mca_coll_libnbc_component .all_base_rules , COLLCOUNT );
115+ }
116+ } else {
117+ opal_output_verbose (1 , mca_coll_libnbc_component .stream ,"coll:libnbc:module_open Reading collective rules file failed\n" );
118+ char error_name [12 ];
119+ sprintf (error_name ,"file fail%1d" , rc );
120+ error_name [11 ] = '\0' ;
121+ opal_show_help ("help-mpi-coll-libnbc.txt" , (const char * )error_name , true,
122+ mca_coll_libnbc_component .dynamic_rules_filename , mca_coll_libnbc_component .dynamic_rules_fileformat );
123+ mca_coll_libnbc_component .all_base_rules = NULL ;
124+ }
125+ } else {
126+ mca_coll_libnbc_component .all_base_rules = NULL ;
127+ }
148128
149129 OBJ_CONSTRUCT (& mca_coll_libnbc_component .requests , opal_free_list_t );
150130 OBJ_CONSTRUCT (& mca_coll_libnbc_component .active_requests , opal_list_t );
@@ -173,6 +153,14 @@ libnbc_close(void)
173153 OBJ_DESTRUCT (& mca_coll_libnbc_component .active_requests );
174154 OBJ_DESTRUCT (& mca_coll_libnbc_component .lock );
175155
156+ if ( NULL != mca_coll_libnbc_component .all_base_rules ) {
157+ ompi_coll_base_free_all_rules (mca_coll_libnbc_component .all_base_rules , COLLCOUNT );
158+ mca_coll_libnbc_component .all_base_rules = NULL ;
159+ }
160+ /* close stream */
161+ if (mca_coll_libnbc_component .stream >= 0 ) {
162+ opal_output_close (mca_coll_libnbc_component .stream );
163+ }
176164 return OMPI_SUCCESS ;
177165}
178166
@@ -191,94 +179,42 @@ libnbc_register(void)
191179 MCA_BASE_VAR_SCOPE_READONLY ,
192180 & libnbc_priority );
193181
194- /* ibcast decision function can make the wrong decision if a legal
195- * non-uniform data type signature is used. This has resulted in the
196- * collective operation failing, and possibly producing wrong answers.
197- * We are investigating a fix for this problem, but it is taking a while.
198- * https://github.com/open-mpi/ompi/issues/2256
199- * https://github.com/open-mpi/ompi/issues/1763
200- * As a result we are adding an MCA parameter to make a conservative
201- * decision to avoid this issue. If the user knows that their application
202- * does not use data types in this way, then they can set this parameter
203- * to get the old behavior. Once the issue is truely fixed, then this
204- * parameter can be removed.
205- */
206- libnbc_ibcast_skip_dt_decision = true;
207- (void ) mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
208- "ibcast_skip_dt_decision" ,
209- "In ibcast only use size of communicator to choose algorithm, exclude data type signature. Set to 'false' to use data type signature in decision. WARNING: If you set this to 'false' then your application should not use non-uniform data type signatures in calls to ibcast." ,
210- MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 ,
182+ mca_coll_libnbc_component .dynamic_rules_verbose = 0 ;
183+ (void ) mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version , "dynamic_rules_verbose" ,
184+ "Verbose level of the libnbc coll component regarding on dynamic rules."
185+ " Examples: 0: no verbose, 1: selection errors, 10: selection output" ,
186+ MCA_BASE_VAR_TYPE_INT , NULL , 0 , 0 ,
211187 OPAL_INFO_LVL_9 ,
212188 MCA_BASE_VAR_SCOPE_READONLY ,
213- & libnbc_ibcast_skip_dt_decision );
214-
215- libnbc_iallgather_algorithm = 0 ;
216- (void ) mca_base_var_enum_create ("coll_libnbc_iallgather_algorithms" , iallgather_algorithms , & new_enum );
217- mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
218- "iallgather_algorithm" ,
219- "Which iallgather algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling" ,
220- MCA_BASE_VAR_TYPE_INT , new_enum , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
221- OPAL_INFO_LVL_5 , MCA_BASE_VAR_SCOPE_ALL ,
222- & libnbc_iallgather_algorithm );
223- OBJ_RELEASE (new_enum );
224-
225- libnbc_iallreduce_algorithm = 0 ;
226- (void ) mca_base_var_enum_create ("coll_libnbc_iallreduce_algorithms" , iallreduce_algorithms , & new_enum );
227- mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
228- "iallreduce_algorithm" ,
229- "Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling" ,
230- MCA_BASE_VAR_TYPE_INT , new_enum , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
231- OPAL_INFO_LVL_5 , MCA_BASE_VAR_SCOPE_ALL ,
232- & libnbc_iallreduce_algorithm );
233- OBJ_RELEASE (new_enum );
234-
235- libnbc_ibcast_algorithm = 0 ;
236- (void ) mca_base_var_enum_create ("coll_libnbc_ibcast_algorithms" , ibcast_algorithms , & new_enum );
237- mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
238- "ibcast_algorithm" ,
239- "Which ibcast algorithm is used: 0 ignore, 1 linear, 2 binomial, 3 chain, 4 knomial" ,
240- MCA_BASE_VAR_TYPE_INT , new_enum , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
241- OPAL_INFO_LVL_5 , MCA_BASE_VAR_SCOPE_ALL ,
242- & libnbc_ibcast_algorithm );
243- OBJ_RELEASE (new_enum );
244-
245- libnbc_ibcast_knomial_radix = 4 ;
189+ & mca_coll_libnbc_component .dynamic_rules_verbose );
190+
191+ mca_coll_libnbc_component .dynamic_rules_filename = NULL ;
246192 (void ) mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
247- "ibcast_knomial_radix" , "k-nomial tree radix for the ibcast algorithm (radix > 1)" ,
193+ "dynamic_rules_filename" ,
194+ "Filename of configuration file that contains the dynamic (@runtime) decision function rules" ,
195+ MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 ,
196+ OPAL_INFO_LVL_6 ,
197+ MCA_BASE_VAR_SCOPE_READONLY ,
198+ & mca_coll_libnbc_component .dynamic_rules_filename );
199+
200+ mca_coll_libnbc_component .dynamic_rules_fileformat = 0 ;
201+ (void ) mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
202+ "dynamic_rules_fileformat" ,
203+ "Format of configuration file that contains the dynamic (@runtime) decision function rules. Accepted values are: 0 <comm_size, msg_size>, 1 <nodes_nb, comm_size, msg_size>" ,
248204 MCA_BASE_VAR_TYPE_INT , NULL , 0 , 0 ,
249- OPAL_INFO_LVL_9 ,
205+ OPAL_INFO_LVL_6 ,
250206 MCA_BASE_VAR_SCOPE_READONLY ,
251- & libnbc_ibcast_knomial_radix );
252-
253- libnbc_iexscan_algorithm = 0 ;
254- (void ) mca_base_var_enum_create ("coll_libnbc_iexscan_algorithms" , iexscan_algorithms , & new_enum );
255- mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
256- "iexscan_algorithm" ,
257- "Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling" ,
258- MCA_BASE_VAR_TYPE_INT , new_enum , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
259- OPAL_INFO_LVL_5 , MCA_BASE_VAR_SCOPE_ALL ,
260- & libnbc_iexscan_algorithm );
261- OBJ_RELEASE (new_enum );
262-
263- libnbc_ireduce_algorithm = 0 ;
264- (void ) mca_base_var_enum_create ("coll_libnbc_ireduce_algorithms" , ireduce_algorithms , & new_enum );
265- mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
266- "ireduce_algorithm" ,
267- "Which ireduce algorithm is used: 0 ignore, 1 chain, 2 binomial, 3 rabenseifner" ,
268- MCA_BASE_VAR_TYPE_INT , new_enum , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
269- OPAL_INFO_LVL_5 , MCA_BASE_VAR_SCOPE_ALL ,
270- & libnbc_ireduce_algorithm );
271- OBJ_RELEASE (new_enum );
272-
273- libnbc_iscan_algorithm = 0 ;
274- (void ) mca_base_var_enum_create ("coll_libnbc_iscan_algorithms" , iscan_algorithms , & new_enum );
275- mca_base_component_var_register (& mca_coll_libnbc_component .super .collm_version ,
276- "iscan_algorithm" ,
277- "Which iscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling" ,
278- MCA_BASE_VAR_TYPE_INT , new_enum , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
279- OPAL_INFO_LVL_5 , MCA_BASE_VAR_SCOPE_ALL ,
280- & libnbc_iscan_algorithm );
281- OBJ_RELEASE (new_enum );
207+ & mca_coll_libnbc_component .dynamic_rules_fileformat );
208+
209+ ompi_coll_libnbc_allgather_check_forced_init ();
210+ ompi_coll_libnbc_allreduce_check_forced_init ();
211+ ompi_coll_libnbc_alltoall_check_forced_init ();
212+ ompi_coll_libnbc_alltoallv_check_forced_init ();
213+ ompi_coll_libnbc_alltoallw_check_forced_init ();
214+ ompi_coll_libnbc_bcast_check_forced_init ();
215+ ompi_coll_libnbc_exscan_check_forced_init ();
216+ ompi_coll_libnbc_reduce_check_forced_init ();
217+ ompi_coll_libnbc_scan_check_forced_init ();
282218
283219 return OMPI_SUCCESS ;
284220}
@@ -417,6 +353,27 @@ static int
417353libnbc_module_enable (mca_coll_base_module_t * module ,
418354 struct ompi_communicator_t * comm )
419355{
356+ ompi_coll_libnbc_module_t * nbc_module = (ompi_coll_libnbc_module_t * ) module ;
357+ int i ;
358+ if (mca_coll_libnbc_component .all_base_rules ) {
359+ int size , nnodes ;
360+ /* Allocate the data that hangs off the communicator */
361+ if (OMPI_COMM_IS_INTER (comm )) {
362+ size = ompi_comm_remote_size (comm );
363+ } else {
364+ size = ompi_comm_size (comm );
365+ }
366+ /* Get the number of nodes in communicator */
367+ nnodes = ompi_coll_base_get_nnodes (comm );
368+ for (i = 0 ;i < COLLCOUNT ;i ++ ) {
369+ nbc_module -> com_rules [i ] = ompi_coll_base_get_com_rule_ptr (mca_coll_libnbc_component .all_base_rules ,
370+ i , nnodes , size );
371+ }
372+ } else {
373+ for (i = 0 ;i < COLLCOUNT ;i ++ ) {
374+ nbc_module -> com_rules [i ] = NULL ;
375+ }
376+ }
420377 /* All done */
421378 return OMPI_SUCCESS ;
422379}
0 commit comments