@@ -184,7 +184,7 @@ static inline int alltoallv_sendrecv_w_direct_for_debugging(
184
184
have_completion = 0 ;
185
185
} else {
186
186
have_completion = 1 ;
187
- ompi_request_wait_any ( nreqs , requests , & jreq , MPI_STATUS_IGNORE );
187
+ rc = ompi_request_wait_any ( nreqs , requests , & jreq , MPI_STATUS_IGNORE );
188
188
}
189
189
int ii_send_req = jreq >= jfirst_sendreq ;
190
190
if (have_completion ) {
@@ -197,28 +197,41 @@ static inline int alltoallv_sendrecv_w_direct_for_debugging(
197
197
requests [jreq ] = & ompi_request_null .request ;
198
198
if (ii_send_req && jsends_posted < ntypes_send ) {
199
199
rc = ompi_datatype_create_contiguous ( 1 , (ompi_datatype_t * )send_types [jsends_posted ], & yuck_ompi_dtype_from_opal );
200
- ompi_datatype_commit (& yuck_ompi_dtype_from_opal );
201
- MCA_PML_CALL (isend
200
+ if (rc ) break ;
201
+ rc = ompi_datatype_commit (& yuck_ompi_dtype_from_opal );
202
+ if (rc ) break ;
203
+ rc = MCA_PML_CALL (isend
202
204
(send_from_addrs [jsends_posted ], (int )send_counts [jsends_posted ], yuck_ompi_dtype_from_opal , jrank_sendto ,
203
205
MCA_COLL_BASE_TAG_ALLTOALLV , MCA_PML_BASE_SEND_STANDARD ,
204
206
comm , & requests [jreq ]));
205
- ompi_datatype_destroy ( & yuck_ompi_dtype_from_opal );
207
+ if (rc ) break ;
208
+ rc = ompi_datatype_destroy ( & yuck_ompi_dtype_from_opal );
209
+ if (rc ) break ;
210
+
206
211
jsends_posted ++ ;
207
212
}
208
213
if (!ii_send_req && jrecvs_posted < ntypes_recv ) {
209
214
rc = ompi_datatype_create_contiguous ( 1 , (ompi_datatype_t * )recv_types [jrecvs_posted ], & yuck_ompi_dtype_from_opal );
210
- ompi_datatype_commit (& yuck_ompi_dtype_from_opal );
211
- MCA_PML_CALL (irecv
215
+ if (rc ) break ;
216
+ rc = ompi_datatype_commit (& yuck_ompi_dtype_from_opal );
217
+ if (rc ) break ;
218
+ rc = MCA_PML_CALL (irecv
212
219
(recv_to_addrs [jrecvs_posted ], (int )recv_counts [jrecvs_posted ], yuck_ompi_dtype_from_opal , jrank_recvfrom ,
213
220
MCA_COLL_BASE_TAG_ALLTOALLV ,
214
221
comm , & requests [jreq ]));
215
- ompi_datatype_destroy ( & yuck_ompi_dtype_from_opal );
222
+ if (rc ) break ;
223
+ rc = ompi_datatype_destroy ( & yuck_ompi_dtype_from_opal );
224
+ if (rc ) break ;
216
225
jrecvs_posted ++ ;
217
226
}
218
227
219
-
220
228
if (rc ) { break ; };
221
229
}
230
+ if (rc ) {
231
+ opal_output_verbose (1 , mca_coll_han_component .han_output ,
232
+ "Failed in alltoallv_sendrecv_w_direct_for_debugging: jloop=%d, rc=%d\n" ,
233
+ jloop ,rc );
234
+ }
222
235
return rc ;
223
236
}
224
237
@@ -250,10 +263,16 @@ static int alltoallv_sendrecv_w(
250
263
buf_items [jbuf ] = opal_free_list_get (& mca_coll_han_component .pack_buffers );
251
264
if (buf_items [jbuf ] == NULL ) {
252
265
nbufs = jbuf - 1 ;
253
- printf ("Uh-oh, not enough buffers: %d\n" ,nbufs );
266
+ opal_output_verbose (20 , mca_coll_han_component .han_output ,
267
+ "Uh-oh, not enough buffers: %d\n" ,nbufs );
254
268
break ;
255
269
}
256
270
}
271
+ if (nbufs < 2 ) {
272
+ opal_output_verbose (1 , mca_coll_han_component .han_output ,
273
+ "ERROR: Need at least 2 buffers from mca_coll_han_component.pack_buffers!" );
274
+ return MPI_ERR_NO_MEM ;
275
+ }
257
276
258
277
size_t nreqs = nbufs ;
259
278
int jreq ;
@@ -549,22 +568,26 @@ static int decide_to_use_smsc_alg(
549
568
OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
550
569
rc = opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor ,
551
570
& rdtype -> super , count_for_convertor , rbuf , 0 , & convertor );
571
+ if (rc ) goto cleanup1 ;
552
572
bufs_on_device = opal_convertor_on_device (& convertor );
553
573
need_bufs = opal_convertor_need_buffers (& convertor );
554
- rc |= opal_convertor_cleanup (& convertor );
555
- rc |= opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor ,
574
+ rc = opal_convertor_cleanup (& convertor );
575
+ if (rc ) goto cleanup1 ;
576
+ rc = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor ,
556
577
& sdtype -> super , count_for_convertor , sbuf , 0 , & convertor );
578
+ if (rc ) goto cleanup1 ;
557
579
bufs_on_device |= opal_convertor_on_device (& convertor );
558
580
opal_convertor_get_packed_size (& convertor , & packed_size_bytes );
559
581
for (int jrank = 0 ; jrank < comm_size ; jrank ++ ) {
560
582
avg_send_bytes += packed_size_bytes /count_for_convertor * ompi_count_array_get (scounts ,jrank );
561
583
}
562
584
need_bufs |= opal_convertor_need_buffers (& convertor );
563
585
564
- rc |= opal_convertor_cleanup (& convertor );
586
+ rc = opal_convertor_cleanup (& convertor );
587
+ cleanup1 :
565
588
OBJ_DESTRUCT ( & convertor );
566
-
567
589
if (rc != OMPI_SUCCESS ) { return rc ;}
590
+
568
591
avg_send_bytes = avg_send_bytes / comm_size ;
569
592
reduce_buf_input [0 ] = !!(bufs_on_device );
570
593
reduce_buf_input [1 ] = avg_send_bytes ;
@@ -613,12 +636,17 @@ int mca_coll_han_alltoallv_using_smsc(
613
636
struct ompi_communicator_t * comm ,
614
637
mca_coll_base_module_t * module )
615
638
{
639
+ int rc ;
640
+ void * * send_from_addrs = NULL ;
641
+ void * * recv_to_addrs = NULL ;
642
+ size_t * send_counts = NULL ;
643
+ size_t * recv_counts = NULL ;
644
+ opal_datatype_t * * send_types = NULL ;
645
+ opal_datatype_t * * recv_types = NULL ;
646
+ mca_coll_han_module_t * han_module = (mca_coll_han_module_t * )module ;
616
647
617
648
OPAL_OUTPUT_VERBOSE ((90 , mca_coll_han_component .han_output ,
618
649
"Entering mca_coll_han_alltoall_using_smsc\n" ));
619
- int rc ;
620
-
621
- mca_coll_han_module_t * han_module = (mca_coll_han_module_t * )module ;
622
650
623
651
if (!mca_smsc || !mca_smsc_base_has_feature (MCA_SMSC_FEATURE_CAN_MAP )) {
624
652
/* Assume all hosts take this path together :-\ */
@@ -654,13 +682,15 @@ int mca_coll_han_alltoallv_using_smsc(
654
682
comm , han_module -> previous_alltoallv_module );
655
683
}
656
684
657
- int w_rank = ompi_comm_rank (comm );
658
685
int w_size = ompi_comm_size (comm );
659
686
660
687
int use_smsc ;
661
688
rc = decide_to_use_smsc_alg (& use_smsc ,
662
689
sbuf , scounts , sdispls , sdtype , rbuf , rcounts , rdispls , rdtype , comm );
663
- if (rc != 0 ) { return rc ; }
690
+ if (rc != 0 ) {
691
+ opal_output_verbose (1 , mca_coll_han_component .han_output ,
692
+ "decide_to_use_smsc_alg failed during execution! rc=%d\n" , rc );
693
+ }
664
694
if (!use_smsc ) {
665
695
return han_module -> previous_alltoallv (sbuf , scounts , sdispls , sdtype , rbuf , rcounts , rdispls , rdtype ,
666
696
comm , han_module -> previous_alltoallv_module );
@@ -738,8 +768,8 @@ int mca_coll_han_alltoallv_using_smsc(
738
768
low_gather_out , sizeof (low_gather_in ), MPI_BYTE , low_comm ,
739
769
low_comm -> c_coll -> coll_allgather_module );
740
770
if (rc != 0 ) {
741
- OPAL_OUTPUT_VERBOSE (( 40 , mca_coll_han_component .han_output ,
742
- " Allgather failed with %d\n" ,rc ) );
771
+ opal_output_verbose ( 1 , mca_coll_han_component .han_output ,
772
+ "During mca_coll_han_alltoallv_using_smsc: Allgather failed with rc= %d\n" ,rc );
743
773
goto cleanup ;
744
774
}
745
775
@@ -806,12 +836,12 @@ int mca_coll_han_alltoallv_using_smsc(
806
836
peers [jrank ].recvtype = & peer_recv_types [jrank ];
807
837
}
808
838
809
- void * * send_from_addrs = malloc (sizeof (* send_from_addrs )* low_size );
810
- void * * recv_to_addrs = malloc (sizeof (* recv_to_addrs )* low_size );
811
- size_t * send_counts = malloc (sizeof (* send_counts )* low_size );
812
- size_t * recv_counts = malloc (sizeof (* recv_counts )* low_size );
813
- opal_datatype_t * * send_types = malloc (sizeof (* send_types )* low_size );
814
- opal_datatype_t * * recv_types = malloc (sizeof (* recv_types )* low_size );
839
+ send_from_addrs = malloc (sizeof (* send_from_addrs )* low_size );
840
+ recv_to_addrs = malloc (sizeof (* recv_to_addrs )* low_size );
841
+ send_counts = malloc (sizeof (* send_counts )* low_size );
842
+ recv_counts = malloc (sizeof (* recv_counts )* low_size );
843
+ send_types = malloc (sizeof (* send_types )* low_size );
844
+ recv_types = malloc (sizeof (* recv_types )* low_size );
815
845
816
846
/****
817
847
* Main exchange loop
@@ -828,6 +858,11 @@ int mca_coll_han_alltoallv_using_smsc(
828
858
ptrdiff_t peer_sextent ;
829
859
830
860
rc = opal_datatype_type_extent ( peers [jlow ].sendtype , & peer_sextent );
861
+ if (rc != 0 ) {
862
+ opal_output_verbose (1 , mca_coll_han_component .han_output ,
863
+ "opal_datatype_type_extent returned error code = %d during mca_coll_han_alltoallv_using_smsc!\n" ,rc );
864
+ goto cleanup ;
865
+ }
831
866
void * from_addr = (uint8_t * )peers [jlow ].sbuf + peers [jlow ].counts [jrank_sendto ].sdispl * peer_sextent ;
832
867
833
868
send_from_addrs [jlow ] = from_addr ;
@@ -847,20 +882,28 @@ send_types[jlow] = peers[jlow].sendtype;
847
882
send_from_addrs , send_counts , send_types , jrank_sendto , ntypes_send ,
848
883
recv_to_addrs , recv_counts , recv_types , jrank_recvfrom , ntypes_recv ,
849
884
comm );
850
- if (rc != 0 ) goto cleanup ;
851
- }
885
+ if (rc != 0 ) {
886
+ opal_output_verbose (1 , mca_coll_han_component .han_output ,
887
+ "alltoallv_sendrecv_w returned error code = %d!\n" ,rc );
888
+ goto cleanup ;
889
+ }
852
890
853
- free (send_from_addrs );
854
- free (recv_to_addrs );
855
- free (send_counts );
856
- free (recv_counts );
857
- free (send_types );
858
- free (recv_types );
859
891
892
+ }
860
893
rc = 0 ;
894
+
895
+ cleanup :
861
896
low_comm -> c_coll -> coll_barrier (low_comm , low_comm -> c_coll -> coll_barrier_module );
862
897
863
- cleanup :
898
+ if (send_from_addrs ) {
899
+ free (send_from_addrs );
900
+ free (recv_to_addrs );
901
+ free (send_counts );
902
+ free (recv_counts );
903
+ free (send_types );
904
+ free (recv_types );
905
+ }
906
+
864
907
for (int jlow = 0 ; jlow < low_size ; jlow ++ ) {
865
908
if (jlow != low_rank ) {
866
909
OBJ_DESTRUCT (& peer_send_types [jlow ]);
0 commit comments