Skip to content

Commit cde0e6f

Browse files
committed
Set a hard limit on the TCP max fragment size.
Some OSes have hardcoded limits to prevent overflowing an int32_t. We can either detect this at configure time (which might be a nicer but incomplete solution), or always force the pipelined protocol over TCP. As this only affects data larger than 1GB, no performance penalty is to be expected. Signed-off-by: George Bosilca <[email protected]>
1 parent 252281f commit cde0e6f

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,9 @@ static int mca_btl_tcp_component_register(void)
303303
mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
304304
mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
305305
mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
306-
mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
306+
/* Some OSes have hard-coded limits on how many bytes can be handled by a single writev operation.
307+
* Force a reasonable limit, to prevent overflowing a 32-bit integer (limit comes from BSD and OS X) */
308+
mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
307309
mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
308310
mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
309311
MCA_BTL_FLAGS_SEND_INPLACE |
@@ -320,7 +322,11 @@ static int mca_btl_tcp_component_register(void)
320322

321323
mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
322324
&mca_btl_tcp_module.super);
323-
325+
if (mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size > ((1UL<<31) - 1024) ) {
326+
/* Assume a hard limit. A test in configure would be a better solution, but until then
327+
* kicking in the pipelined RDMA protocol for extremely large data is good enough. */
328+
mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
329+
}
324330
mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_disable_family);
325331

326332
return mca_btl_tcp_component_verify();

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
173173
num_vecs = frag->iov_cnt;
174174
#if MCA_BTL_TCP_ENDPOINT_CACHE
175175
if( 0 != btl_endpoint->endpoint_cache_length ) {
176-
ssize_t length;
176+
size_t length;
177177
/* It looks strange at first, but cnt has to be set to the full amount of data
178178
* available. After going to advance_iov_position we will use cnt to detect if there
179179
* is still some data pending.

0 commit comments

Comments
 (0)