Skip to content

Commit d10522a

Browse files
committed
Set a hard limit on the TCP max fragment size.
Some OSes have hard-coded limits to prevent overflowing an int32_t. We can either detect this at configure time (which might be a nicer but incomplete solution), or always force the pipelined protocol over TCP. As this only affects data larger than roughly 2GB (2^31 - 1024 bytes), no performance penalty is to be expected. Signed-off-by: George Bosilca <[email protected]>
1 parent 866899e commit d10522a

File tree

2 files changed

+12
-3
lines changed

2 files changed

+12
-3
lines changed

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,12 @@ static int mca_btl_tcp_component_register(void)
318318
mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
319319
mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
320320
mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
321-
mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
321+
/* Some OSes have hard-coded limits on how many bytes can be handled
322+
* by each writev operation. Force a reasonable limit to prevent overflowing
323+
* a signed 32-bit integer (the limit comes from BSD and OS X). We subtract 1KB to
324+
* leave some room for our internal headers.
325+
*/
326+
mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
322327
mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
323328
mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
324329
MCA_BTL_FLAGS_SEND_INPLACE |
@@ -335,7 +340,11 @@ static int mca_btl_tcp_component_register(void)
335340

336341
mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
337342
&mca_btl_tcp_module.super);
338-
343+
if (mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size > ((1UL<<31) - 1024) ) {
344+
/* Assume a hard limit. A test in configure would be a better solution, but until then
345+
* kicking in the pipelined RDMA protocol for extremely large data is good enough. */
346+
mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
347+
}
339348
mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_disable_family);
340349

341350
return mca_btl_tcp_component_verify();

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
173173
num_vecs = frag->iov_cnt;
174174
#if MCA_BTL_TCP_ENDPOINT_CACHE
175175
if( 0 != btl_endpoint->endpoint_cache_length ) {
176-
ssize_t length;
176+
size_t length;
177177
/* It looks strange at first, but cnt has to be set to the full amount of data
178178
* available. After going through advance_iov_position we will use cnt to detect whether there
179179
* is still some data pending.

0 commit comments

Comments
 (0)