Skip to content

Commit f9ffa85

Browse files
authored
Merge pull request #3079 from jsquyres/pr/v2.0.x/bosilca-fixes
v2.0.x: TCP BTL fixes
2 parents 9b222d8 + 970d8ae commit f9ffa85

File tree

3 files changed

+57
-11
lines changed

3 files changed

+57
-11
lines changed

opal/mca/btl/tcp/btl_tcp_proc.c

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1818
* reserved.
19-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
19+
* Copyright (c) 2015-2017 Cisco Systems, Inc. All rights reserved.
2020
* $COPYRIGHT$
2121
*
2222
* Additional copyrights may follow
@@ -41,6 +41,7 @@
4141
#include "opal/util/if.h"
4242
#include "opal/util/net.h"
4343
#include "opal/util/proc.h"
44+
#include "opal/util/show_help.h"
4445

4546
#include "btl_tcp.h"
4647
#include "btl_tcp_proc.h"
@@ -122,16 +123,18 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
122123
return btl_proc;
123124
}
124125

125-
do {
126+
do { /* This loop is only necessary so that we can break out of the serial code */
126127
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
127128
if(NULL == btl_proc) {
128129
rc = OPAL_ERR_OUT_OF_RESOURCE;
129130
break;
130131
}
131132

132-
btl_proc->proc_opal = proc;
133-
134-
OBJ_RETAIN(btl_proc->proc_opal);
133+
/* Retain the proc, but don't store the ref into the btl_proc just yet. This
134+
* provides a way to release the btl_proc in case of failure without having to
135+
* unlock the mutex.
136+
*/
137+
OBJ_RETAIN(proc);
135138

136139
/* lookup tcp parameters exported by this proc */
137140
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
@@ -181,12 +184,14 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
181184
} while (0);
182185

183186
if (OPAL_SUCCESS == rc) {
187+
btl_proc->proc_opal = proc; /* link with the proc */
184188
/* add to hash table of all proc instance. */
185189
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
186190
proc->proc_name, btl_proc);
187191
} else {
188192
if (btl_proc) {
189-
OBJ_RELEASE(btl_proc);
193+
OBJ_RELEASE(btl_proc); /* release the local proc */
194+
OBJ_RELEASE(proc); /* and the ref on the OMPI proc */
190195
btl_proc = NULL;
191196
}
192197
}
@@ -823,9 +828,38 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
823828
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
824829
return;
825830
}
826-
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
827831
/* No further use of this socket. Close it */
828832
CLOSE_THE_SOCKET(sd);
833+
{
834+
size_t len = 1024;
835+
char* addr_str = (char*)malloc(len);
836+
if( NULL != addr_str ) {
837+
memset(addr_str, 0, len);
838+
for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) {
839+
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
840+
if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
841+
continue;
842+
}
843+
844+
if (addr_str[0] != '\0') {
845+
strncat(addr_str, ", ", len);
846+
len -= 2;
847+
}
848+
strncat(addr_str, inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet,
849+
addr_str + 1024 - len, INET6_ADDRSTRLEN), len);
850+
len = 1024 - strlen(addr_str);
851+
}
852+
}
853+
opal_show_help("help-mpi-btl-tcp.txt", "dropped inbound connection",
854+
true, opal_process_info.nodename,
855+
getpid(),
856+
btl_proc->proc_opal->proc_hostname,
857+
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
858+
opal_net_get_hostname((struct sockaddr*)addr),
859+
addr_str);
860+
free(addr_str);
861+
}
862+
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
829863
}
830864

831865
/*

opal/mca/btl/tcp/help-mpi-btl-tcp.txt

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- text -*-
22
#
3-
# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
3+
# Copyright (c) 2009-2017 Cisco Systems, Inc. All rights reserved.
44
# Copyright (c) 2015-2016 The University of Tennessee and The University
55
# of Tennessee Research Foundation. All rights
66
# reserved.
@@ -85,3 +85,18 @@ or other external events.
8585
Local PID: %d
8686
Peer host: %s
8787
#
88+
[dropped inbound connection]
89+
Open MPI detected an inbound MPI TCP connection request from a peer
90+
that appears to be part of this MPI job (i.e., it identified itself as
91+
part of this Open MPI job), but it is from an IP address that is
92+
unexpected. This is highly unusual.
93+
94+
The inbound connection has been dropped, and the peer should simply
95+
try again with a different IP interface (i.e., the job should
96+
hopefully be able to continue).
97+
98+
Local host: %s
99+
Local PID: %d
100+
Peer hostname: %s (%s)
101+
Source IP of socket: %s
102+
Known IPs of peer: %s

opal/util/net.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,6 @@ opal_net_get_hostname(const struct sockaddr *addr)
385385
if(NULL == inet_ntop(AF_INET6, &((struct sockaddr_in6*) addr)->sin6_addr,
386386
name, NI_MAXHOST)) {
387387
opal_output(0, "opal_sockaddr2str failed with error code %d", errno);
388-
free(name);
389388
return NULL;
390389
}
391390
return name;
@@ -394,7 +393,6 @@ opal_net_get_hostname(const struct sockaddr *addr)
394393
#endif
395394
break;
396395
default:
397-
free(name);
398396
return NULL;
399397
}
400398

@@ -405,7 +403,6 @@ opal_net_get_hostname(const struct sockaddr *addr)
405403
int err = errno;
406404
opal_output (0, "opal_sockaddr2str failed:%s (return code %i)\n",
407405
gai_strerror(err), error);
408-
free (name);
409406
return NULL;
410407
}
411408
/* strip any trailing % data as it isn't pertinent */

0 commit comments

Comments
 (0)