Skip to content

Commit 8da4605

Browse files
committed
btl/openib: immediately release the device when no port is allowed
Many thanks to Sergey Oblomov for reporting this issue and for the countless traces provided while troubleshooting it.

This is a one-off commit for the v4.0.x branch, since btl/openib has been removed from master.

Refs. #6137

Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent c58c774 commit 8da4605

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

opal/mca/btl/openib/btl_openib.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,7 +1045,7 @@ int mca_btl_openib_add_procs(
10451045
opal_bitmap_clear_all_bits(reachable);
10461046
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
10471047
true, opal_process_info.nodename,
1048-
ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num);
1048+
openib_btl->device_name, openib_btl->port_num);
10491049
return OPAL_SUCCESS;
10501050
}
10511051

@@ -1718,11 +1718,11 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
17181718
free(openib_btl->cpcs[i]);
17191719
}
17201720
free(openib_btl->cpcs);
1721-
}
17221721

1723-
/* Release device if there are no more users */
1724-
if(!(--openib_btl->device->btls)) {
1725-
OBJ_RELEASE(openib_btl->device);
1722+
/* Release device if there are no more users */
1723+
if(!(--openib_btl->device->allowed_btls)) {
1724+
OBJ_RELEASE(openib_btl->device);
1725+
}
17261726
}
17271727

17281728
if (NULL != openib_btl->qps) {

opal/mca/btl/openib/btl_openib.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ typedef struct mca_btl_openib_device_t {
392392
/* Whether this device supports eager RDMA */
393393
uint8_t use_eager_rdma;
394394
uint8_t btls; /** < number of btls using this device */
395+
uint8_t allowed_btls; /** < number of allowed btls using this device */
395396
opal_pointer_array_t *endpoints;
396397
opal_pointer_array_t *device_btls;
397398
uint16_t hp_cq_polls;
@@ -483,6 +484,7 @@ struct mca_btl_openib_module_t {
483484
uint8_t num_cpcs;
484485

485486
mca_btl_openib_device_t *device;
487+
char * device_name;
486488
uint8_t port_num; /**< ID of the PORT */
487489
uint16_t pkey_index;
488490
struct ibv_port_attr ib_port_attr;

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -648,9 +648,10 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
648648
sizeof(mca_btl_openib_module));
649649
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
650650
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
651-
openib_btl->device = device;
652651
openib_btl->port_num = (uint8_t) port_num;
653652
openib_btl->allowed = false;
653+
openib_btl->device = NULL;
654+
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
654655
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
655656
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
656657
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
@@ -784,6 +785,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
784785
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
785786
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
786787
openib_btl->device = device;
788+
openib_btl->device_name = NULL;
787789
openib_btl->port_num = (uint8_t) port_num;
788790
openib_btl->pkey_index = pkey_index;
789791
openib_btl->lid = lid;
@@ -904,6 +906,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
904906
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
905907
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
906908
++device->btls;
909+
++device->allowed_btls;
907910
++mca_btl_openib_component.ib_num_btls;
908911
++mca_btl_openib_component.ib_allowed_btls;
909912
if (-1 != mca_btl_openib_component.ib_max_btls &&
@@ -1933,7 +1936,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
19331936
if (ib_port_attr.active_mtu < device->mtu){
19341937
device->mtu = ib_port_attr.active_mtu;
19351938
}
1936-
if (mca_btl_openib_component.apm_ports && device->btls > 0) {
1939+
if (mca_btl_openib_component.apm_ports && device->allowed_btls > 0) {
19371940
init_apm_port(device, i, ib_port_attr.lid);
19381941
break;
19391942
}
@@ -1969,7 +1972,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
19691972

19701973
/* If we made a BTL, check APM status and return. Otherwise, fall
19711974
through and destroy everything */
1972-
if (device->btls > 0) {
1975+
if (device->allowed_btls > 0) {
19731976
/* if apm was enabled it should be > 1 */
19741977
if (1 == mca_btl_openib_component.apm_ports) {
19751978
opal_show_help("help-mpi-btl-openib.txt",
@@ -2290,6 +2293,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
22902293
good:
22912294
mca_btl_openib_component.devices_count++;
22922295
return OPAL_SUCCESS;
2296+
} else if (device->btls > 0) {
2297+
/* no port is allowed to be used by btl/openib,
2298+
* so release the device right away */
2299+
OBJ_RELEASE(device);
2300+
return OPAL_SUCCESS;
22932301
}
22942302

22952303
error:

0 commit comments

Comments
 (0)