Skip to content

Commit 7515dc9

Browse files
author
rhc54
authored
Merge pull request open-mpi#11 from anandhis/topic/rmlofi
Addressed pull-request comments from Jsquyres
2 parents 591482a + f16fdfb commit 7515dc9

File tree

3 files changed

+39
-24
lines changed

3 files changed

+39
-24
lines changed

orte/mca/rml/ofi/rml_ofi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929

3030
/** the maximum open conduit - assuming system will have no more than 20 transports*/
31-
#define MAX_CONDUIT 20
31+
#define MAX_CONDUIT 40
3232

3333
/** The OPAL key values **/
3434
/* (char*) ofi socket address (type IN) of the node process is running on */

orte/mca/rml/ofi/rml_ofi_component.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ rml_ofi_component_init(int* priority)
656656
* (fi_info_list) and store it in the ofi_conduits array **/
657657
orte_rml_ofi.conduit_open_num = 0;
658658
for( fabric_info = orte_rml_ofi.fi_info_list ;
659-
NULL != fabric_info; fabric_info = fabric_info->next)
659+
NULL != fabric_info && orte_rml_ofi.conduit_open_num < MAX_CONDUIT ; fabric_info = fabric_info->next)
660660
{
661661
opal_output_verbose(100,orte_rml_base_framework.framework_output,
662662
"%s:%d beginning to add endpoint for conduit_id=%d ",__FILE__,__LINE__,orte_rml_ofi.conduit_open_num);
@@ -839,8 +839,8 @@ rml_ofi_component_init(int* priority)
839839
/*[debug] - print the sockaddr - port and s_addr */
840840
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)orte_rml_ofi.ofi_conduits[cur_conduit].ep_name;
841841
opal_output_verbose(1,orte_rml_base_framework.framework_output,
842-
"%s port = 0x%x, InternetAddr = 0x%x ",
843-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ep_sockaddr->sin_port,ep_sockaddr->sin_addr.s_addr);
842+
"%s port = 0x%x, InternetAddr = %s ",
843+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr));
844844
/*[end debug]*/
845845
OPAL_MODEX_SEND_STRING( ret, OPAL_PMIX_GLOBAL,
846846
OPAL_RML_OFI_FI_SOCKADDR_IN,
@@ -964,6 +964,11 @@ rml_ofi_component_init(int* priority)
964964
"%s:%d Conduit id - %d created ",__FILE__,__LINE__,orte_rml_ofi.conduit_open_num);
965965
orte_rml_ofi.conduit_open_num++;
966966
}
967+
if (fabric_info != NULL && orte_rml_ofi.conduit_open_num >= MAX_CONDUIT ) {
968+
opal_output_verbose(1,orte_rml_base_framework.framework_output,
969+
"%s:%d fi_getinfo list not fully parsed as MAX_CONDUIT - %d reached ",__FILE__,__LINE__,orte_rml_ofi.conduit_open_num);
970+
}
971+
967972

968973
}
969974
/**

orte/mca/rml/ofi/rml_ofi_send.c

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -327,19 +327,20 @@ int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t conduit_id)
327327
static void send_msg(int fd, short args, void *cbdata)
328328
{
329329
orte_rml_send_request_t *req = (orte_rml_send_request_t*)cbdata;
330-
orte_process_name_t *peer = &(req->send.dst);
331-
orte_rml_tag_t tag = req->send.tag;
332-
char *dest_ep_name;
333-
size_t dest_ep_namelen = 0;
334-
int ret = OPAL_ERROR;
330+
orte_process_name_t *peer = &(req->send.dst);
331+
orte_rml_tag_t tag = req->send.tag;
332+
char *dest_ep_name;
333+
size_t dest_ep_namelen = 0;
334+
int ret = OPAL_ERROR;
335335
uint32_t total_packets;
336-
fi_addr_t dest_fi_addr;
337-
orte_rml_send_t *snd;
338-
orte_rml_ofi_request_t* ofi_send_req = OBJ_NEW( orte_rml_ofi_request_t );
339-
uint8_t conduit_id = req->conduit_id;
336+
fi_addr_t dest_fi_addr;
337+
orte_rml_send_t *snd;
338+
orte_rml_ofi_request_t* ofi_send_req = OBJ_NEW( orte_rml_ofi_request_t );
339+
uint8_t conduit_id = req->conduit_id;
340340
orte_rml_ofi_send_pkt_t* ofi_msg_pkt;
341341
size_t datalen_per_pkt, hdrsize, data_in_pkt; // the length of data in per packet excluding the header size
342342

343+
343344
snd = OBJ_NEW(orte_rml_send_t);
344345
snd->dst = *peer;
345346
snd->origin = *ORTE_PROC_MY_NAME;
@@ -367,20 +368,29 @@ static void send_msg(int fd, short args, void *cbdata)
367368
{
368369
case FI_SOCKADDR_IN :
369370
OPAL_MODEX_RECV_STRING(ret, OPAL_RML_OFI_FI_SOCKADDR_IN, peer , (char **) &dest_ep_name, &dest_ep_namelen);
371+
/*print the sockaddr - port and s_addr */
372+
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*) dest_ep_name;
373+
opal_output_verbose(10,orte_rml_base_framework.framework_output,
374+
"%s obtained for peer %s port = 0x%x, InternetAddr = %s ",
375+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer),ntohs(ep_sockaddr->sin_port),
376+
inet_ntoa(ep_sockaddr->sin_addr));
370377
break;
371378
case FI_ADDR_PSMX :
372379
OPAL_MODEX_RECV_STRING(ret, OPAL_RML_OFI_FI_ADDR_PSMX, peer , (char **) &dest_ep_name, &dest_ep_namelen);
373380
break;
381+
default:
382+
/* we shouldn't be getting here as only above are supported and address sent
383+
* to PMIX (OPAL_MODEX_SEND) in orte_component_init() */
384+
opal_output_verbose(1, orte_rml_base_framework.framework_output,
385+
"%s Error: Unhandled address format type in ofi_send_msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
386+
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
387+
ORTE_RML_SEND_COMPLETE(snd);
388+
return;
374389
}
375-
opal_output_verbose(10, orte_rml_base_framework.framework_output,
390+
opal_output_verbose(50, orte_rml_base_framework.framework_output,
376391
"%s Return value from OPAL_MODEX_RECV_STRING - %d, length returned - %d",
377392
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret, dest_ep_namelen);
378-
/*print the sockaddr - port and s_addr */
379-
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*) dest_ep_name;
380-
opal_output_verbose(1,orte_rml_base_framework.framework_output,
381-
"%s obtained for peer %s port = 0x%x, InternetAddr = 0x%x ",
382-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer),ep_sockaddr->sin_port,
383-
ep_sockaddr->sin_addr.s_addr);
393+
384394

385395
if ( OPAL_SUCCESS == ret) {
386396
opal_output_verbose(10, orte_rml_base_framework.framework_output,
@@ -395,8 +405,8 @@ static void send_msg(int fd, short args, void *cbdata)
395405
/* call the send-callback fn with error and return, also return failure status */
396406
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
397407
ORTE_RML_SEND_COMPLETE(snd);
398-
snd = NULL;
399408
//OBJ_RELEASE( ofi_send_req);
409+
return;
400410
}
401411

402412
} else {
@@ -408,8 +418,8 @@ static void send_msg(int fd, short args, void *cbdata)
408418
/* call the send-callback fn with error and return, also return failure status */
409419
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
410420
ORTE_RML_SEND_COMPLETE(snd);
411-
snd = NULL;
412421
//OBJ_RELEASE( ofi_send_req);
422+
return;
413423
}
414424

415425
ofi_send_req->send = snd;
@@ -562,7 +572,7 @@ int orte_rml_ofi_send_transport_nb(int conduit_id,
562572
ORTE_NAME_PRINT(peer), tag);
563573

564574

565-
if( (0 > conduit_id) || ( conduit_id > orte_rml_ofi.conduit_open_num ) ) {
575+
if( (0 > conduit_id) || ( conduit_id >= orte_rml_ofi.conduit_open_num ) ) {
566576
/* Invalid conduit ID provided */
567577
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
568578
return ORTE_ERR_BAD_PARAM;
@@ -614,7 +624,7 @@ int orte_rml_ofi_send_buffer_transport_nb(int conduit_id,
614624
ORTE_NAME_PRINT(peer), tag);
615625

616626

617-
if( (0 > conduit_id) || ( conduit_id > orte_rml_ofi.conduit_open_num ) ) {
627+
if( (0 > conduit_id) || ( conduit_id >= orte_rml_ofi.conduit_open_num ) ) {
618628
/* Invalid conduit ID provided */
619629
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
620630
return ORTE_ERR_BAD_PARAM;

0 commit comments

Comments
 (0)