Skip to content

Commit 1aa6fcc

Browse files
committed
Signed-off-by: Anandhi S Jayakumar <[email protected]>
Handle OPAL_MODEX_SEND/RECV generically for all OFI providers. Modified files: ../orte/mca/rml/ofi/rml_ofi.h, ../orte/mca/rml/ofi/rml_ofi_component.c, ../orte/mca/rml/ofi/rml_ofi_send.c
1 parent e34aa7b commit 1aa6fcc

File tree

3 files changed

+35
-126
lines changed

3 files changed

+35
-126
lines changed

orte/mca/rml/ofi/rml_ofi.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,8 @@ typedef struct orte_rml_ofi_module_t orte_rml_ofi_module_t;
160160

161161
typedef struct {
162162
opal_object_t super;
163-
void* socket_ep;
164-
size_t socket_ep_len;
165-
void* psmx_ep;
166-
size_t psmx_ep_len;
163+
void* ofi_ep;
164+
size_t ofi_ep_len;
167165
} orte_rml_ofi_peer_t;
168166
OBJ_CLASS_DECLARATION(orte_rml_ofi_peer_t);
169167

orte/mca/rml/ofi/rml_ofi_component.c

Lines changed: 14 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -714,22 +714,7 @@ static int rml_ofi_component_init(void)
714714
/* Register the ofi address of this peer with PMIX server only if it is a user process /
715715
* for daemons the set/get_contact_info is used to exchange this information */
716716
if (ORTE_PROC_IS_APP) {
717-
switch ( orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->addr_format)
718-
{
719-
case FI_SOCKADDR_IN :
720-
opal_output_verbose(20,orte_rml_base_framework.framework_output,
721-
"%s:%d In FI_SOCKADDR_IN. ",__FILE__,__LINE__);
722-
/* Address is of type sockaddr_in (IPv4) */
723-
opal_output_verbose(20,orte_rml_base_framework.framework_output,
724-
"%s sending Opal modex string for ofi_prov_it %d, epnamelen = %d ",
725-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),cur_ofi_prov,orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen);
726-
/*[debug] - print the sockaddr - port and s_addr */
727-
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name;
728-
opal_output_verbose(20,orte_rml_base_framework.framework_output,
729-
"%s port = 0x%x, InternetAddr = %s ",
730-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr));
731-
/*[end debug]*/
732-
asprintf(&pmix_key,"%s%d",OPAL_RML_OFI_FI_SOCKADDR_IN,cur_ofi_prov);
717+
asprintf(&pmix_key,"%s%d",orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->fabric_attr->prov_name,cur_ofi_prov);
733718
opal_output_verbose(25, orte_rml_base_framework.framework_output,
734719
"%s calling OPAL_MODEX_SEND_STRING key - %s ",
735720
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), pmix_key );
@@ -746,35 +731,7 @@ static int rml_ofi_component_init(void)
746731
/*abort this current transport, but check if next transport can be opened*/
747732
continue;
748733
}
749-
break;
750-
case FI_ADDR_PSMX :
751-
opal_output_verbose(20,orte_rml_base_framework.framework_output,
752-
"%s:%d In FI_ADDR_PSMX. ",__FILE__,__LINE__);
753-
/* Address is of type Intel proprietery PSMX */
754-
OPAL_MODEX_SEND_STRING( ret, OPAL_PMIX_GLOBAL,
755-
OPAL_RML_OFI_FI_ADDR_PSMX,orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name,
756-
orte_rml_ofi.ofi_prov[cur_ofi_prov].epnamelen);
757-
opal_output_verbose(20,orte_rml_base_framework.framework_output,
758-
"%s:%d Opal modex send completed for FI_ADDR_PSMX. ",__FILE__,__LINE__);
759-
if (ORTE_SUCCESS != ret) {
760-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
761-
"%s:%d: OPAL_MODEX_SEND failed: %s\n",
762-
__FILE__, __LINE__, fi_strerror(-ret));
763-
free_ofi_prov_resources(cur_ofi_prov);
764-
/*abort this current transport, but check if next transport can be opened*/
765-
continue;
766-
}
767-
break;
768-
default:
769-
opal_output_verbose(1,orte_rml_base_framework.framework_output,
770-
"%s:%d ERROR: Cannot register address, Unhandled addr_format - %d, ep_name - %s ",
771-
__FILE__,__LINE__,orte_rml_ofi.ofi_prov[cur_ofi_prov].fabric_info->addr_format,
772-
orte_rml_ofi.ofi_prov[cur_ofi_prov].ep_name);
773-
free_ofi_prov_resources(cur_ofi_prov);
774-
/*abort this current transport, but check if next transport can be opened*/
775-
continue;
776-
}
777-
}
734+
}
778735

779736
/**
780737
* Set the ANY_SRC address.
@@ -1046,18 +1003,16 @@ static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
10461003

10471004
static void pr_cons(orte_rml_ofi_peer_t *ptr)
10481005
{
1049-
ptr->socket_ep = NULL;
1050-
ptr->socket_ep_len = 0;
1051-
ptr->psmx_ep = NULL;
1052-
ptr->psmx_ep_len = 0;
1006+
ptr->ofi_ep = NULL;
1007+
ptr->ofi_ep_len = 0;
10531008
}
1009+
10541010
static void pr_des(orte_rml_ofi_peer_t *ptr)
10551011
{
1056-
if ( 0 < ptr->socket_ep_len)
1057-
free( ptr->socket_ep);
1058-
if ( NULL != ptr->psmx_ep )
1059-
free( ptr->psmx_ep);
1012+
if ( 0 < ptr->ofi_ep_len)
1013+
free( ptr->ofi_ep);
10601014
}
1015+
10611016
OBJ_CLASS_INSTANCE(orte_rml_ofi_peer_t,
10621017
opal_object_t,
10631018
pr_cons, pr_des);
@@ -1116,7 +1071,7 @@ static void ofi_set_contact_info (const char *uri)
11161071
*/
11171072
if (NULL == uri) {
11181073
opal_output(0, "%s: NULL URI", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1119-
ORTE_FORCED_TERMINATE(1);
1074+
/* [TODO] ORTE_FORCED_TERMINATE(1);*/
11201075
return;
11211076
}
11221077

@@ -1180,12 +1135,7 @@ static void process_uri( char *uri)
11801135
ui64, (void**)&pr) ||
11811136
NULL == pr) {
11821137
pr = OBJ_NEW(orte_rml_ofi_peer_t);
1183-
/* populate the peer object with the ofi addresses *
1184-
* [TBD] tot_reqd is hard-coded to 1 for test purpose, the logic needs to be changed *
1185-
* to send some ofi_prov specific information in string as identifier for the addr foll it *
1186-
* store entire string in hash table and then interpret ofi address based on ofi_prov info *
1187-
* either her or in send logic. The peer object need to be modified to hold ofi_prov info
1188-
* and ofi address in a list */
1138+
/* populate the peer object with the ofi addresses */
11891139
for(i=0; NULL != uris[i] && tot_found < tot_reqd; i++) {
11901140
ofiuri = strdup(uris[i]);
11911141
if (NULL == ofiuri) {
@@ -1197,11 +1147,11 @@ static void process_uri( char *uri)
11971147
/* Handle the OFI address types in the uri - OFIADDR(ofiaddr) */
11981148
if (0 == strncmp(ofiuri, OFIADDR, strlen(OFIADDR)) ) {
11991149
/* allocate and initialise the peer object to be inserted in hashtable */
1200-
pr->socket_ep_len = sizeof(struct sockaddr_in);
1150+
pr->ofi_ep_len = sizeof(struct sockaddr_in);
12011151
ep_sockaddr = malloc( sizeof ( struct sockaddr_in) );
12021152
/* ofiuri for socket provider is of format - ofi-socket:<sin_family,sin_addr,sin_port> */
12031153
convert_to_sockaddr(ofiuri, ep_sockaddr);
1204-
pr->socket_ep = (void *)ep_sockaddr;
1154+
pr->ofi_ep = (void *)ep_sockaddr;
12051155
tot_found++;
12061156
}
12071157
free( ofiuri);
@@ -1223,8 +1173,8 @@ static void process_uri( char *uri)
12231173
opal_output_verbose(15, orte_rml_base_framework.framework_output,
12241174
"%s: ofi sock address length = %zd ",
12251175
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1226-
pr->socket_ep_len);
1227-
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)pr->socket_ep;
1176+
pr->ofi_ep_len);
1177+
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*)pr->ofi_ep;
12281178
opal_output_verbose(15,orte_rml_base_framework.framework_output,
12291179
"%s OFI set_name() port = 0x%x, InternetAddr = %s ",
12301180
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),inet_ntoa(ep_sockaddr->sin_addr));
@@ -1261,7 +1211,6 @@ void convert_to_sockaddr( char *ofiuri, struct sockaddr_in* ep_sockaddr)
12611211
port = atoi( sin_port);
12621212
ep_sockaddr->sin_port = htons(port);
12631213
res = inet_aton(sin_addr,(struct in_addr *)&ep_sockaddr->sin_addr);
1264-
12651214
opal_output_verbose(10,orte_rml_base_framework.framework_output,
12661215
"%s OFI convert_to_sockaddr() port = 0x%x, InternetAddr = %s ",
12671216
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ntohs(ep_sockaddr->sin_port),

orte/mca/rml/ofi/rml_ofi_send.c

Lines changed: 19 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -384,64 +384,26 @@ static void send_msg(int fd, short args, void *cbdata)
384384
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
385385
ret = OPAL_SUCCESS;
386386
} else {
387-
switch ( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->addr_format)
388-
{
389-
case FI_SOCKADDR_IN :
390-
if (ORTE_PROC_IS_APP ) {
391-
asprintf(&pmix_key,"%s%d",OPAL_RML_OFI_FI_SOCKADDR_IN,ofi_prov_id);
392-
opal_output_verbose(10, orte_rml_base_framework.framework_output,
393-
"%s calling OPAL_MODEX_RECV_STRING peer - %s, key - %s ",
394-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer),pmix_key );
395-
396-
OPAL_MODEX_RECV_STRING(ret, pmix_key, peer , (char **) &dest_ep_name, &dest_ep_namelen);
397-
opal_output_verbose(10, orte_rml_base_framework.framework_output, "Returned from MODEX_RECV");
398-
free(pmix_key);
399-
} else {
400-
memcpy(&ui64, (char*)peer, sizeof(uint64_t));
401-
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
402-
ui64, (void**)&pr) || NULL == pr) {
403-
opal_output_verbose(2, orte_rml_base_framework.framework_output,
404-
"%s rml:ofi: Send failed to get peer OFI contact info ",
405-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
406-
387+
if (ORTE_PROC_IS_APP ) {
388+
asprintf(&pmix_key,"%s%d",orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name,ofi_prov_id);
389+
opal_output_verbose(10, orte_rml_base_framework.framework_output,
390+
"%s calling OPAL_MODEX_RECV_STRING peer - %s, key - %s ",
391+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer),pmix_key );
392+
OPAL_MODEX_RECV_STRING(ret, pmix_key, peer , (char **) &dest_ep_name, &dest_ep_namelen);
393+
opal_output_verbose(10, orte_rml_base_framework.framework_output, "Returned from MODEX_RECV");
394+
free(pmix_key);
395+
} else {
396+
memcpy(&ui64, (char*)peer, sizeof(uint64_t));
397+
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
398+
ui64, (void**)&pr) || NULL == pr) {
399+
opal_output_verbose(2, orte_rml_base_framework.framework_output,
400+
"%s rml:ofi: Send failed to get peer OFI contact info ",
401+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
407402
return;
408-
}
409-
dest_ep_name = pr->socket_ep;
410-
dest_ep_namelen = pr->socket_ep_len;
411-
ret = OPAL_SUCCESS;
412-
}
413-
/*print the sockaddr - port and s_addr */
414-
struct sockaddr_in* ep_sockaddr = (struct sockaddr_in*) dest_ep_name;
415-
opal_output_verbose(10,orte_rml_base_framework.framework_output,
416-
"%s obtained for peer %s port = 0x%x, InternetAddr = %s ",
417-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ORTE_NAME_PRINT(peer),ntohs(ep_sockaddr->sin_port),
418-
inet_ntoa(ep_sockaddr->sin_addr));
419-
break;
420-
case FI_ADDR_PSMX :
421-
if (ORTE_PROC_IS_APP ) {
422-
OPAL_MODEX_RECV_STRING(ret, OPAL_RML_OFI_FI_ADDR_PSMX, peer , (char **) &dest_ep_name, &dest_ep_namelen);
423-
} else {
424-
memcpy(&ui64, (char*)peer, sizeof(uint64_t));
425-
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_rml_ofi.peers,
426-
ui64, (void**)&pr) || NULL == pr) {
427-
opal_output_verbose(2, orte_rml_base_framework.framework_output,
428-
"%s rml:ofi: Send failed to get peer OFI contact info ",
429-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
430-
return;
431-
}
432-
dest_ep_name = pr->psmx_ep;
433-
dest_ep_namelen = pr->psmx_ep_len;
434-
ret = OPAL_SUCCESS;
435-
}
436-
break;
437-
default:
438-
/* we shouldn't be getting here as only above are supported and address sent
439-
* to PMIX (OPAL_MODEX_SEND) in orte_component_init() */
440-
opal_output_verbose(1, orte_rml_base_framework.framework_output,
441-
"%s Error: Unhandled address format type in ofi_send_msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
442-
snd->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
443-
ORTE_RML_SEND_COMPLETE(snd);
444-
return;
403+
}
404+
dest_ep_name = pr->ofi_ep;
405+
dest_ep_namelen = pr->ofi_ep_len;
406+
ret = OPAL_SUCCESS;
445407
}
446408
}
447409
opal_output_verbose(50, orte_rml_base_framework.framework_output,

0 commit comments

Comments
 (0)