Коммит
1bc366b374
@ -730,7 +730,10 @@ static void component_shutdown(void)
|
||||
while (OPAL_SUCCESS == rc) {
|
||||
if (NULL != peer) {
|
||||
OBJ_RELEASE(peer);
|
||||
opal_hash_table_set_value_uint64(&mca_oob_tcp_component.peers, key, NULL);
|
||||
rc = opal_hash_table_set_value_uint64(&mca_oob_tcp_component.peers, key, NULL);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
rc = opal_hash_table_get_next_key_uint64(&mca_oob_tcp_component.peers, &key,
|
||||
(void **) &peer, node, &node);
|
||||
@ -968,7 +971,10 @@ static int component_set_addr(orte_process_name_t *peer,
|
||||
if (ORTE_SUCCESS != (rc = parse_uri(af_family, host, ports, (struct sockaddr_storage*) &(maddr->addr)))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(maddr);
|
||||
opal_hash_table_set_value_uint64(&mca_oob_tcp_component.peers, ui64, NULL);
|
||||
rc = opal_hash_table_set_value_uint64(&mca_oob_tcp_component.peers, ui64, NULL);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
OBJ_RELEASE(pr);
|
||||
return ORTE_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
|
@ -87,7 +87,7 @@ static bool ofi_desired = false;
|
||||
bool user_override(void)
|
||||
{
|
||||
if( 0 == strcmp(initial_ofi_transports_supported, ofi_transports_supported ) )
|
||||
return false;
|
||||
return false;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
@ -939,9 +939,9 @@ int get_ofi_prov_id( opal_list_t *attributes)
|
||||
char *provider = NULL, *transport = NULL;
|
||||
char *ethernet="sockets", *fabric="psm2";
|
||||
struct fi_info *cur_fi;
|
||||
char *comp_attrib = NULL;
|
||||
char **comps;
|
||||
int i;
|
||||
char *comp_attrib = NULL;
|
||||
char **comps;
|
||||
int i;
|
||||
|
||||
/* check the list of attributes in below order
|
||||
* Attribute should have ORTE_RML_TRANSPORT_ATTRIB key
|
||||
@ -949,38 +949,41 @@ int get_ofi_prov_id( opal_list_t *attributes)
|
||||
* (or) ORTE_RML_OFI_PROV_NAME key with values "socket" or "OPA"
|
||||
* if both above attributes are missing return failure
|
||||
*/
|
||||
//if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_ATTRIB, (void**)&transport, OPAL_STRING) ) {
|
||||
//if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_ATTRIB, (void**)&transport, OPAL_STRING) ) {
|
||||
|
||||
if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
|
||||
NULL != comp_attrib) {
|
||||
comps = opal_argv_split(comp_attrib, ',');
|
||||
for (i=0; NULL != comps[i] && choice_made == false ; i++) {
|
||||
if (NULL != strstr(ofi_transports_supported, comps[i])) {
|
||||
if (0 == strcmp( comps[i], "ethernet")) {
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Opening conduit using OFI ethernet/sockets provider",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
opal_argv_free(comps);
|
||||
provider = ethernet;
|
||||
choose_fabric = false;
|
||||
choice_made = false; /* continue to see if fabric is requested */
|
||||
} else if ( 0 == strcmp ( comps[i], "fabric")) {
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Opening conduit using OFI fabric provider",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
opal_argv_free(comps);
|
||||
choose_fabric = true;
|
||||
provider = NULL;
|
||||
choice_made = true; /* fabric is highest priority so don't check for anymore */
|
||||
}
|
||||
}
|
||||
if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
|
||||
NULL != comp_attrib) {
|
||||
comps = opal_argv_split(comp_attrib, ',');
|
||||
for (i=0; NULL != comps[i] && choice_made == false ; i++) {
|
||||
if (NULL != strstr(ofi_transports_supported, comps[i])) {
|
||||
if (0 == strcmp( comps[i], "ethernet")) {
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Opening conduit using OFI ethernet/sockets provider",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
opal_argv_free(comps);
|
||||
provider = ethernet;
|
||||
choose_fabric = false;
|
||||
choice_made = false; /* continue to see if fabric is requested */
|
||||
} else if ( 0 == strcmp ( comps[i], "fabric")) {
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Opening conduit using OFI fabric provider",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
opal_argv_free(comps);
|
||||
choose_fabric = true;
|
||||
provider = NULL;
|
||||
choice_made = true; /* fabric is highest priority so don't check for anymore */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if from the transport we don't know which provider we want, then check for the ORTE_RML_OFI_PROV_NAME_ATTRIB */
|
||||
if ( NULL == provider) {
|
||||
orte_get_attribute(attributes, ORTE_RML_PROVIDER_ATTRIB, (void**)&provider, OPAL_STRING);
|
||||
if (!orte_get_attribute(attributes, ORTE_RML_PROVIDER_ATTRIB, (void**)&provider, OPAL_STRING)) {
|
||||
/* ensure it remains NULL */
|
||||
provider = NULL;
|
||||
}
|
||||
}
|
||||
/* either ethernet-sockets or specific is requested. Proceed to choose that provider */
|
||||
/* either ethernet-sockets or specific is requested. Proceed to choose that provider */
|
||||
if ( NULL != provider) {
|
||||
// loop the orte_rml_ofi.ofi_provs[] and find the provider name that matches
|
||||
for ( prov_num = 0; prov_num < orte_rml_ofi.ofi_prov_open_num && ofi_prov_id == RML_OFI_PROV_ID_INVALID ; prov_num++ ) {
|
||||
@ -990,24 +993,24 @@ int get_ofi_prov_id( opal_list_t *attributes)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),provider,cur_fi->fabric_attr->prov_name);
|
||||
if ( strcmp(provider,cur_fi->fabric_attr->prov_name) == 0) {
|
||||
ofi_prov_id = prov_num;
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Choosing provider %s",
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Choosing provider %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
cur_fi->fabric_attr->prov_name);
|
||||
}
|
||||
}
|
||||
} else if ( choose_fabric ) {
|
||||
// "fabric" is requested, choose the first fabric(non-ethernet) provider
|
||||
for ( prov_num = 0; prov_num < orte_rml_ofi.ofi_prov_open_num && ofi_prov_id == RML_OFI_PROV_ID_INVALID ; prov_num++ ) {
|
||||
cur_fi = orte_rml_ofi.ofi_prov[prov_num].fabric_info;
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s -choosing fabric -> comparing %s != %s ",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ethernet,cur_fi->fabric_attr->prov_name);
|
||||
if ( strcmp(ethernet, cur_fi->fabric_attr->prov_name) != 0) {
|
||||
ofi_prov_id = prov_num;
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Choosing fabric provider %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),cur_fi->fabric_attr->prov_name);
|
||||
cur_fi->fabric_attr->prov_name);
|
||||
}
|
||||
}
|
||||
} else if ( choose_fabric ) {
|
||||
// "fabric" is requested, choose the first fabric(non-ethernet) provider
|
||||
for ( prov_num = 0; prov_num < orte_rml_ofi.ofi_prov_open_num && ofi_prov_id == RML_OFI_PROV_ID_INVALID ; prov_num++ ) {
|
||||
cur_fi = orte_rml_ofi.ofi_prov[prov_num].fabric_info;
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s -choosing fabric -> comparing %s != %s ",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),ethernet,cur_fi->fabric_attr->prov_name);
|
||||
if ( strcmp(ethernet, cur_fi->fabric_attr->prov_name) != 0) {
|
||||
ofi_prov_id = prov_num;
|
||||
opal_output_verbose(20,orte_rml_base_framework.framework_output,
|
||||
"%s - Choosing fabric provider %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),cur_fi->fabric_attr->prov_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1165,7 +1168,7 @@ static void pr_cons(orte_rml_ofi_peer_t *ptr)
|
||||
static void pr_des(orte_rml_ofi_peer_t *ptr)
|
||||
{
|
||||
if ( NULL != ptr->ofi_prov_name)
|
||||
free(ptr->ofi_prov_name);
|
||||
free(ptr->ofi_prov_name);
|
||||
if ( 0 < ptr->ofi_ep_len)
|
||||
free( ptr->ofi_ep);
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
@ -369,7 +370,7 @@ int orte_rml_ofi_recv_handler(struct fi_cq_data_entry *wc, uint8_t ofi_prov_id)
|
||||
/* populate_peer_ofi_addr
|
||||
* [Desc] This fn does a PMIx Modex recv on "rml.ofi" key
|
||||
* to get the ofi address blob of all providers on the peer.
|
||||
* Then it populates the array parameter peer_ofi_addr[]
|
||||
* Then it populates the array parameter peer_ofi_addr[]
|
||||
* with providername, ofi_ep_name and ofi_ep_namelen
|
||||
* [in] peer -> peer address
|
||||
* [out] peer_ofi_addr[] -> array to hold the provider details on the peer
|
||||
@ -451,18 +452,18 @@ static int populate_peer_ofi_addr(orte_process_name_t *peer, orte_rml_ofi_peer_t
|
||||
}
|
||||
|
||||
|
||||
/* check_provider_in_peer(prov_name, peer_ofi_addr)
|
||||
/* check_provider_in_peer(prov_name, peer_ofi_addr)
|
||||
* [Desc] This fn checks for a match of prov_name in the peer_ofi_addr array
|
||||
* and returns the index of the match or OPAL_ERROR if not found.
|
||||
* The peer_ofi_addr array has all the ofi providers in peer.
|
||||
* [in] prov_name -> The provider name we want to use to send this message to peer.
|
||||
* [in] tot_prov -> total provider entries in array
|
||||
* [in] peer_ofi_addr[] -> array of provider details on the peer
|
||||
* [in] local_ofi_prov_idx -> the index of local provider we are comparing with
|
||||
* [in] local_ofi_prov_idx -> the index of local provider we are comparing with
|
||||
* (index into orte_rml_ofi.ofi_prov[] array.
|
||||
* [Return value] -> index that matches provider on success. OPAL_ERROR if no match found.
|
||||
*/
|
||||
static int check_provider_in_peer( char *prov_name, int tot_prov, orte_rml_ofi_peer_t *peer_ofi_addr, int local_ofi_prov_idx )
|
||||
static int check_provider_in_peer( char *prov_name, int tot_prov, orte_rml_ofi_peer_t *peer_ofi_addr, int local_ofi_prov_idx )
|
||||
{
|
||||
int idx;
|
||||
int ret = OPAL_ERROR;
|
||||
@ -495,7 +496,7 @@ static int check_provider_in_peer( char *prov_name, int tot_prov, orte_rml_ofi_p
|
||||
} else {
|
||||
ret = idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
@ -519,7 +520,7 @@ static void send_msg(int fd, short args, void *cbdata)
|
||||
orte_rml_ofi_peer_t* pr;
|
||||
uint64_t ui64;
|
||||
struct sockaddr_in* ep_sockaddr;
|
||||
|
||||
|
||||
snd = OBJ_NEW(orte_rml_send_t);
|
||||
snd->dst = *peer;
|
||||
snd->origin = *ORTE_PROC_MY_NAME;
|
||||
@ -565,19 +566,19 @@ static void send_msg(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
/* decide the provider we want to use from the list of providers in peer as per below order.
|
||||
* 1. if the user specified the transport for this conduit (even giving us a prioritized list of candidates),
|
||||
* then the one we selected is the _only_ one we will use. If the remote peer has a matching endpoint,
|
||||
* 1. if the user specified the transport for this conduit (even giving us a prioritized list of candidates),
|
||||
* then the one we selected is the _only_ one we will use. If the remote peer has a matching endpoint,
|
||||
* then we use it - otherwise, we error out
|
||||
* 2. if the user did not specify a transport, then we look for matches against _all_ of
|
||||
* 2. if the user did not specify a transport, then we look for matches against _all_ of
|
||||
* our available transports, starting with fabric and then going to Ethernet, taking the first one that matches.
|
||||
* 3. if we cannot find any match, then we error out
|
||||
*/
|
||||
if ( true == user_override() ) {
|
||||
/*case 1. User has specified the provider, find a match in peer for the current selected provider or error out*/
|
||||
/*case 1. User has specified the provider, find a match in peer for the current selected provider or error out*/
|
||||
opal_output_verbose(1, orte_rml_base_framework.framework_output,
|
||||
"%s rml:ofi::send_msg() Case1. looking for a match for current provider",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
if( OPAL_ERROR == ( peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name,
|
||||
if( OPAL_ERROR == ( peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[ofi_prov_id].fabric_info->fabric_attr->prov_name,
|
||||
tot_peer_prov, peer_ofi_addr, ofi_prov_id ) )) {
|
||||
opal_output_verbose(1, orte_rml_base_framework.framework_output,
|
||||
"%s rml:ofi::send_msg() Peer is Unreachable - no common ofi provider ",
|
||||
@ -595,8 +596,8 @@ static void send_msg(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
for(int cur_prov_id=0; cur_prov_id < orte_rml_ofi.ofi_prov_open_num && !peer_match_found ; cur_prov_id++) {
|
||||
if( 0 != strcmp( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, "sockets" ) ) {
|
||||
peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name,
|
||||
tot_peer_prov, peer_ofi_addr, cur_prov_id );
|
||||
peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name,
|
||||
tot_peer_prov, peer_ofi_addr, cur_prov_id );
|
||||
if (OPAL_ERROR != peer_prov_id) {
|
||||
peer_match_found = true;
|
||||
ofi_prov_id = cur_prov_id;
|
||||
@ -609,7 +610,7 @@ static void send_msg(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
for(int cur_prov_id=0; cur_prov_id < orte_rml_ofi.ofi_prov_open_num && !peer_match_found ; cur_prov_id++) {
|
||||
if( 0 == strcmp( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name, "sockets" ) ) {
|
||||
peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name,
|
||||
peer_prov_id = check_provider_in_peer( orte_rml_ofi.ofi_prov[cur_prov_id].fabric_info->fabric_attr->prov_name,
|
||||
tot_peer_prov, peer_ofi_addr, cur_prov_id );
|
||||
if (OPAL_ERROR != peer_prov_id) {
|
||||
peer_match_found = true;
|
||||
@ -628,15 +629,15 @@ static void send_msg(int fd, short args, void *cbdata)
|
||||
return ;
|
||||
}
|
||||
}
|
||||
/* creating a copy of the chosen provider to put it in hashtable
|
||||
* as the ofi_peer_addr array is local */
|
||||
/* creating a copy of the chosen provider to put it in hashtable
|
||||
* as the ofi_peer_addr array is local */
|
||||
pr = OBJ_NEW(orte_rml_ofi_peer_t);
|
||||
pr->ofi_ep_len = peer_ofi_addr[peer_prov_id].ofi_ep_len;
|
||||
pr->ofi_ep = malloc(pr->ofi_ep_len);
|
||||
memcpy(pr->ofi_ep,peer_ofi_addr[peer_prov_id].ofi_ep,pr->ofi_ep_len);
|
||||
pr->ofi_prov_name = strdup(peer_ofi_addr[peer_prov_id].ofi_prov_name);
|
||||
pr->src_prov_id = ofi_prov_id;
|
||||
if(OPAL_SUCCESS !=
|
||||
if(OPAL_SUCCESS !=
|
||||
(rc = opal_hash_table_set_value_uint64(&orte_rml_ofi.peers, ui64, (void*)pr))) {
|
||||
opal_output_verbose(15, orte_rml_base_framework.framework_output,
|
||||
"%s: ofi address insertion into hash table failed for peer %s ",
|
||||
@ -653,7 +654,7 @@ static void send_msg(int fd, short args, void *cbdata)
|
||||
opal_output_verbose(1, orte_rml_base_framework.framework_output,
|
||||
"%s rml:ofi: OFI peer contact info got from hash table",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
dest_ep_name = pr->ofi_ep;
|
||||
dest_ep_name = pr->ofi_ep;
|
||||
dest_ep_namelen = pr->ofi_ep_len;
|
||||
ofi_prov_id = pr->src_prov_id;
|
||||
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user