2004-10-25 17:33:30 +04:00
|
|
|
/*
|
2004-11-22 04:38:40 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
|
|
* All rights reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-10-25 17:33:30 +04:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "include/sys/atomic.h"
|
|
|
|
#include "class/ompi_proc_table.h"
|
2005-07-03 04:52:18 +04:00
|
|
|
#include "mca/pml/base/pml_base_module_exchange.h"
|
2004-10-25 17:33:30 +04:00
|
|
|
#include "ptl_mx.h"
|
|
|
|
#include "ptl_mx_peer.h"
|
|
|
|
#include "ptl_mx_proc.h"
|
|
|
|
|
|
|
|
|
|
|
|
static void mca_ptl_mx_proc_construct(mca_ptl_mx_proc_t* proc);
|
|
|
|
static void mca_ptl_mx_proc_destruct(mca_ptl_mx_proc_t* proc);
|
|
|
|
|
|
|
|
OBJ_CLASS_INSTANCE(
|
|
|
|
mca_ptl_mx_proc_t,
|
2005-07-03 20:22:16 +04:00
|
|
|
opal_list_item_t,
|
2004-10-25 17:33:30 +04:00
|
|
|
mca_ptl_mx_proc_construct,
|
|
|
|
mca_ptl_mx_proc_destruct
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initialize mx proc instance
|
|
|
|
*/
|
|
|
|
|
|
|
|
void mca_ptl_mx_proc_construct(mca_ptl_mx_proc_t* proc)
|
|
|
|
{
|
|
|
|
proc->proc_ompi = NULL;
|
|
|
|
proc->proc_addrs = NULL;
|
|
|
|
proc->proc_addr_count = 0;
|
|
|
|
proc->proc_peers = NULL;
|
|
|
|
proc->proc_peer_count = 0;
|
2005-07-04 02:45:48 +04:00
|
|
|
OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
|
2004-10-25 17:33:30 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cleanup mx proc instance
|
|
|
|
*/
|
|
|
|
|
|
|
|
void mca_ptl_mx_proc_destruct(mca_ptl_mx_proc_t* proc)
|
|
|
|
{
|
|
|
|
/* remove from list of all proc instances */
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_ptl_mx_component.mx_lock);
|
2005-07-03 20:52:32 +04:00
|
|
|
opal_hash_table_remove_proc(&mca_ptl_mx_component.mx_procs, &proc->proc_name);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_ptl_mx_component.mx_lock);
|
2004-10-25 17:33:30 +04:00
|
|
|
|
|
|
|
/* release resources */
|
|
|
|
if(NULL != proc->proc_peers)
|
|
|
|
free(proc->proc_peers);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a MX process structure. There is a one-to-one correspondence
|
|
|
|
* between a ompi_proc_t and a mca_ptl_mx_proc_t instance. We cache additional
|
|
|
|
* data (specifically the list of mca_ptl_mx_peer_t instances, and publiched
|
|
|
|
* addresses) associated w/ a given destination on this datastructure.
|
|
|
|
*/
|
|
|
|
|
|
|
|
mca_ptl_mx_proc_t* mca_ptl_mx_proc_create(ompi_proc_t* ompi_proc)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
size_t size;
|
|
|
|
mca_ptl_mx_proc_t* ptl_proc;
|
|
|
|
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_ptl_mx_component.mx_lock);
|
2005-07-03 20:52:32 +04:00
|
|
|
ptl_proc = (mca_ptl_mx_proc_t*)opal_hash_table_get_proc(
|
2004-10-25 17:33:30 +04:00
|
|
|
&mca_ptl_mx_component.mx_procs, &ompi_proc->proc_name);
|
|
|
|
if(NULL != ptl_proc) {
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_ptl_mx_component.mx_lock);
|
2004-10-25 17:33:30 +04:00
|
|
|
return ptl_proc;
|
|
|
|
}
|
|
|
|
|
|
|
|
ptl_proc = OBJ_NEW(mca_ptl_mx_proc_t);
|
|
|
|
if(NULL == ptl_proc)
|
|
|
|
return NULL;
|
|
|
|
ptl_proc->proc_ompi = ompi_proc;
|
|
|
|
ptl_proc->proc_name = ompi_proc->proc_name;
|
|
|
|
|
|
|
|
/* add to hash table of all proc instance */
|
2005-07-03 20:52:32 +04:00
|
|
|
opal_hash_table_set_proc(
|
2004-10-25 17:33:30 +04:00
|
|
|
&mca_ptl_mx_component.mx_procs,
|
|
|
|
&ptl_proc->proc_name,
|
|
|
|
ptl_proc);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_ptl_mx_component.mx_lock);
|
2004-10-25 17:33:30 +04:00
|
|
|
|
|
|
|
/* lookup mx parameters exported by this proc */
|
|
|
|
rc = mca_base_modex_recv(
|
|
|
|
&mca_ptl_mx_component.super.ptlm_version,
|
|
|
|
ompi_proc,
|
|
|
|
(void**)&ptl_proc->proc_addrs,
|
|
|
|
&size);
|
|
|
|
if(rc != OMPI_SUCCESS) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_ptl_mx_proc_create: mca_base_modex_recv: failed with return value=%d", rc);
|
2004-10-25 17:33:30 +04:00
|
|
|
OBJ_RELEASE(ptl_proc);
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-07-16 03:38:38 +04:00
|
|
|
if(0 != (size % sizeof(mca_ptl_mx_endpoint_t))) {
|
2005-07-04 03:31:27 +04:00
|
|
|
opal_output(0, "mca_ptl_mx_proc_create: mca_base_modex_recv: invalid size %d\n", size);
|
2004-10-25 17:33:30 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2005-07-16 03:38:38 +04:00
|
|
|
ptl_proc->proc_addr_count = size / sizeof(mca_ptl_mx_endpoint_t);
|
2004-10-25 17:33:30 +04:00
|
|
|
|
|
|
|
/* allocate space for peer array - one for each exported address */
|
|
|
|
ptl_proc->proc_peers = (mca_ptl_mx_peer_t**)
|
|
|
|
malloc(ptl_proc->proc_addr_count * sizeof(mca_ptl_base_peer_t*));
|
|
|
|
if(NULL == ptl_proc->proc_peers) {
|
|
|
|
OBJ_RELEASE(ptl_proc);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
return ptl_proc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look for an existing MX process instance based on the globally unique
|
|
|
|
* process identifier.
|
|
|
|
*/
|
2005-07-06 20:37:48 +04:00
|
|
|
mca_ptl_mx_proc_t* mca_ptl_mx_proc_lookup(const orte_process_name_t *name)
|
2004-10-25 17:33:30 +04:00
|
|
|
{
|
|
|
|
mca_ptl_mx_proc_t* proc;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&mca_ptl_mx_component.mx_lock);
|
2005-07-03 20:52:32 +04:00
|
|
|
proc = (mca_ptl_mx_proc_t*)opal_hash_table_get_proc(
|
2004-10-25 17:33:30 +04:00
|
|
|
&mca_ptl_mx_component.mx_procs, name);
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&mca_ptl_mx_component.mx_lock);
|
2004-10-25 17:33:30 +04:00
|
|
|
return proc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that this routine must be called with the lock on the process already
|
|
|
|
* held. Insert a ptl instance into the proc array and assign it an address.
|
|
|
|
*/
|
|
|
|
int mca_ptl_mx_proc_insert(mca_ptl_mx_proc_t* ptl_proc, mca_ptl_base_peer_t* ptl_peer)
|
|
|
|
{
|
2005-07-16 03:38:38 +04:00
|
|
|
mx_return_t mx_status;
|
|
|
|
mca_ptl_mx_endpoint_t* remote = &(ptl_proc->proc_addrs[ptl_proc->proc_peer_count]);
|
|
|
|
int num_retry = 0;
|
2004-10-29 02:23:22 +04:00
|
|
|
/* insert into peer array */
|
2004-10-25 17:33:30 +04:00
|
|
|
ptl_peer->peer_proc = ptl_proc;
|
|
|
|
ptl_proc->proc_peers[ptl_proc->proc_peer_count] = ptl_peer;
|
2004-10-28 19:40:46 +04:00
|
|
|
|
2005-07-16 03:38:38 +04:00
|
|
|
/* construct the remote endpoint addr */
|
|
|
|
retry_connect:
|
|
|
|
mx_status = mx_connect( ptl_peer->peer_ptl->mx_endpoint, remote->nic_id, remote->endpoint_id,
|
|
|
|
mca_ptl_mx_component.mx_filter, 1, &(ptl_peer->peer_addr) );
|
|
|
|
if( OMPI_SUCCESS != mx_status ) {
|
|
|
|
if( MX_TIMEOUT == mx_status )
|
|
|
|
if( num_retry++ < 5 )
|
|
|
|
goto retry_connect;
|
|
|
|
opal_output( 0, "mx_connect fail for peer %d remote %lx %d filter %x with error %s\n",
|
|
|
|
ptl_proc->proc_peer_count,
|
|
|
|
remote->nic_id, remote->endpoint_id, mca_ptl_mx_component.mx_filter,
|
|
|
|
mx_strerror(mx_status) );
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
ptl_proc->proc_peer_count++;
|
2004-10-25 17:33:30 +04:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove a peer from the proc array and indicate the address is
|
|
|
|
* no longer in use.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_ptl_mx_proc_remove(mca_ptl_mx_proc_t* ptl_proc, mca_ptl_base_peer_t* ptl_peer)
|
|
|
|
{
|
|
|
|
size_t i;
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_LOCK(&ptl_proc->proc_lock);
|
2004-10-25 17:33:30 +04:00
|
|
|
for(i=0; i<ptl_proc->proc_peer_count; i++) {
|
|
|
|
if(ptl_proc->proc_peers[i] == ptl_peer) {
|
|
|
|
memmove(ptl_proc->proc_peers+i, ptl_proc->proc_peers+i+1,
|
|
|
|
(ptl_proc->proc_peer_count-i-1)*sizeof(mca_ptl_mx_peer_t*));
|
|
|
|
ptl_proc->proc_peer_count--;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-07-04 02:45:48 +04:00
|
|
|
OPAL_THREAD_UNLOCK(&ptl_proc->proc_lock);
|
2004-10-25 17:33:30 +04:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|