Add an MCA parameter for the ELAN MAP ID file.
Fix small memory bugs, and track the final segfault. Still some work to do. This commit was SVN r16117.
Этот коммит содержится в:
родитель
a1f5312afb
Коммит
617ff3a413
@ -70,20 +70,21 @@ mca_btl_elan_module_t mca_btl_elan_module = {
|
||||
extern char** environ;
|
||||
|
||||
int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **ompi_procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
ompi_bitmap_t* reachable )
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **ompi_procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
ompi_bitmap_t* reachable )
|
||||
{
|
||||
mca_btl_elan_module_t* elan_btl = (mca_btl_elan_module_t*)btl;
|
||||
int i, rc;
|
||||
FILE* file;
|
||||
char* filename;
|
||||
ELAN_BASE * base;
|
||||
ELAN_STATE * state;
|
||||
ELAN_QUEUE * q= NULL;
|
||||
ELAN_TPORT * p= NULL;
|
||||
ELAN_BASE *base;
|
||||
ELAN_STATE *state;
|
||||
ELAN_QUEUE *q = NULL;
|
||||
ELAN_TPORT *p = NULL;
|
||||
|
||||
/* Create the mapid file in the temporary storage */
|
||||
filename = opal_os_path( false, orte_process_info.proc_session_dir, "ELAN_ID", NULL );
|
||||
file = fopen( filename, "w" );
|
||||
for( i = 0; i < (int)nprocs; i++ ) {
|
||||
@ -91,22 +92,26 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
fprintf( file, "%s %d\n", ompi_proc->proc_hostname, i );
|
||||
}
|
||||
fclose( file );
|
||||
|
||||
/* Set the environment before firing up the Elan library */
|
||||
opal_setenv( "LIBELAN_MACHINES_FILE", filename, true, &environ );
|
||||
/* opal_setenv( "LIBELAN_MACHINES_FILE", "/home/tma/machinefile", false, &environ );*/
|
||||
opal_setenv( "MPIRUN_ELANIDMAP_FILE", "/etc/elanidmap", false, &environ );
|
||||
opal_setenv( "MPIRUN_ELANIDMAP_FILE", mca_btl_elan_component.elanidmap_file,
|
||||
false, &environ );
|
||||
|
||||
base = elan_baseInit(0);
|
||||
if (base == NULL)
|
||||
if( NULL == base )
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
state = base->state;
|
||||
if( NULL == state ) {
|
||||
mca_btl_base_error_no_nics( "ELAN", "Quadrics" );
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
elan_gsync(base->allGroup);
|
||||
if ((q = elan_allocQueue(base->state)) == NULL) {
|
||||
|
||||
/* Create the global queue (it's a synchronization point) */
|
||||
if( (q = elan_gallocQueue(base, base->allGroup)) == NULL ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (!(p = elan_tportInit(base->state,
|
||||
if( !(p = elan_tportInit(base->state,
|
||||
(ELAN_QUEUE *)q,
|
||||
base->tport_nslots,
|
||||
base->tport_smallmsg,
|
||||
@ -117,7 +122,7 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
&base->shm_key,
|
||||
base->shm_fifodepth,
|
||||
base->shm_fragsize,
|
||||
0))) {
|
||||
ELAN_TPORT_SHM_DISABLE | ELAN_TPORT_USERCOPY_DISABLE))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
elan_btl->base = base;
|
||||
@ -126,29 +131,35 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
elan_btl->tport = p;
|
||||
elan_btl->elan_vp = state->vp;
|
||||
elan_btl->elan_nvp = state->nvp;
|
||||
|
||||
for(i = 0; i < (int) nprocs; i++) {
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
mca_btl_elan_proc_t* elan_proc;
|
||||
mca_btl_base_endpoint_t* elan_endpoint;
|
||||
|
||||
/* Don't use Elan for local communications */
|
||||
if( ompi_proc_local_proc == ompi_proc )
|
||||
continue;
|
||||
|
||||
if(NULL == (elan_proc = mca_btl_elan_proc_create(ompi_proc))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
OPAL_THREAD_LOCK(&elan_proc->proc_lock);
|
||||
elan_endpoint = OBJ_NEW(mca_btl_elan_endpoint_t);
|
||||
if(NULL == elan_endpoint) {
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
elan_endpoint->endpoint_btl = elan_btl;
|
||||
|
||||
OPAL_THREAD_LOCK(&elan_proc->proc_lock);
|
||||
rc = mca_btl_elan_proc_insert(elan_proc, elan_endpoint);
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
OBJ_RELEASE(elan_endpoint);
|
||||
OBJ_RELEASE(elan_proc);
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
continue;
|
||||
}
|
||||
ompi_bitmap_set_bit(reachable, i);
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
peers[i] = elan_endpoint;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
@ -168,9 +179,9 @@ int mca_btl_elan_del_procs( struct mca_btl_base_module_t* btl,
|
||||
*/
|
||||
|
||||
int mca_btl_elan_register( struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata )
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata )
|
||||
{
|
||||
mca_btl_elan_module_t* elan_btl = (mca_btl_elan_module_t*) btl;
|
||||
void * tbuf = NULL;
|
||||
@ -180,24 +191,24 @@ int mca_btl_elan_register( struct mca_btl_base_module_t* btl,
|
||||
elan_btl->elan_reg[tag].cbfunc = cbfunc;
|
||||
elan_btl->elan_reg[tag].cbdata = cbdata;
|
||||
if (NULL != cbfunc) {
|
||||
/* Post the receives if there is no unexpected handler */
|
||||
MCA_BTL_TEMPLATE_FRAG_ALLOC_EAGER(frag, rc );
|
||||
if( NULL == frag ) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
frag->base.des_dst = &(frag->segment);
|
||||
frag->base.des_dst_cnt = 1;
|
||||
frag->base.des_src = NULL;
|
||||
frag->base.des_src_cnt = 0;
|
||||
frag->tag = tag;
|
||||
frag->type = MCA_BTL_ELAN_HDR_TYPE_RECV;
|
||||
tbuf = (void*)(frag+1);
|
||||
send_len = elan_btl->super.btl_eager_limit;
|
||||
desc = (bufdesc_t * )malloc (sizeof(struct bufdesc_t));
|
||||
desc->eve = elan_tportRxStart (elan_btl->tport,0 , 0, 0, 0xffffffff, frag->tag, tbuf,send_len) ;
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
BTL_ELAN_ADD_TO_FIFO(elan_btl, desc);
|
||||
/* Post the receives if there is no unexpected handler */
|
||||
MCA_BTL_TEMPLATE_FRAG_ALLOC_EAGER(frag, rc );
|
||||
if( NULL == frag ) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
frag->base.des_dst = &(frag->segment);
|
||||
frag->base.des_dst_cnt = 1;
|
||||
frag->base.des_src = NULL;
|
||||
frag->base.des_src_cnt = 0;
|
||||
frag->tag = tag;
|
||||
frag->type = MCA_BTL_ELAN_HDR_TYPE_RECV;
|
||||
tbuf = (void*)(frag+1);
|
||||
send_len = elan_btl->super.btl_eager_limit;
|
||||
desc = (bufdesc_t * )malloc (sizeof(struct bufdesc_t));
|
||||
desc->eve = elan_tportRxStart (elan_btl->tport,0 , 0, 0, 0xffffffff, frag->tag, tbuf,send_len) ;
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
BTL_ELAN_ADD_TO_FIFO(elan_btl, desc);
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -216,6 +227,7 @@ mca_btl_base_descriptor_t* mca_btl_elan_alloc(struct mca_btl_base_module_t* btl,
|
||||
{
|
||||
mca_btl_elan_frag_t* frag;
|
||||
int rc;
|
||||
|
||||
if(size <= btl->btl_eager_limit){
|
||||
MCA_BTL_TEMPLATE_FRAG_ALLOC_EAGER(frag, rc);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
@ -416,10 +428,11 @@ int mca_btl_elan_send( struct mca_btl_base_module_t* btl,
|
||||
frag->type = MCA_BTL_ELAN_HDR_TYPE_SEND;
|
||||
peer = endpoint->elan_vp;
|
||||
proc = elan_btl->elan_vp;
|
||||
sbuf = (void *)frag->base.des_src->seg_addr.pval;
|
||||
send_len = frag->base.des_src->seg_len;
|
||||
sbuf = (void *)frag->base.des_src->seg_addr.pval;
|
||||
send_len = frag->base.des_src->seg_len;
|
||||
desc = (bufdesc_t * )malloc (sizeof(struct bufdesc_t));
|
||||
desc->eve = elan_tportTxStart (elan_btl->tport, 0, peer, proc,frag->tag, sbuf, send_len) ;
|
||||
desc->eve = elan_tportTxStart (elan_btl->tport, 0, peer, proc,frag->tag,
|
||||
sbuf, send_len) ;
|
||||
/*opal_output( 0, "send message startoing from %d to %d\n", proc, peer );*/
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
@ -503,15 +516,13 @@ int mca_btl_elan_get( mca_btl_base_module_t* btl,
|
||||
void cancel_elanRx(mca_btl_elan_module_t* elan_btl)
|
||||
{
|
||||
bufdesc_t * index = elan_btl->tportFIFOHead;
|
||||
while(index!= NULL)
|
||||
{
|
||||
if(index->frag->type == MCA_BTL_ELAN_HDR_TYPE_RECV)
|
||||
{
|
||||
|
||||
if(elan_tportRxCancel(index->eve))
|
||||
MCA_BTL_TEMPLATE_FRAG_RETURN(index->frag);
|
||||
|
||||
while( NULL != index ) {
|
||||
if( index->frag->type == MCA_BTL_ELAN_HDR_TYPE_RECV ) {
|
||||
if( elan_tportRxCancel(index->eve) ) {
|
||||
MCA_BTL_TEMPLATE_FRAG_RETURN(index->frag);
|
||||
}
|
||||
}
|
||||
index = index->next;
|
||||
}
|
||||
}
|
||||
@ -519,8 +530,13 @@ void cancel_elanRx(mca_btl_elan_module_t* elan_btl)
|
||||
/**
 * Tear down one Elan BTL module.
 *
 * Steps, in order: destruct the module's elan_lock object, call
 * cancel_elanRx() on the module, disable the Elan network through
 * elan_disable_network() on the module's state, and free the module
 * structure itself.
 *
 * @param btl  Module to finalize; it is cast to mca_btl_elan_module_t*
 *             and released with free(), so it must not be used afterwards.
 * @return     Always OMPI_SUCCESS.
 *
 * NOTE(review): elan_lock is destructed before cancel_elanRx() runs;
 * confirm that nothing on the cancel path needs that lock.
 * NOTE(review): this function does not free the bufdesc_t entries linked
 * from tportFIFOHead - verify whether they are released elsewhere.
 */
int mca_btl_elan_finalize( struct mca_btl_base_module_t* btl )
|
||||
{
|
||||
mca_btl_elan_module_t* elan_btl = (mca_btl_elan_module_t*) btl;
|
||||
|
||||
OBJ_DESTRUCT(&elan_btl->elan_lock);
|
||||
|
||||
/* Walk the module's FIFO via cancel_elanRx() before the network is disabled. */
cancel_elanRx(elan_btl);
|
||||
/* disable the network */
|
||||
elan_disable_network( elan_btl->state );
|
||||
|
||||
/* Release the module structure itself. */
free(elan_btl);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -90,6 +90,8 @@ struct mca_btl_elan_component_t {
|
||||
char* elan_mpool_name;
|
||||
/**< name of memory pool */
|
||||
|
||||
char* elanidmap_file; /**< name of the ELANIDMAP file */
|
||||
|
||||
bool leave_pinned;
|
||||
/**< pin memory on first use and leave pinned */
|
||||
|
||||
|
@ -98,8 +98,8 @@ static inline int mca_btl_elan_param_register_int(
|
||||
int mca_btl_elan_component_open(void)
|
||||
{
|
||||
/* initialize state */
|
||||
mca_btl_elan_component.elan_num_btls=0;
|
||||
mca_btl_elan_component.elan_btls=NULL;
|
||||
mca_btl_elan_component.elan_num_btls = 0;
|
||||
mca_btl_elan_component.elan_btls = NULL;
|
||||
|
||||
/* register Elan4 component parameters */
|
||||
mca_btl_elan_component.elan_free_list_num =
|
||||
@ -123,6 +123,9 @@ int mca_btl_elan_component_open(void)
|
||||
mca_btl_elan_module.super.btl_latency = 5;
|
||||
mca_btl_base_param_register(&mca_btl_elan_component.super.btl_version,
|
||||
&mca_btl_elan_module.super);
|
||||
|
||||
mca_btl_elan_component.elanidmap_file =
|
||||
mca_btl_elan_param_register_string( "elanidmap", "/etc/elanidmap" );
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -135,8 +138,10 @@ int mca_btl_elan_component_close(void)
|
||||
{
|
||||
if( NULL != mca_btl_elan_component.elan_btls ) {
|
||||
free( mca_btl_elan_component.elan_btls );
|
||||
/* release resources */
|
||||
mca_btl_elan_component.elan_btls = NULL;
|
||||
mca_btl_elan_component.elan_num_btls = 0;
|
||||
|
||||
/* release resources */
|
||||
OBJ_DESTRUCT(&mca_btl_elan_component.elan_procs);
|
||||
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_eager);
|
||||
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_user);
|
||||
@ -196,20 +201,21 @@ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules,
|
||||
NULL ); /* use default allocator */
|
||||
|
||||
|
||||
opal_setenv( "MPIRUN_ELANIDMAP_FILE", "/etc/elanidmap", false, &environ );
|
||||
vpid = orte_process_info.my_name->vpid;
|
||||
|
||||
ompi_modex_send( &mca_btl_elan_component.super.btl_version, &vpid, sizeof(vpid));
|
||||
ompi_modex_send( &mca_btl_elan_component.super.btl_version, &vpid,
|
||||
sizeof(vpid));
|
||||
|
||||
mca_btl_elan_component.elan_num_btls = 1;
|
||||
mca_btl_elan_component.elan_btls = malloc( (mca_btl_elan_component.elan_num_btls) * sizeof(mca_btl_base_module_t*) );
|
||||
mca_btl_elan_component.elan_btls = malloc( mca_btl_elan_component.elan_num_btls * sizeof(mca_btl_base_module_t*) );
|
||||
for( i = count = 0; i < mca_btl_elan_component.elan_num_btls; i++ ) {
|
||||
mca_btl_elan_module_t* btl = malloc (sizeof (mca_btl_elan_module_t));
|
||||
if(NULL == btl)
|
||||
continue;
|
||||
memcpy( btl, &mca_btl_elan_module, sizeof(mca_btl_elan_module_t) );
|
||||
OBJ_CONSTRUCT (&btl->elan_lock, opal_mutex_t);
|
||||
btl->tportFIFOHead=NULL;
|
||||
btl->tportFIFOTail=NULL;
|
||||
btl->tportFIFOHead = NULL;
|
||||
btl->tportFIFOTail = NULL;
|
||||
mca_btl_elan_component.elan_btls[count++] = btl;
|
||||
}
|
||||
mca_btl_elan_component.elan_num_btls = count ;
|
||||
@ -219,7 +225,8 @@ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules,
|
||||
mca_btl_elan_component.elan_num_btls = 0; /* no active BTL modules */
|
||||
return NULL;
|
||||
}
|
||||
memcpy( btls, mca_btl_elan_component.elan_btls, mca_btl_elan_component.elan_num_btls *sizeof(mca_btl_elan_module_t*) );
|
||||
memcpy( btls, mca_btl_elan_component.elan_btls,
|
||||
mca_btl_elan_component.elan_num_btls *sizeof(mca_btl_elan_module_t*) );
|
||||
*num_btl_modules = mca_btl_elan_component.elan_num_btls;
|
||||
return btls;
|
||||
}
|
||||
@ -227,11 +234,7 @@ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules,
|
||||
/*
|
||||
* Elan4 component progress.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
int mca_btl_elan_component_progress()
|
||||
int mca_btl_elan_component_progress( void )
|
||||
{
|
||||
size_t num_progressed = 0, i, no_btls, size;
|
||||
mca_btl_elan_frag_t* frag;
|
||||
@ -245,59 +248,51 @@ int mca_btl_elan_component_progress()
|
||||
if(desc ==NULL)
|
||||
continue;
|
||||
frag = (mca_btl_elan_frag_t*) desc->frag;
|
||||
if(frag!=NULL)
|
||||
{
|
||||
if(frag->type== MCA_BTL_ELAN_HDR_TYPE_SEND )
|
||||
{
|
||||
/* it's a send */
|
||||
/* call the completion callback */
|
||||
elan_tportTxWait(desc->eve);
|
||||
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS );
|
||||
free(desc);
|
||||
|
||||
}
|
||||
else if(frag->type== MCA_BTL_ELAN_HDR_TYPE_PUT || frag->type== MCA_BTL_ELAN_HDR_TYPE_GET )
|
||||
{
|
||||
/* it's a put*/
|
||||
/* call the completion callback */
|
||||
elan_wait(desc->eve,ELAN_WAIT_EVENT);
|
||||
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS );
|
||||
free(desc);
|
||||
}
|
||||
else{
|
||||
/* and this one is a receive */
|
||||
mca_btl_base_recv_reg_t* reg;
|
||||
reg = &(elan_btl->elan_reg[frag->tag]);
|
||||
elan_tportRxWait(desc->eve, NULL, NULL, &size);
|
||||
frag->base.des_dst->seg_len = size;
|
||||
reg->cbfunc( &(elan_btl->super), frag->tag, &(frag->base),reg->cbdata );
|
||||
/**
|
||||
* The upper level extract the data from the fragment.
|
||||
* Now we can register the fragment
|
||||
* again with the elan BTL.
|
||||
*/
|
||||
desc->eve = elan_tportRxStart (elan_btl->tport, 0 , 0, 0, 0xffffffff, frag->tag, frag->base.des_dst->seg_addr.pval, mca_btl_elan_module.super.btl_eager_limit) ;
|
||||
/*desc->eve = elan_tportRxStart (elan_btl->tport, ELAN_TPORT_RXANY , 0, 0, 0, 0, frag->base.des_dst->seg_addr.pval, mca_btl_elan_module.super.btl_eager_limit) ;*/
|
||||
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
OPAL_THREAD_LOCK(&elan_btl->elan_lock);
|
||||
if(elan_btl->tportFIFOTail)
|
||||
{
|
||||
elan_btl->tportFIFOTail->next = desc;
|
||||
elan_btl->tportFIFOTail=desc;
|
||||
}
|
||||
else{
|
||||
elan_btl->tportFIFOHead = desc;
|
||||
elan_btl->tportFIFOTail = desc;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
||||
}
|
||||
if( NULL != frag ) {
|
||||
if(frag->type== MCA_BTL_ELAN_HDR_TYPE_SEND ) {
|
||||
/* it's a send */
|
||||
/* call the completion callback */
|
||||
elan_tportTxWait(desc->eve);
|
||||
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS );
|
||||
free(desc);
|
||||
} else if( (frag->type == MCA_BTL_ELAN_HDR_TYPE_PUT) ||
|
||||
(frag->type== MCA_BTL_ELAN_HDR_TYPE_GET) ) {
|
||||
/* it's a put*/
|
||||
/* call the completion callback */
|
||||
elan_wait(desc->eve,ELAN_WAIT_EVENT);
|
||||
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS );
|
||||
free(desc);
|
||||
} else {
|
||||
opal_output( 0, "Something bad happened the frag == NULL\n" );
|
||||
/* and this one is a receive */
|
||||
mca_btl_base_recv_reg_t* reg;
|
||||
reg = &(elan_btl->elan_reg[frag->tag]);
|
||||
elan_tportRxWait(desc->eve, NULL, NULL, &size);
|
||||
frag->base.des_dst->seg_len = size;
|
||||
reg->cbfunc( &(elan_btl->super), frag->tag, &(frag->base),reg->cbdata );
|
||||
/**
|
||||
* The upper level extract the data from the fragment.
|
||||
* Now we can register the fragment
|
||||
* again with the elan BTL.
|
||||
*/
|
||||
desc->eve = elan_tportRxStart (elan_btl->tport, 0 , 0, 0, 0xffffffff, frag->tag, frag->base.des_dst->seg_addr.pval, mca_btl_elan_module.super.btl_eager_limit) ;
|
||||
/*desc->eve = elan_tportRxStart (elan_btl->tport, ELAN_TPORT_RXANY , 0, 0, 0, 0, frag->base.des_dst->seg_addr.pval, mca_btl_elan_module.super.btl_eager_limit) ;*/
|
||||
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
OPAL_THREAD_LOCK(&elan_btl->elan_lock);
|
||||
if( elan_btl->tportFIFOTail ) {
|
||||
elan_btl->tportFIFOTail->next = desc;
|
||||
elan_btl->tportFIFOTail=desc;
|
||||
} else {
|
||||
elan_btl->tportFIFOHead = desc;
|
||||
elan_btl->tportFIFOTail = desc;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
||||
}
|
||||
} else {
|
||||
opal_output( 0, "Something bad happened the frag == NULL\n" );
|
||||
}
|
||||
num_progressed++;
|
||||
|
||||
}
|
||||
|
||||
return num_progressed;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user