Add an MCA parameter for the ELAN MAP ID file.
Fix small memory bugs and track down the final segfault. Still some work to do. This commit was SVN r16117.
Этот коммит содержится в:
родитель
a1f5312afb
Коммит
617ff3a413
@ -79,11 +79,12 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
int i, rc;
|
||||
FILE* file;
|
||||
char* filename;
|
||||
ELAN_BASE * base;
|
||||
ELAN_STATE * state;
|
||||
ELAN_QUEUE * q= NULL;
|
||||
ELAN_TPORT * p= NULL;
|
||||
ELAN_BASE *base;
|
||||
ELAN_STATE *state;
|
||||
ELAN_QUEUE *q = NULL;
|
||||
ELAN_TPORT *p = NULL;
|
||||
|
||||
/* Create the mapid file in the temporary storage */
|
||||
filename = opal_os_path( false, orte_process_info.proc_session_dir, "ELAN_ID", NULL );
|
||||
file = fopen( filename, "w" );
|
||||
for( i = 0; i < (int)nprocs; i++ ) {
|
||||
@ -91,22 +92,26 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
fprintf( file, "%s %d\n", ompi_proc->proc_hostname, i );
|
||||
}
|
||||
fclose( file );
|
||||
|
||||
/* Set the environment before firing up the Elan library */
|
||||
opal_setenv( "LIBELAN_MACHINES_FILE", filename, true, &environ );
|
||||
/* opal_setenv( "LIBELAN_MACHINES_FILE", "/home/tma/machinefile", false, &environ );*/
|
||||
opal_setenv( "MPIRUN_ELANIDMAP_FILE", "/etc/elanidmap", false, &environ );
|
||||
opal_setenv( "MPIRUN_ELANIDMAP_FILE", mca_btl_elan_component.elanidmap_file,
|
||||
false, &environ );
|
||||
|
||||
base = elan_baseInit(0);
|
||||
if (base == NULL)
|
||||
if( NULL == base )
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
state = base->state;
|
||||
if( NULL == state ) {
|
||||
mca_btl_base_error_no_nics( "ELAN", "Quadrics" );
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
elan_gsync(base->allGroup);
|
||||
if ((q = elan_allocQueue(base->state)) == NULL) {
|
||||
|
||||
/* Create the global queue (it's a synchronization point) */
|
||||
if( (q = elan_gallocQueue(base, base->allGroup)) == NULL ) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (!(p = elan_tportInit(base->state,
|
||||
if( !(p = elan_tportInit(base->state,
|
||||
(ELAN_QUEUE *)q,
|
||||
base->tport_nslots,
|
||||
base->tport_smallmsg,
|
||||
@ -117,7 +122,7 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
&base->shm_key,
|
||||
base->shm_fifodepth,
|
||||
base->shm_fragsize,
|
||||
0))) {
|
||||
ELAN_TPORT_SHM_DISABLE | ELAN_TPORT_USERCOPY_DISABLE))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
elan_btl->base = base;
|
||||
@ -126,29 +131,35 @@ int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
|
||||
elan_btl->tport = p;
|
||||
elan_btl->elan_vp = state->vp;
|
||||
elan_btl->elan_nvp = state->nvp;
|
||||
|
||||
for(i = 0; i < (int) nprocs; i++) {
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
mca_btl_elan_proc_t* elan_proc;
|
||||
mca_btl_base_endpoint_t* elan_endpoint;
|
||||
|
||||
/* Don't use Elan for local communications */
|
||||
if( ompi_proc_local_proc == ompi_proc )
|
||||
continue;
|
||||
|
||||
if(NULL == (elan_proc = mca_btl_elan_proc_create(ompi_proc))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
OPAL_THREAD_LOCK(&elan_proc->proc_lock);
|
||||
elan_endpoint = OBJ_NEW(mca_btl_elan_endpoint_t);
|
||||
if(NULL == elan_endpoint) {
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
elan_endpoint->endpoint_btl = elan_btl;
|
||||
|
||||
OPAL_THREAD_LOCK(&elan_proc->proc_lock);
|
||||
rc = mca_btl_elan_proc_insert(elan_proc, elan_endpoint);
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
OBJ_RELEASE(elan_endpoint);
|
||||
OBJ_RELEASE(elan_proc);
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
continue;
|
||||
}
|
||||
ompi_bitmap_set_bit(reachable, i);
|
||||
OPAL_THREAD_UNLOCK(&elan_proc->proc_lock);
|
||||
peers[i] = elan_endpoint;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
@ -216,6 +227,7 @@ mca_btl_base_descriptor_t* mca_btl_elan_alloc(struct mca_btl_base_module_t* btl,
|
||||
{
|
||||
mca_btl_elan_frag_t* frag;
|
||||
int rc;
|
||||
|
||||
if(size <= btl->btl_eager_limit){
|
||||
MCA_BTL_TEMPLATE_FRAG_ALLOC_EAGER(frag, rc);
|
||||
if( OPAL_UNLIKELY(NULL == frag) ) {
|
||||
@ -419,7 +431,8 @@ int mca_btl_elan_send( struct mca_btl_base_module_t* btl,
|
||||
sbuf = (void *)frag->base.des_src->seg_addr.pval;
|
||||
send_len = frag->base.des_src->seg_len;
|
||||
desc = (bufdesc_t * )malloc (sizeof(struct bufdesc_t));
|
||||
desc->eve = elan_tportTxStart (elan_btl->tport, 0, peer, proc,frag->tag, sbuf, send_len) ;
|
||||
desc->eve = elan_tportTxStart (elan_btl->tport, 0, peer, proc,frag->tag,
|
||||
sbuf, send_len) ;
|
||||
/*opal_output( 0, "send message startoing from %d to %d\n", proc, peer );*/
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
@ -503,14 +516,12 @@ int mca_btl_elan_get( mca_btl_base_module_t* btl,
|
||||
void cancel_elanRx(mca_btl_elan_module_t* elan_btl)
|
||||
{
|
||||
bufdesc_t * index = elan_btl->tportFIFOHead;
|
||||
while(index!= NULL)
|
||||
{
|
||||
if(index->frag->type == MCA_BTL_ELAN_HDR_TYPE_RECV)
|
||||
{
|
||||
|
||||
if(elan_tportRxCancel(index->eve))
|
||||
while( NULL != index ) {
|
||||
if( index->frag->type == MCA_BTL_ELAN_HDR_TYPE_RECV ) {
|
||||
if( elan_tportRxCancel(index->eve) ) {
|
||||
MCA_BTL_TEMPLATE_FRAG_RETURN(index->frag);
|
||||
|
||||
}
|
||||
}
|
||||
index = index->next;
|
||||
}
|
||||
@ -519,8 +530,13 @@ void cancel_elanRx(mca_btl_elan_module_t* elan_btl)
|
||||
int mca_btl_elan_finalize( struct mca_btl_base_module_t* btl )
|
||||
{
|
||||
mca_btl_elan_module_t* elan_btl = (mca_btl_elan_module_t*) btl;
|
||||
|
||||
OBJ_DESTRUCT(&elan_btl->elan_lock);
|
||||
|
||||
cancel_elanRx(elan_btl);
|
||||
/* disable the network */
|
||||
elan_disable_network( elan_btl->state );
|
||||
|
||||
free(elan_btl);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -90,6 +90,8 @@ struct mca_btl_elan_component_t {
|
||||
char* elan_mpool_name;
|
||||
/**< name of memory pool */
|
||||
|
||||
char* elanidmap_file; /**< name of the ELANIDMAP file */
|
||||
|
||||
bool leave_pinned;
|
||||
/**< pin memory on first use and leave pinned */
|
||||
|
||||
|
@ -98,8 +98,8 @@ static inline int mca_btl_elan_param_register_int(
|
||||
int mca_btl_elan_component_open(void)
|
||||
{
|
||||
/* initialize state */
|
||||
mca_btl_elan_component.elan_num_btls=0;
|
||||
mca_btl_elan_component.elan_btls=NULL;
|
||||
mca_btl_elan_component.elan_num_btls = 0;
|
||||
mca_btl_elan_component.elan_btls = NULL;
|
||||
|
||||
/* register Elan4 component parameters */
|
||||
mca_btl_elan_component.elan_free_list_num =
|
||||
@ -124,6 +124,9 @@ int mca_btl_elan_component_open(void)
|
||||
mca_btl_base_param_register(&mca_btl_elan_component.super.btl_version,
|
||||
&mca_btl_elan_module.super);
|
||||
|
||||
mca_btl_elan_component.elanidmap_file =
|
||||
mca_btl_elan_param_register_string( "elanidmap", "/etc/elanidmap" );
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -135,8 +138,10 @@ int mca_btl_elan_component_close(void)
|
||||
{
|
||||
if( NULL != mca_btl_elan_component.elan_btls ) {
|
||||
free( mca_btl_elan_component.elan_btls );
|
||||
/* release resources */
|
||||
mca_btl_elan_component.elan_btls = NULL;
|
||||
mca_btl_elan_component.elan_num_btls = 0;
|
||||
|
||||
/* release resources */
|
||||
OBJ_DESTRUCT(&mca_btl_elan_component.elan_procs);
|
||||
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_eager);
|
||||
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_user);
|
||||
@ -196,20 +201,21 @@ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules,
|
||||
NULL ); /* use default allocator */
|
||||
|
||||
|
||||
opal_setenv( "MPIRUN_ELANIDMAP_FILE", "/etc/elanidmap", false, &environ );
|
||||
vpid = orte_process_info.my_name->vpid;
|
||||
|
||||
ompi_modex_send( &mca_btl_elan_component.super.btl_version, &vpid, sizeof(vpid));
|
||||
ompi_modex_send( &mca_btl_elan_component.super.btl_version, &vpid,
|
||||
sizeof(vpid));
|
||||
|
||||
mca_btl_elan_component.elan_num_btls = 1;
|
||||
mca_btl_elan_component.elan_btls = malloc( (mca_btl_elan_component.elan_num_btls) * sizeof(mca_btl_base_module_t*) );
|
||||
mca_btl_elan_component.elan_btls = malloc( mca_btl_elan_component.elan_num_btls * sizeof(mca_btl_base_module_t*) );
|
||||
for( i = count = 0; i < mca_btl_elan_component.elan_num_btls; i++ ) {
|
||||
mca_btl_elan_module_t* btl = malloc (sizeof (mca_btl_elan_module_t));
|
||||
if(NULL == btl)
|
||||
continue;
|
||||
memcpy( btl, &mca_btl_elan_module, sizeof(mca_btl_elan_module_t) );
|
||||
OBJ_CONSTRUCT (&btl->elan_lock, opal_mutex_t);
|
||||
btl->tportFIFOHead=NULL;
|
||||
btl->tportFIFOTail=NULL;
|
||||
btl->tportFIFOHead = NULL;
|
||||
btl->tportFIFOTail = NULL;
|
||||
mca_btl_elan_component.elan_btls[count++] = btl;
|
||||
}
|
||||
mca_btl_elan_component.elan_num_btls = count ;
|
||||
@ -219,7 +225,8 @@ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules,
|
||||
mca_btl_elan_component.elan_num_btls = 0; /* no active BTL modules */
|
||||
return NULL;
|
||||
}
|
||||
memcpy( btls, mca_btl_elan_component.elan_btls, mca_btl_elan_component.elan_num_btls *sizeof(mca_btl_elan_module_t*) );
|
||||
memcpy( btls, mca_btl_elan_component.elan_btls,
|
||||
mca_btl_elan_component.elan_num_btls *sizeof(mca_btl_elan_module_t*) );
|
||||
*num_btl_modules = mca_btl_elan_component.elan_num_btls;
|
||||
return btls;
|
||||
}
|
||||
@ -227,11 +234,7 @@ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules,
|
||||
/*
|
||||
* Elan4 component progress.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
int mca_btl_elan_component_progress()
|
||||
int mca_btl_elan_component_progress( void )
|
||||
{
|
||||
size_t num_progressed = 0, i, no_btls, size;
|
||||
mca_btl_elan_frag_t* frag;
|
||||
@ -245,26 +248,21 @@ int mca_btl_elan_component_progress()
|
||||
if(desc ==NULL)
|
||||
continue;
|
||||
frag = (mca_btl_elan_frag_t*) desc->frag;
|
||||
if(frag!=NULL)
|
||||
{
|
||||
if(frag->type== MCA_BTL_ELAN_HDR_TYPE_SEND )
|
||||
{
|
||||
if( NULL != frag ) {
|
||||
if(frag->type== MCA_BTL_ELAN_HDR_TYPE_SEND ) {
|
||||
/* it's a send */
|
||||
/* call the completion callback */
|
||||
elan_tportTxWait(desc->eve);
|
||||
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS );
|
||||
free(desc);
|
||||
|
||||
}
|
||||
else if(frag->type== MCA_BTL_ELAN_HDR_TYPE_PUT || frag->type== MCA_BTL_ELAN_HDR_TYPE_GET )
|
||||
{
|
||||
} else if( (frag->type == MCA_BTL_ELAN_HDR_TYPE_PUT) ||
|
||||
(frag->type== MCA_BTL_ELAN_HDR_TYPE_GET) ) {
|
||||
/* it's a put*/
|
||||
/* call the completion callback */
|
||||
elan_wait(desc->eve,ELAN_WAIT_EVENT);
|
||||
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS );
|
||||
free(desc);
|
||||
}
|
||||
else{
|
||||
} else {
|
||||
/* and this one is a receive */
|
||||
mca_btl_base_recv_reg_t* reg;
|
||||
reg = &(elan_btl->elan_reg[frag->tag]);
|
||||
@ -282,12 +280,10 @@ int mca_btl_elan_component_progress()
|
||||
desc->frag = frag;
|
||||
desc->next = NULL;
|
||||
OPAL_THREAD_LOCK(&elan_btl->elan_lock);
|
||||
if(elan_btl->tportFIFOTail)
|
||||
{
|
||||
if( elan_btl->tportFIFOTail ) {
|
||||
elan_btl->tportFIFOTail->next = desc;
|
||||
elan_btl->tportFIFOTail=desc;
|
||||
}
|
||||
else{
|
||||
} else {
|
||||
elan_btl->tportFIFOHead = desc;
|
||||
elan_btl->tportFIFOTail = desc;
|
||||
}
|
||||
@ -297,7 +293,6 @@ int mca_btl_elan_component_progress()
|
||||
opal_output( 0, "Something bad happened the frag == NULL\n" );
|
||||
}
|
||||
num_progressed++;
|
||||
|
||||
}
|
||||
|
||||
return num_progressed;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user