checkpoint - works for 2 procs, but not more.
This commit was SVN r17477.
Этот коммит содержится в:
родитель
058e8d5f11
Коммит
1cd8a2e578
@ -157,6 +157,9 @@ BEGIN_C_DECLS
|
||||
/* tag that will be used as unique barrier identifier */
|
||||
long long tag;
|
||||
|
||||
/* barrier phase */
|
||||
int sm2_barrier_phase;
|
||||
|
||||
/* shared memory structure index - will be flip-flopping between structures */
|
||||
int sm_index;
|
||||
|
||||
@ -166,9 +169,6 @@ BEGIN_C_DECLS
|
||||
/* module pointer */
|
||||
struct mca_coll_sm2_module_t *coll_sm2_module;
|
||||
|
||||
/* barrier phase */
|
||||
int sm2_barrier_phase;
|
||||
|
||||
};
|
||||
typedef struct mca_coll_sm2_nb_request_process_private_mem_t
|
||||
mca_coll_sm2_nb_request_process_private_mem_t;
|
||||
|
@ -33,6 +33,10 @@ int mca_coll_sm2_allreduce_intra(void *sbuf, void *rbuf, int count,
|
||||
|
||||
/* get a pointer to the shared-memory working buffer */
|
||||
sm_buffer=alloc_sm2_shared_buffer(sm_module);
|
||||
/* debug
|
||||
fprintf(stderr," HHH sm_buffer %p pid %u \n",sm_buffer,getpid());
|
||||
fflush(stderr);
|
||||
end debug */
|
||||
if( NULL == sm_buffer) {
|
||||
rc=OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto Error;
|
||||
|
@ -116,7 +116,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
|
||||
/* Set my completion flag */
|
||||
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
|
||||
(sm_barrier_region+
|
||||
sm_module->barrier_tree.children_ranks[sm_module->barrier_tree.my_rank] *
|
||||
sm_module->barrier_tree.my_rank*
|
||||
sm_module->segement_size_per_process);
|
||||
sm_address->flag=tag;
|
||||
/* don't need memory barrier here, as we are not setting any other sm
|
||||
|
@ -324,10 +324,7 @@ static int init_sm2_barrier(struct ompi_communicator_t *comm,
|
||||
mca_coll_sm2_module_t *module) {
|
||||
|
||||
/*local variables */
|
||||
int comm_size, my_rank, tree_order, rc;
|
||||
/*debug */
|
||||
int i,j;
|
||||
/* end debug */
|
||||
int i,j,comm_size, my_rank, tree_order, rc;
|
||||
|
||||
/* get order of fan-in and fan-out tree */
|
||||
tree_order=component->order_barrier_tree;
|
||||
@ -344,18 +341,6 @@ static int init_sm2_barrier(struct ompi_communicator_t *comm,
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
goto Error;
|
||||
}
|
||||
/* debug */
|
||||
fprintf(stderr," CCCC my rank %d n_parents %d parent %d\n",
|
||||
my_rank,module->barrier_tree.n_parents,
|
||||
module->barrier_tree.parent_rank);
|
||||
fprintf(stderr," CCCC my rank %d n_children %d :: ",
|
||||
my_rank,module->barrier_tree.n_children);
|
||||
for (i=0 ; i < module->barrier_tree.n_children; i++ ) {
|
||||
fprintf(stderr," %d ",module->barrier_tree.children_ranks[i]);
|
||||
}
|
||||
fprintf(stderr," \n");
|
||||
fflush(stderr);
|
||||
/* end debug */
|
||||
|
||||
/* Allocate barrier control structures - allocating one barrier structure
|
||||
* per memory bank. Allocating two shared memory regions per bank. */
|
||||
@ -708,40 +693,45 @@ char *alloc_sm2_shared_buffer(mca_coll_sm2_module_t *module)
|
||||
* be initiated when a process is done with the buffer */
|
||||
if( module->sm2_allocated_buffer_index ==
|
||||
module->sm2_first_buffer_index_next_bank) {
|
||||
/*
|
||||
* complete non-blocking barrier, so this memory bank will
|
||||
* be available for use.
|
||||
*/
|
||||
memory_bank_index= module->sm2_allocated_buffer_index /
|
||||
module->sm2_module_num_regions_per_bank;
|
||||
request=&(module->barrier_request[memory_bank_index]);
|
||||
while ( NB_BARRIER_DONE != request->sm2_barrier_phase ) {
|
||||
rc=mca_coll_sm2_nbbarrier_intra_progress(module->module_comm,
|
||||
request,
|
||||
(struct mca_coll_base_module_1_1_0_t *)module);
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
return NULL;
|
||||
}
|
||||
/* set the request to inactive, and point current_request_index
|
||||
* to the request for the next memory bank
|
||||
*/
|
||||
/* set request to inactive */
|
||||
request->sm2_barrier_phase==NB_BARRIER_INACTIVE;
|
||||
/* move pointer to next request that needs to be completed */
|
||||
module->current_request_index=memory_bank_index+1;
|
||||
/* wrap around */
|
||||
if( module->current_request_index ==
|
||||
module->sm2_module_num_memory_banks ) {
|
||||
module->current_request_index=0;
|
||||
}
|
||||
}
|
||||
|
||||
/* re-set counter for next bank */
|
||||
module->sm2_first_buffer_index_next_bank +=
|
||||
memory_bank_index= module->sm2_allocated_buffer_index /
|
||||
module->sm2_module_num_regions_per_bank;
|
||||
if( module->sm2_first_buffer_index_next_bank ==
|
||||
module->sm2_module_num_memory_banks ) {
|
||||
module->sm2_module_num_memory_banks=0;
|
||||
|
||||
if ( NB_BARRIER_INACTIVE !=
|
||||
module->barrier_request[memory_bank_index].sm2_barrier_phase) {
|
||||
/*
|
||||
* complete non-blocking barrier, so this memory bank will
|
||||
* be available for use.
|
||||
*/
|
||||
request=&(module->barrier_request[memory_bank_index]);
|
||||
while ( NB_BARRIER_DONE != request->sm2_barrier_phase ) {
|
||||
rc=mca_coll_sm2_nbbarrier_intra_progress(module->module_comm,
|
||||
request,
|
||||
(struct mca_coll_base_module_1_1_0_t *)module);
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
return NULL;
|
||||
}
|
||||
/* set the request to inactive, and point current_request_index
|
||||
* to the request for the next memory bank
|
||||
*/
|
||||
/* set request to inactive */
|
||||
request->sm2_barrier_phase==NB_BARRIER_INACTIVE;
|
||||
/* move pointer to next request that needs to be completed */
|
||||
module->current_request_index=memory_bank_index+1;
|
||||
/* wrap around */
|
||||
if( module->current_request_index ==
|
||||
module->sm2_module_num_memory_banks ) {
|
||||
module->current_request_index=0;
|
||||
}
|
||||
}
|
||||
|
||||
/* re-set counter for next bank */
|
||||
module->sm2_first_buffer_index_next_bank +=
|
||||
module->sm2_module_num_regions_per_bank;
|
||||
if( module->sm2_first_buffer_index_next_bank ==
|
||||
module->sm2_module_num_memory_banks ) {
|
||||
module->sm2_module_num_memory_banks=0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -776,7 +766,7 @@ int free_sm2_shared_buffer(mca_coll_sm2_module_t *module)
|
||||
&(module->barrier_request[module->current_request_index]),
|
||||
(struct mca_coll_base_module_1_1_0_t *)module);
|
||||
if( OMPI_SUCCESS != rc ) {
|
||||
return NULL;
|
||||
return rc;
|
||||
}
|
||||
/* if barrier is completed, transition it to inactive, and point to
|
||||
* the request object for the next bank
|
||||
@ -809,7 +799,7 @@ int free_sm2_shared_buffer(mca_coll_sm2_module_t *module)
|
||||
module->sm2_module_num_regions_per_bank;
|
||||
request=&(module->barrier_request[memory_bank_index]);
|
||||
rc=mca_coll_sm2_nbbarrier_intra(module->module_comm,
|
||||
request,module);
|
||||
request,(mca_coll_base_module_1_1_0_t *)module);
|
||||
if( OMPI_SUCCESS !=rc ) {
|
||||
return rc;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user