1
1

checkpoint - works for 2 procs, but not more.

This commit was SVN r17477.
Этот коммит содержится в:
Rich Graham 2008-02-17 05:21:58 +00:00
родитель 058e8d5f11
Коммит 1cd8a2e578
4 изменённых файлов: 48 добавлений и 54 удалений

Просмотреть файл

@ -157,6 +157,9 @@ BEGIN_C_DECLS
/* tag that will be used as unique barrier identifier */
long long tag;
/* barrier phase */
int sm2_barrier_phase;
/* shared memory structure index - will be flip-flopping between structures */
int sm_index;
@ -166,9 +169,6 @@ BEGIN_C_DECLS
/* module pointer */
struct mca_coll_sm2_module_t *coll_sm2_module;
/* barrier phase */
int sm2_barrier_phase;
};
typedef struct mca_coll_sm2_nb_request_process_private_mem_t
mca_coll_sm2_nb_request_process_private_mem_t;

Просмотреть файл

@ -33,6 +33,10 @@ int mca_coll_sm2_allreduce_intra(void *sbuf, void *rbuf, int count,
/* get a pointer to the shared-memory working buffer */
sm_buffer=alloc_sm2_shared_buffer(sm_module);
/* debug
fprintf(stderr," HHH sm_buffer %p pid %u \n",sm_buffer,getpid());
fflush(stderr);
end debug */
if( NULL == sm_buffer) {
rc=OMPI_ERR_OUT_OF_RESOURCE;
goto Error;

Просмотреть файл

@ -116,7 +116,7 @@ int mca_coll_sm2_nbbarrier_intra(struct ompi_communicator_t *comm,
/* Set my completion flag */
sm_address=(mca_coll_sm2_nb_request_process_shared_mem_t *)
(sm_barrier_region+
sm_module->barrier_tree.children_ranks[sm_module->barrier_tree.my_rank] *
sm_module->barrier_tree.my_rank*
sm_module->segement_size_per_process);
sm_address->flag=tag;
/* don't need memory barrier here, as we are not setting any other sm

Просмотреть файл

@ -324,10 +324,7 @@ static int init_sm2_barrier(struct ompi_communicator_t *comm,
mca_coll_sm2_module_t *module) {
/*local variables */
int comm_size, my_rank, tree_order, rc;
/*debug */
int i,j;
/* end debug */
int i,j,comm_size, my_rank, tree_order, rc;
/* get order of fan-in and fan-out tree */
tree_order=component->order_barrier_tree;
@ -344,18 +341,6 @@ static int init_sm2_barrier(struct ompi_communicator_t *comm,
if( OMPI_SUCCESS != rc ) {
goto Error;
}
/* debug */
fprintf(stderr," CCCC my rank %d n_parents %d parent %d\n",
my_rank,module->barrier_tree.n_parents,
module->barrier_tree.parent_rank);
fprintf(stderr," CCCC my rank %d n_children %d :: ",
my_rank,module->barrier_tree.n_children);
for (i=0 ; i < module->barrier_tree.n_children; i++ ) {
fprintf(stderr," %d ",module->barrier_tree.children_ranks[i]);
}
fprintf(stderr," \n");
fflush(stderr);
/* end debug */
/* Allocate barrier control structures - allocating one barrier structure
* per memory bank. Allocating two shared memory regions per bank. */
@ -708,40 +693,45 @@ char *alloc_sm2_shared_buffer(mca_coll_sm2_module_t *module)
* be initiated when a process is done with the buffer */
if( module->sm2_allocated_buffer_index ==
module->sm2_first_buffer_index_next_bank) {
/*
* complete non-blocking barrier, so this memory bank will
* be available for use.
*/
memory_bank_index= module->sm2_allocated_buffer_index /
module->sm2_module_num_regions_per_bank;
request=&(module->barrier_request[memory_bank_index]);
while ( NB_BARRIER_DONE != request->sm2_barrier_phase ) {
rc=mca_coll_sm2_nbbarrier_intra_progress(module->module_comm,
request,
(struct mca_coll_base_module_1_1_0_t *)module);
if( OMPI_SUCCESS != rc ) {
return NULL;
}
/* set the request to inactive, and point current_request_index
* to the request for the next memory bank
*/
/* set request to inactive */
request->sm2_barrier_phase==NB_BARRIER_INACTIVE;
/* move pointer to next request that needs to be completed */
module->current_request_index=memory_bank_index+1;
/* wrap around */
if( module->current_request_index ==
module->sm2_module_num_memory_banks ) {
module->current_request_index=0;
}
}
/* re-set counter for next bank */
module->sm2_first_buffer_index_next_bank +=
memory_bank_index= module->sm2_allocated_buffer_index /
module->sm2_module_num_regions_per_bank;
if( module->sm2_first_buffer_index_next_bank ==
module->sm2_module_num_memory_banks ) {
module->sm2_module_num_memory_banks=0;
if ( NB_BARRIER_INACTIVE !=
module->barrier_request[memory_bank_index].sm2_barrier_phase) {
/*
* complete non-blocking barrier, so this memory bank will
* be available for use.
*/
request=&(module->barrier_request[memory_bank_index]);
while ( NB_BARRIER_DONE != request->sm2_barrier_phase ) {
rc=mca_coll_sm2_nbbarrier_intra_progress(module->module_comm,
request,
(struct mca_coll_base_module_1_1_0_t *)module);
if( OMPI_SUCCESS != rc ) {
return NULL;
}
/* set the request to inactive, and point current_request_index
* to the request for the next memory bank
*/
/* set request to inactive */
request->sm2_barrier_phase==NB_BARRIER_INACTIVE;
/* move pointer to next request that needs to be completed */
module->current_request_index=memory_bank_index+1;
/* wrap around */
if( module->current_request_index ==
module->sm2_module_num_memory_banks ) {
module->current_request_index=0;
}
}
/* re-set counter for next bank */
module->sm2_first_buffer_index_next_bank +=
module->sm2_module_num_regions_per_bank;
if( module->sm2_first_buffer_index_next_bank ==
module->sm2_module_num_memory_banks ) {
module->sm2_module_num_memory_banks=0;
}
}
}
@ -776,7 +766,7 @@ int free_sm2_shared_buffer(mca_coll_sm2_module_t *module)
&(module->barrier_request[module->current_request_index]),
(struct mca_coll_base_module_1_1_0_t *)module);
if( OMPI_SUCCESS != rc ) {
return NULL;
return rc;
}
/* if barrier is completed, transition it to inactive, and point to
* the request object for the next bank
@ -809,7 +799,7 @@ int free_sm2_shared_buffer(mca_coll_sm2_module_t *module)
module->sm2_module_num_regions_per_bank;
request=&(module->barrier_request[memory_bank_index]);
rc=mca_coll_sm2_nbbarrier_intra(module->module_comm,
request,module);
request,(mca_coll_base_module_1_1_0_t *)module);
if( OMPI_SUCCESS !=rc ) {
return rc;
}