1
1
openmpi/ompi/mca/bcol/basesmuma/bcol_basesmuma_setup.c
2013-02-05 21:52:55 +00:00

598 строки
20 KiB
C

/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/patterns/comm/coll_ops.h"
#include "opal/class/opal_object.h"
#include "opal/dss/dss.h"
#include "bcol_basesmuma.h"
/*
 * Forward declaration: the definition appears further down in this file.
 * It fills in the sm_buffer_mgmt object passed as ctl_mgmt and returns an
 * OMPI status code.
 */
int base_bcol_basesmuma_setup_ctl_struct(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *cs,
sm_buffer_mgmt *ctl_mgmt);
/*
 * This is the new one, uses the pml allgather.
 *
 * Exchange one (group index, memory offset) record with every member of the
 * sub-group and store each peer's offset in result_array.
 *
 * Each rank packs its my_index (int) followed by mem_offset (uint64_t) into
 * a byte record, the records are gathered with comm_allgather_pml() as
 * MPI_BYTE, and for every received record the offset is written to
 * result_array[SM_ARRAY_INDEX(leading_dim, 0, index_in_group)].
 *
 * @param sm_bcol_module  module whose sbgp_partner_module describes the group
 * @param result_array    output table receiving the peers' offsets
 * @param mem_offset      this rank's offset to publish
 * @param loop_limit      unused by this implementation
 * @param leading_dim     leading dimension handed to SM_ARRAY_INDEX
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if a staging buffer cannot
 *         be allocated, or the error returned by comm_allgather_pml().
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret = OMPI_SUCCESS, i;
    int count;
    int index_in_group;
    char *send_buff = NULL;
    char *recv_buff = NULL;
    uint64_t rem_mem_offset;

    /* kept in the signature for interface compatibility only */
    (void) loop_limit;

    /* one record = group index followed by the 64-bit offset */
    count = sizeof(uint64_t) + sizeof(int);

    /* Allocate the staging buffers; the size is computed in size_t so a
     * large group cannot overflow the int multiplication. */
    send_buff = (char *) malloc((size_t) count);
    recv_buff = (char *) malloc((size_t) count *
            (size_t) sm_bcol_module->super.sbgp_partner_module->group_size);
    if (NULL == send_buff || NULL == recv_buff) {
        /* Previously the NULL pointers were handed straight to memcpy and
         * the allgather.  Bail out instead. */
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto ERROR;
    }

    /* exchange the base pointer for the controls structures - gather
     * every one else's information.
     */

    /* pack my index in the group, then the offset of the allocated region */
    memcpy((void *) send_buff,
           (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index),
           sizeof(int));
    memcpy((void *) (send_buff + sizeof(int)), (void *) &(mem_offset),
           sizeof(uint64_t));

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    ret = comm_allgather_pml((void *) send_buff, (void *) recv_buff, count,
            MPI_BYTE,
            sm_bcol_module->super.sbgp_partner_module->my_index,
            sm_bcol_module->super.sbgp_partner_module->group_size,
            sm_bcol_module->super.sbgp_partner_module->group_list,
            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != ret) {
        goto ERROR;
    }

    /* get the control structure offsets within the shared memory
     * region and populate the control structures - we do not assume
     * any symmetry in memory layout of each process
     */

    /* loop over the procs in the group */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        int array_id;
        const char *record = recv_buff + (size_t) i * (size_t) count;

        /* get this peer's index in the group */
        memcpy((void *) &index_in_group, (const void *) record, sizeof(int));

        /* get the offset */
        memcpy((void *) &rem_mem_offset,
               (const void *) (record + sizeof(int)), sizeof(uint64_t));

        array_id = SM_ARRAY_INDEX(leading_dim, 0, index_in_group);
        /* go through uintptr_t so the integer-to-pointer conversion is
         * well formed even where pointers are narrower than 64 bits */
        result_array[array_id] = (void *) (uintptr_t) rem_mem_offset;
    }

ERROR:
    /* clean up - free(NULL) is a no-op, so no guards are needed */
    free(send_buff);
    free(recv_buff);

    return ret;
}
#if 0
/*
 * Disabled (compiled out) earlier implementation of
 * base_bcol_basesmuma_exchange_offsets().  It packs (my_index, mem_offset)
 * into an opal_buffer_t, gathers the buffers with
 * ompi_rte_allgather_list() over the list of group peers, and unpacks one
 * (index, offset) pair per group member into result_array.
 *
 * Returns OMPI_SUCCESS or the first error reported by the pack, unpack or
 * allgather calls; OMPI_ERROR if a buffer object could not be created.
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret = OMPI_SUCCESS, i, dummy;
    int index_in_group, pcnt;
    opal_list_t peers;
    ompi_namelist_t *peer;
    ompi_proc_t *proc_temp, *my_id;
    opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t);
    opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t);
    uint64_t rem_mem_offset;

    /* exchange the base pointer for the controls structures - gather
     * every one else's information.
     */

    /* get list of procs that will participate in the communication */
    OBJ_CONSTRUCT(&peers, opal_list_t);
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        /* get the proc info */
        proc_temp = ompi_comm_peer_lookup(
                sm_bcol_module->super.sbgp_partner_module->group_comm,
                sm_bcol_module->super.sbgp_partner_module->group_list[i]);
        peer = OBJ_NEW(ompi_namelist_t);
        peer->name.jobid = proc_temp->proc_name.jobid;
        peer->name.vpid = proc_temp->proc_name.vpid;
        /* this is with the new field called "super" in ompi_namelist_t struct */
        opal_list_append(&peers, &peer->super);
    }

    /* pack up the data into the allgather send buffer */
    if (NULL == send_buffer || NULL == recv_buffer) {
        fprintf(stderr,"Cannot allocate memory for sbuffer or rbuffer \n");
        fflush(stderr);
        ret = OMPI_ERROR;
        goto CLEANUP;
    }

    /* get my proc information */
    my_id = ompi_proc_local();

    /* pack my information */
    ret = opal_dss.pack(send_buffer,
            &(sm_bcol_module->super.sbgp_partner_module->my_index), 1, OPAL_UINT32);
    if (OMPI_SUCCESS != ret) {
        /* report before jumping: the message used to sit after the goto
         * and was unreachable */
        fprintf(stderr,"ORTE error packing my_index!!\n");
        fflush(stderr);
        goto CLEANUP;
    }

    /* pack the offset of the allocated region */
    ret = opal_dss.pack(send_buffer, &(mem_offset), 1, OPAL_UINT64);
    if (OMPI_SUCCESS != ret) {
        goto CLEANUP;
    }

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) {
        fprintf(stderr,"ompi_rte_allgather_list returned error %d\n", ret);
        fflush(stderr);
        goto CLEANUP;
    }

    /* unpack the dummy */
    pcnt = 1;
    ret = opal_dss.unpack(recv_buffer, &dummy, &pcnt, OPAL_INT32);
    if (OMPI_SUCCESS != ret) {
        fprintf(stderr,"unpack returned error %d for dummy \n",ret);
        fflush(stderr);
        goto CLEANUP;
    }

    /* get the control structure offsets within the shared memory
     * region and populate the control structures - we do not assume
     * any symmetry in memory layout of each process
     */

    /* loop over the procs in the group */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        int array_id;

        pcnt = 1;
        ret = opal_dss.unpack(recv_buffer, &index_in_group, &pcnt, OPAL_UINT32);
        if (OMPI_SUCCESS != ret) {
            fprintf(stderr,"unpack returned error %d for remote index_in_group \n",ret);
            fflush(stderr);
            goto CLEANUP;
        }

        /* get the offset */
        pcnt = 1;
        ret = opal_dss.unpack(recv_buffer, &rem_mem_offset, &pcnt, OPAL_UINT64);
        if (OMPI_SUCCESS != ret) {
            fprintf(stderr,"unpack returned error %d for remote memory offset \n",ret);
            fflush(stderr);
            goto CLEANUP;
        }

        array_id = SM_ARRAY_INDEX(leading_dim, 0, index_in_group);
        result_array[array_id] = (void *)rem_mem_offset;
    }

CLEANUP:
    /* Shared by the success and failure paths (they were two identical
     * copies before): drain and destroy the peer list, release buffers. */
    peer = (ompi_namelist_t *)opal_list_remove_first(&peers);
    while (NULL != peer) {
        OBJ_RELEASE(peer);
        peer = (ompi_namelist_t *)opal_list_remove_first(&peers);
    }
    OBJ_DESTRUCT(&peers);

    if (send_buffer) {
        OBJ_RELEASE(send_buffer);
    }
    if (recv_buffer) {
        OBJ_RELEASE(recv_buffer);
    }

    return ret;
}
#endif
/*
 * Build the table of control-structure addresses for every rank in the
 * sub-group and reset this rank's own control structures.
 *
 * Steps:
 *   1. compute the offset of data_blk->data from the control segment's
 *      data_addr and allgather one such offset per rank into
 *      ctl_mgmt->ctl_buffs;
 *   2. turn each gathered offset into an address by adding the base of the
 *      corresponding mapping, then derive the remaining entries for that
 *      rank by stepping sizeof(mca_bcol_basesmuma_ctl_struct_t) at a time;
 *   3. initialize the fields of every control structure owned by this rank.
 *
 * NOTE(review): the offset is taken relative to data_addr but rebased on
 * map_addr; this is only consistent if the two coincide - confirm.
 *
 * Returns OMPI_SUCCESS or the error reported by comm_allgather_pml().
 */
static int base_bcol_basesmuma_exchange_ctl_params(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    mca_bcol_basesmuma_component_t *cs,
    sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk)
{
    int rc;
    int rank, slot;
    int n_slots, stride;
    uint64_t my_offset;
    unsigned char *mapping_base;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl;

    /* data block base offset in the mapped file */
    my_offset = (uint64_t)(data_blk->data) -
                (uint64_t)cs->sm_ctl_structs->data_addr;

    /* number of buffers in data block */
    n_slots = cs->basesmuma_num_mem_banks + ctl_mgmt->number_of_buffs;
    stride = ctl_mgmt->size_of_group;

    /* every rank contributes one offset; results land in ctl_buffs */
    rc = comm_allgather_pml(&my_offset, ctl_mgmt->ctl_buffs, 1,
            MPI_LONG_LONG_INT,
            sm_bcol_module->super.sbgp_partner_module->my_index,
            sm_bcol_module->super.sbgp_partner_module->group_size,
            sm_bcol_module->super.sbgp_partner_module->group_list,
            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

#if 0
    rc = base_bcol_basesmuma_exchange_offsets(sm_bcol_module,
            (void **)ctl_mgmt->ctl_buffs, my_offset, n_slots, stride);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
#endif

    /* convert memory offset to virtual address in current rank */
    for (rank = 0;
         rank < sm_bcol_module->super.sbgp_partner_module->group_size;
         rank++) {
        int first_id = SM_ARRAY_INDEX(stride, 0, rank);

        /* pick the base of the mapping that holds this rank's structures */
        if (rank != sm_bcol_module->super.sbgp_partner_module->my_index) {
            mapping_base =
                sm_bcol_module->ctl_backing_files_info[rank]->sm_mmap->map_addr;
        } else {
            /* me */
            mapping_base = cs->sm_ctl_structs->map_addr;
        }

        ctl_mgmt->ctl_buffs[first_id] = (void *)
            (((uint64_t)ctl_mgmt->ctl_buffs[first_id]) + (uint64_t)mapping_base);

        /* each following entry sits one control struct past its predecessor */
        for (slot = 1; slot < n_slots; slot++) {
            int prev_id = SM_ARRAY_INDEX(stride, (slot - 1), rank);
            int this_id = SM_ARRAY_INDEX(stride, slot, rank);

            ctl_mgmt->ctl_buffs[this_id] = (void *)
                ((uint64_t)(ctl_mgmt->ctl_buffs[prev_id]) +
                 (uint64_t)sizeof(mca_bcol_basesmuma_ctl_struct_t));
        }
    }

    /* initialize my control structures */
    for (slot = 0; slot < n_slots; slot++) {
        int me = sm_bcol_module->super.sbgp_partner_module->my_index;
        int my_id = SM_ARRAY_INDEX(stride, slot, me);

        my_ctl = (mca_bcol_basesmuma_ctl_struct_t *) ctl_mgmt->ctl_buffs[my_id];

        /* initialize the data structures - RLG, this is only one data
         * structure that needs to be initialized, more are missing */
        my_ctl->sequence_number = -1;
        my_ctl->flag = -1;
        my_ctl->index = 0;
        my_ctl->src_ptr = NULL;
    }

    return rc;
}
/*
 * Initialize the control-structure management object ctl_mgmt for this
 * module.
 *
 * The function
 *   - fills in the bank/buffer counts, their log2 values and the index mask
 *     from the component settings in cs;
 *   - allocates ctl_mgmt->ctl_buffs, the table of control-structure
 *     pointers ((number_of_buffs + num_mem_banks) * size_of_group entries);
 *   - calls bcol_basesmuma_smcm_allgather_connection() to exchange the
 *     backing-file information with the other group members;
 *   - allocates and fills sm_bcol_module->shared_memory_scratch_space with
 *     one scratch pointer per group member;
 *   - allocates and initializes one mem_bank_management_t per memory bank.
 *
 * @param sm_bcol_module  module being set up
 * @param cs              component holding the shared-memory configuration
 * @param ctl_mgmt        management object to initialize
 *
 * @return OMPI_SUCCESS; OMPI_ERR_OUT_OF_RESOURCE or OMPI_ERROR when an
 *         allocation fails; or the error returned by
 *         bcol_basesmuma_smcm_allgather_connection().  On failure the
 *         arrays allocated by this call are released and their pointers
 *         reset to NULL.
 */
int base_bcol_basesmuma_setup_ctl_struct(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    mca_bcol_basesmuma_component_t *cs,
    sm_buffer_mgmt *ctl_mgmt)
{
    int ret = OMPI_SUCCESS, i, n_ctl, n_levels;
    int n_ctl_structs;
    size_t malloc_size;
    bcol_basesmuma_smcm_file_t input_file;

    /*
     * set my no user-data control structures
     */

    /* number of banks and regions per bank are already a power of 2 */
    n_ctl_structs = cs->basesmuma_num_mem_banks *
        cs->basesmuma_num_regions_per_bank;

    /* initialize the control structure management struct -
     * for collectives without user data
     *---------------------------------------------------------------
     */
    ctl_mgmt->number_of_buffs = n_ctl_structs;
    ctl_mgmt->num_mem_banks = cs->basesmuma_num_mem_banks;
    ctl_mgmt->num_buffs_per_mem_bank = cs->basesmuma_num_regions_per_bank;
    ctl_mgmt->size_of_group =
        sm_bcol_module->super.sbgp_partner_module->group_size;

    roundup_to_power_radix(2, cs->basesmuma_num_regions_per_bank, &n_levels);
    ctl_mgmt->log2_num_buffs_per_mem_bank = n_levels;

    roundup_to_power_radix(2, n_ctl_structs, &n_levels);
    ctl_mgmt->log2_number_of_buffs = n_levels;

    ctl_mgmt->mask = n_ctl_structs - 1;
    sm_bcol_module->super.n_poll_loops = cs->n_poll_loops;

    /* one pointer per (buffer or bank-recycling slot, group member) pair */
    malloc_size =
        (ctl_mgmt->number_of_buffs +
         ctl_mgmt->num_mem_banks) *
        ctl_mgmt->size_of_group *
        sizeof(void *);
    ctl_mgmt->ctl_buffs = malloc(malloc_size);
    if (!ctl_mgmt->ctl_buffs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* exchange remote addressing information */
    input_file.file_name = cs->sm_ctl_structs->map_path;
    input_file.size = cs->sm_ctl_structs->map_size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = CACHE_LINE_SIZE;
    input_file.mpool_size = cs->sm_ctl_structs->map_size;
    ret = bcol_basesmuma_smcm_allgather_connection(
            sm_bcol_module,
            sm_bcol_module->super.sbgp_partner_module,
            &(cs->sm_connections_list),
            &(sm_bcol_module->ctl_backing_files_info),
            sm_bcol_module->super.sbgp_partner_module->group_comm,
            input_file, cs->clt_base_fname,
            false);
    if (OMPI_SUCCESS != ret) {
        /* ctl_backing_files_info is dereferenced below; do not continue
         * when the connection exchange failed (this status used to be
         * ignored) */
        goto ERR_FREE_CTL_BUFFS;
    }

    /* fill in the pointer to other ranks scratch shared memory */
    sm_bcol_module->shared_memory_scratch_space =
        malloc(sizeof(void *) *
               sm_bcol_module->super.sbgp_partner_module->group_size);
    if (!sm_bcol_module->shared_memory_scratch_space) {
        fprintf(stderr,"Cannot allocate memory for shared_memory_scratch_space. \n");
        fflush(stderr);
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto ERR_FREE_CTL_BUFFS;
    }
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        if (i == sm_bcol_module->super.sbgp_partner_module->my_index) {
            /* local file data is not cached in this list */
            continue;
        }
        /* NOTE(review): the base used here is the sm_mmap pointer itself,
         * whereas the local entry below and the control-structure address
         * table use a map_addr field - confirm which one is intended. */
        sm_bcol_module->shared_memory_scratch_space[i] = (void *)(
            (char *)(sm_bcol_module->ctl_backing_files_info[i]->sm_mmap) +
            cs->scratch_offset_from_base_ctl_file);
    }
    i = sm_bcol_module->super.sbgp_partner_module->my_index;
    sm_bcol_module->shared_memory_scratch_space[i] = (void *)(
        (char *)(cs->sm_ctl_structs->map_addr) +
        cs->scratch_offset_from_base_ctl_file);

    /*
     * setup the no-data buffer management data
     */
    n_ctl = ctl_mgmt->num_mem_banks;
    ctl_mgmt->ctl_buffs_mgmt = (mem_bank_management_t *)
        malloc(sizeof(mem_bank_management_t) * n_ctl);
    if (!ctl_mgmt->ctl_buffs_mgmt) {
        /* set the status first so the message reports the real error */
        ret = OMPI_ERROR;
        fprintf(stderr,"Cannot allocate memory for ctl_buffs_mgmt. ret = %d \n",ret);
        fflush(stderr);
        goto ERR_FREE_SCRATCH;
    }

    /* initialize each individual element */
    for (i = 0; i < n_ctl; i++) {
        opal_list_item_t *item;
        opal_mutex_t *mutex_ptr;

        ctl_mgmt->ctl_buffs_mgmt[i].bank_gen_counter = 0;
        ctl_mgmt->ctl_buffs_mgmt[i].available_buffers =
            ctl_mgmt->num_buffs_per_mem_bank;
        ctl_mgmt->ctl_buffs_mgmt[i].number_of_buffers =
            ctl_mgmt->num_buffs_per_mem_bank;
        ctl_mgmt->ctl_buffs_mgmt[i].n_buffs_freed = 0;

        mutex_ptr = &(ctl_mgmt->ctl_buffs_mgmt[i].mutex);
        OBJ_CONSTRUCT(mutex_ptr, opal_mutex_t);

        ctl_mgmt->ctl_buffs_mgmt[i].index_shared_mem_ctl_structs = i;

        item = (opal_list_item_t *)&(ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc);
        OBJ_CONSTRUCT(item, opal_list_item_t);
        ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.sm_module =
            sm_bcol_module;
        ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.pool_index = i;
        /* get the sm_buffer_mgmt pointer for the control structures */
        ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.coll_buff = ctl_mgmt;
        ctl_mgmt->ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor =
            NULL;
    }

    return ret;

ERR_FREE_SCRATCH:
    /* only reached after this call allocated the scratch table */
    free(sm_bcol_module->shared_memory_scratch_space);
    sm_bcol_module->shared_memory_scratch_space = NULL;

ERR_FREE_CTL_BUFFS:
    free(ctl_mgmt->ctl_buffs);
    ctl_mgmt->ctl_buffs = NULL;

    return ret;
}
/*
 * Initialize the internal scratch buffers and control structures used by
 * the module, together with the payload buffer management structures.
 *
 * On the first call for a component (cs->sm_ctl_structs not yet set) the
 * shared-memory control segment is allocated, carved into equally sized
 * chunks that are appended to cs->ctl_structures, and the scratch pointer
 * and its offset from the start of the mapping are recorded.
 *
 * Every call then takes two chunks from that list (no-user-data and
 * user-data), sets up the matching sm_buffer_mgmt objects and exchanges
 * the control-structure addresses with the other group members.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when a list item cannot be
 * created or the chunk list is empty, or the error reported by one of the
 * called setup routines.
 */
int base_bcol_basesmuma_setup_library_buffers(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    mca_bcol_basesmuma_component_t *cs)
{
    int rc = OMPI_SUCCESS;
    int k;
    int structs_per_chunk;
    size_t chunk_bytes, usable_bytes;
    int n_chunks;
    unsigned char *cursor;

    /* setup the control struct memory */
    if (!cs->sm_ctl_structs) {
        rc = mca_bcol_basesmuma_allocate_sm_ctl_memory(cs);
        if (OMPI_SUCCESS != rc) {
            fprintf(stderr,"In bcol_comm_query mca_bcol_basesmuma_allocate_sm_ctl_memory failed \n");
            fflush(stderr);
            return rc;
        }

        /*
         * put the memory onto the free list - we have worried about
         * alignment in the mpool allocation, and assume that the
         * ctl structures have the appropriate size to maintain alignment
         */

        /* figure out chunk size: one struct per (bank, region) pair ... */
        structs_per_chunk = cs->basesmuma_num_mem_banks *
            cs->basesmuma_num_regions_per_bank;
        /* ... plus one per bank for the structures used to recycle banks */
        structs_per_chunk += cs->basesmuma_num_mem_banks;

        chunk_bytes = structs_per_chunk *
            sizeof(mca_bcol_basesmuma_ctl_struct_t);

        /* bytes left after the gap before data_addr and the scratch area */
        usable_bytes = cs->sm_ctl_structs->map_size - (
            (char *)(cs->sm_ctl_structs->data_addr) -
            (char *)(cs->sm_ctl_structs->map_addr));
        usable_bytes -= cs->my_scratch_shared_memory_size;
        n_chunks = usable_bytes / chunk_bytes;

        /* populate the free list */
        cursor = cs->sm_ctl_structs->data_addr;
        for (k = 0; k < n_chunks; ++k) {
            list_data_t *chunk = OBJ_NEW(list_data_t);
            if (NULL == chunk) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            chunk->data = (void *)cursor;
            opal_list_append(&(cs->ctl_structures), (opal_list_item_t *)chunk);
            cursor += chunk_bytes;
        }

        /* set the scratch memory pointer and offset */
        cs->my_scratch_shared_memory = (char *)cursor;
        cs->scratch_offset_from_base_ctl_file = (size_t)
            ((char *)cursor - (char *)cs->sm_ctl_structs->map_addr);

        /* At this stage the memory is mapped and ready to use by the local
         * rank.  However, the memory of other processes has not yet been
         * mmaped into the memory of this process.
         */
    }

    /* initialize no_userdata_ctl */
    sm_bcol_module->no_userdata_ctl = (list_data_t *)
        opal_list_remove_last(&(cs->ctl_structures));
    if (NULL == sm_bcol_module->no_userdata_ctl) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rc = base_bcol_basesmuma_setup_ctl_struct(
            sm_bcol_module, cs, &(sm_bcol_module->colls_no_user_data));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* initialize userdata_ctl */
    sm_bcol_module->userdata_ctl = (list_data_t *)
        opal_list_remove_last(&(cs->ctl_structures));
    if (NULL == sm_bcol_module->userdata_ctl) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rc = base_bcol_basesmuma_setup_ctl_struct(
            sm_bcol_module, cs, &(sm_bcol_module->colls_with_user_data));
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* used for blocking recursive doubling barrier */
    sm_bcol_module->index_blocking_barrier_memory_bank = 0;

    /* gather the offsets of the control structs relative to the base
     * of the shared memory file, and fill in the table with the
     * address of all the control structures.
     */
    rc = base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs,
            &(sm_bcol_module->colls_no_user_data),
            sm_bcol_module->no_userdata_ctl);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* the status of the last exchange is the function's result */
    return base_bcol_basesmuma_exchange_ctl_params(sm_bcol_module, cs,
            &(sm_bcol_module->colls_with_user_data),
            sm_bcol_module->userdata_ctl);
}
/*
 * Class instance for list_data_t, declared with opal_list_item_t as its
 * parent class.  The two NULL arguments are presumably the constructor
 * and destructor hooks (none supplied) - confirm against opal_object.h.
 */
OBJ_CLASS_INSTANCE(list_data_t,
opal_list_item_t, NULL, NULL);