1
1

bcol/basesmuma: fix several bugs in the basesmuma code

Found two bugs in basesmuma:

 - Release all resources when tearing down the bcol module.

 - Allways call the allreduce in the smcm code. We do not know
   beforehand whether all procs have all the files mapped.

cmr=v1.7.5:ticket=trac:4158

This commit was SVN r30623.

The following Trac tickets were found above:
  Ticket 4158 --> https://svn.open-mpi.org/trac/ompi/ticket/4158
Этот коммит содержится в:
Nathan Hjelm 2014-02-07 21:39:24 +00:00
родитель bc7cc09749
Коммит 77869c3232
2 изменённых файлов: 26 добавлений и 36 удалений

Просмотреть файл

@ -165,6 +165,9 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
sm_module->colls_no_user_data.ctl_buffs=NULL;
}
/* return control */
opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->no_userdata_ctl);
/* colls_with_user_data resrouces */
/*
*debug print */
@ -187,6 +190,9 @@ mca_bcol_basesmuma_module_destruct(mca_bcol_basesmuma_module_t *sm_module)
sm_module->shared_memory_scratch_space=NULL;
}
/* return control */
opal_list_append (&cs->ctl_structures, (opal_list_item_t *) sm_module->userdata_ctl);
#if 1
if(sm_module->scatter_kary_tree) {
for(i=0 ; i < sm_module->super.size_of_subgroup ; i++ ) {

Просмотреть файл

@ -162,7 +162,7 @@ int bcol_basesmuma_smcm_allgather_connection(
/* define local variables */
int rc, i, fd, n_files_mapped;
int rc, i, fd;
ptrdiff_t mem_offset;
ompi_proc_t *proc_temp, *my_id;
bcol_basesmuma_smcm_proc_item_t *temp;
@ -189,39 +189,6 @@ int bcol_basesmuma_smcm_allgather_connection(
my_id = ompi_proc_local();
/* check to see if we have already mapped all the files, if we have
* just need to fill in backing_files array, and we are done
*/
for (i = 0, n_files_mapped = 0; i < module->group_size; i++) {
/* get the proc info */
proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);
if (i == sm_bcol_module->super.sbgp_partner_module->my_index) {
continue;
}
OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) {
/* if the vpid/jobid/filename combination already exists in the list,
then do not map this peer's file --- because you already have */
if (proc_temp->proc_name.vpid == item_ptr->peer.vpid &&
proc_temp->proc_name.jobid == item_ptr->peer.jobid
&& (strstr(item_ptr->sm_file.file_name,base_fname)) ){
/* record file data */
backing_files[i] = item_ptr;
n_files_mapped++;
/* found it - no need to continue looking */
break;
}
}
}
/* check to see if we are done - our own files are not in this list*/
if (n_files_mapped == (module->group_size-1) ) {
return OMPI_SUCCESS;
}
/* Phase One:
gather a list of processes that will participate in the allgather - I'm
preparing this list from the sbgp-ing module that was passed into the function */
@ -267,7 +234,25 @@ int bcol_basesmuma_smcm_allgather_connection(
* the group will have the same vpid/jobid pair. ignore this previously found
* mapping if map_all was requested (NTH: not sure why exactly since we re-map
* and already mapped file) */
if (sm_bcol_module->super.sbgp_partner_module->my_index == i || (!map_all && backing_files[i])) {
if (sm_bcol_module->super.sbgp_partner_module->my_index == i) {
continue;
}
proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);
OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) {
/* if the vpid/jobid/filename combination already exists in the list,
then do not map this peer's file --- because you already have */
if (proc_temp->proc_name.vpid == item_ptr->peer.vpid &&
proc_temp->proc_name.jobid == item_ptr->peer.jobid &&
0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) {
/* record file data */
backing_files[i] = item_ptr;
break;
}
}
if (!map_all && backing_files[i]) {
continue;
}
@ -302,7 +287,6 @@ int bcol_basesmuma_smcm_allgather_connection(
fd = open(temp->sm_file.file_name, O_RDWR, 0600);
if (0 > fd) {
opal_output (0, "SMCM Allgather failed to open sm backing file %s. errno = %d\n", temp->sm_file.file_name, errno);
assert (0);
rc = OMPI_ERROR;
goto Error;
}