1
1

coll/ml: fix assertion failure that occurs when level 0 of the hierarchy fails to select any processes on any nodes.

Also modified basesmsocket to print debugging info only to the framework output.

cmr=v1.7.5:reviewer=jsquyres

This commit was SVN r31071.
Этот коммит содержится в:
Nathan Hjelm 2014-03-14 19:39:00 +00:00
родитель fbc5e3b773
Коммит 1911d97044
3 изменённых файлов: 24 добавлений и 41 удалений

Просмотреть файл

@ -1762,7 +1762,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
i_hier = 0; i_hier = 0;
while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){ while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){
/* /*
** obtain the list of ranks in the current level ** obtain the list of ranks in the current level
*/ */
@ -1935,12 +1934,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
&cum_number_ranks_in_all_subgroups, &cum_number_ranks_in_all_subgroups,
&num_total_subgroups, map_to_comm_ranks,i_hier); &num_total_subgroups, map_to_comm_ranks,i_hier);
/* The way initialization is currently written *all* ranks MUST appear
* in the first level (0) of the hierarchy. If any rank is not in the first
* level then the calculation of gather/scatter offsets will be wrong.
* NTH: DO NOT REMOVE this assert until this changes! */
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
if( OMPI_SUCCESS != ret ) { if( OMPI_SUCCESS != ret ) {
ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d \n",ret)); ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d \n",ret));
goto exit_ERROR; goto exit_ERROR;
@ -2058,13 +2051,21 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
goto SelectionDone; goto SelectionDone;
} }
n_procs_in = n_remain;
/* take the next element */ /* take the next element */
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli); sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli); bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
i_hier++; if (n_remain != n_procs_in) {
i_hier++;
/* The way initialization is currently written *all* ranks MUST appear
* in the first level (0) of the hierarchy. If any rank is not in the first
* level then the calculation of gather/scatter offsets will be wrong.
* NTH: DO NOT REMOVE this assert until this changes! */
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
}
n_procs_in = n_remain;
} }
SelectionDone: SelectionDone:

Просмотреть файл

@ -1,6 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -16,9 +19,11 @@
#include "mpi.h" #include "mpi.h"
#include "opal/mca/mca.h" #include "opal/mca/mca.h"
#include "ompi/mca/sbgp/sbgp.h" #include "ompi/mca/sbgp/sbgp.h"
#include "ompi/mca/sbgp/base/base.h"
#include "ompi/mca/mpool/mpool.h" #include "ompi/mca/mpool/mpool.h"
#include "ompi/request/request.h" #include "ompi/request/request.h"
#include "ompi/proc/proc.h" #include "ompi/proc/proc.h"
#include "opal/util/output.h"
BEGIN_C_DECLS BEGIN_C_DECLS
@ -29,33 +34,11 @@ BEGIN_C_DECLS
# define SPIN # define SPIN
#endif #endif
#define BASESMSOCKET_VERBOSE(level, ...) \
static inline int mca_sbgp_basesmsocket_err(const char* fmt, ...) do { \
{ OPAL_OUTPUT_VERBOSE((ompi_sbgp_base_framework.framework_output, level, \
va_list list; __VA_ARGS__)); \
int ret;
va_start(list, fmt);
ret = vfprintf(stderr, fmt, list);
va_end(list);
return ret;
}
#if OPAL_ENABLE_DEBUG
#define BASESMSOCKET_VERBOSE(level, args) \
do { \
if(10 >= level) { \
mca_sbgp_basesmsocket_err("[%s]%s[%s:%d:%s] BASESMSOCKET ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_sbgp_basesmsocket_err args; \
mca_sbgp_basesmsocket_err("\n"); \
} \
} while(0); } while(0);
#else
#define BASESMSOCKET_VERBOSE(level, args)
#endif
/** /**
* Structure to hold the basic shared memory coll component. First it holds the * Structure to hold the basic shared memory coll component. First it holds the
@ -66,7 +49,6 @@ static inline int mca_sbgp_basesmsocket_err(const char* fmt, ...)
struct mca_sbgp_basesmsocket_component_t { struct mca_sbgp_basesmsocket_component_t {
/** Base coll component */ /** Base coll component */
mca_sbgp_base_component_2_0_0_t super; mca_sbgp_base_component_2_0_0_t super;
}; };
/** /**

Просмотреть файл

@ -172,7 +172,7 @@ static int mca_sbgp_map_to_logical_socket_id(int *socket)
/* get this process' CPU binding */ /* get this process' CPU binding */
if( 0 != hwloc_get_cpubind(opal_hwloc_topology,good, 0)){ if( 0 != hwloc_get_cpubind(opal_hwloc_topology,good, 0)){
/* report some error */ /* report some error */
BASESMSOCKET_VERBOSE(10, ("The global variable opal_hwloc_topology appears not to have been initialized\n")); BASESMSOCKET_VERBOSE(10, "The global variable opal_hwloc_topology appears not to have been initialized\n");
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -292,7 +292,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
my_socket_index=-1; my_socket_index=-1;
/*debug print*/ /*debug print*/
/* */ /* */
BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank)); BASESMSOCKET_VERBOSE(10, "[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank);
/*end debug*/ /*end debug*/
goto NoLocalPeers; goto NoLocalPeers;
} else { } else {
@ -303,7 +303,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
* by the hwloc API are unique. * by the hwloc API are unique.
*/ */
if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)){ if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)){
BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group !!!\n",my_rank)); BASESMSOCKET_VERBOSE(10, "[%d] FAILED to set basesmsocket group !!!\n",my_rank);
goto NoLocalPeers; goto NoLocalPeers;
} }
@ -394,7 +394,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
ret=comm_allgather_pml(&my_socket_info, socket_info, 1, ret=comm_allgather_pml(&my_socket_info, socket_info, 1,
MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm); MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm);
if (OMPI_SUCCESS != ret ) { if (OMPI_SUCCESS != ret ) {
BASESMSOCKET_VERBOSE(10, ("comm_allgather_pml returned error %d\n",ret)); BASESMSOCKET_VERBOSE(10, "comm_allgather_pml returned error %d\n",ret);
return NULL; return NULL;
} }