coll/ml: fix assertion failure that occurs when level 0 of the hierarchy
fails to select any processes on any nodes.

Also modified basesmsocket to print debugging info only to the framework output.

cmr=v1.7.5:reviewer=jsquyres

This commit was SVN r31071.
Этот коммит содержится в:
родитель
fbc5e3b773
Коммит
1911d97044
@ -1762,7 +1762,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
|
||||
i_hier = 0;
|
||||
while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){
|
||||
|
||||
/*
|
||||
** obtain the list of ranks in the current level
|
||||
*/
|
||||
@ -1935,12 +1934,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
&cum_number_ranks_in_all_subgroups,
|
||||
&num_total_subgroups, map_to_comm_ranks,i_hier);
|
||||
|
||||
/* The way initialization is currently written *all* ranks MUST appear
|
||||
* in the first level (0) of the hierarchy. If any rank is not in the first
|
||||
* level then the calculation of gather/scatter offsets will be wrong.
|
||||
* NTH: DO NOT REMOVE this assert until this changes! */
|
||||
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
|
||||
|
||||
if( OMPI_SUCCESS != ret ) {
|
||||
ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d \n",ret));
|
||||
goto exit_ERROR;
|
||||
@ -2058,13 +2051,21 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
||||
goto SelectionDone;
|
||||
}
|
||||
|
||||
n_procs_in = n_remain;
|
||||
|
||||
/* take the next element */
|
||||
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
|
||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
|
||||
|
||||
if (n_remain != n_procs_in) {
|
||||
i_hier++;
|
||||
|
||||
/* The way initialization is currently written *all* ranks MUST appear
|
||||
* in the first level (0) of the hierarchy. If any rank is not in the first
|
||||
* level then the calculation of gather/scatter offsets will be wrong.
|
||||
* NTH: DO NOT REMOVE this assert until this changes! */
|
||||
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
|
||||
}
|
||||
|
||||
n_procs_in = n_remain;
|
||||
}
|
||||
|
||||
SelectionDone:
|
||||
|
@ -1,6 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -16,9 +19,11 @@
|
||||
#include "mpi.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "ompi/mca/sbgp/sbgp.h"
|
||||
#include "ompi/mca/sbgp/base/base.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -29,33 +34,11 @@ BEGIN_C_DECLS
|
||||
# define SPIN
|
||||
#endif
|
||||
|
||||
|
||||
/**
 * printf-style helper: formats the variadic arguments according to fmt and
 * writes the result to stderr with vfprintf().
 *
 * @param fmt  printf-style format string describing the remaining arguments
 * @return     whatever vfprintf() returned
 */
static inline int mca_sbgp_basesmsocket_err(const char* fmt, ...)
|
||||
{
|
||||
va_list list;
|
||||
int ret;
|
||||
|
||||
va_start(list, fmt);
|
||||
ret = vfprintf(stderr, fmt, list);
|
||||
va_end(list);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
/*
 * Debug-build logging macro for the basesmsocket sbgp component.
 *
 * NOTE: this is a diff rendering, so the lines below interleave two versions
 * of the definition: one taking a parenthesized printf argument list
 * (level, args) that prints to stderr through mca_sbgp_basesmsocket_err(),
 * and a variadic one (level, ...) that forwards its arguments to
 * OPAL_OUTPUT_VERBOSE on ompi_sbgp_base_framework.framework_output.
 * Going by the commit message, the variadic form is presumably the new one.
 *
 * NOTE(review): the definition ends in "while(0);" -- with that trailing
 * semicolon an invocation followed by ';' expands to two statements, which
 * breaks an unbraced if/else wrapped around it. Confirm no caller relies on
 * the semicolon before dropping it.
 */
#define BASESMSOCKET_VERBOSE(level, args) \
|
||||
#define BASESMSOCKET_VERBOSE(level, ...) \
|
||||
do { \
|
||||
if(10 >= level) { \
|
||||
mca_sbgp_basesmsocket_err("[%s]%s[%s:%d:%s] BASESMSOCKET ", \
|
||||
ompi_process_info.nodename, \
|
||||
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__, __func__); \
|
||||
mca_sbgp_basesmsocket_err args; \
|
||||
mca_sbgp_basesmsocket_err("\n"); \
|
||||
} \
|
||||
OPAL_OUTPUT_VERBOSE((ompi_sbgp_base_framework.framework_output, level, \
|
||||
__VA_ARGS__)); \
|
||||
} while(0);
|
||||
#else
|
||||
/* Non-debug builds: compile the message away entirely. The parameter list
 * must be variadic, because call sites pass the format string and its
 * arguments as separate macro arguments; a fixed two-parameter definition
 * would reject any call that carries format arguments. */
#define BASESMSOCKET_VERBOSE(level, ...)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Structure to hold the basic shared memory coll component. First it holds the
|
||||
@ -66,7 +49,6 @@ static inline int mca_sbgp_basesmsocket_err(const char* fmt, ...)
|
||||
struct mca_sbgp_basesmsocket_component_t {
|
||||
/** Base sbgp component; the only member visible in this definition */
|
||||
mca_sbgp_base_component_2_0_0_t super;
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -172,7 +172,7 @@ static int mca_sbgp_map_to_logical_socket_id(int *socket)
|
||||
/* get this process' CPU binding */
|
||||
if( 0 != hwloc_get_cpubind(opal_hwloc_topology,good, 0)){
|
||||
/* report some error */
|
||||
BASESMSOCKET_VERBOSE(10, ("The global variable opal_hwloc_topology appears not to have been initialized\n"));
|
||||
BASESMSOCKET_VERBOSE(10, "The global variable opal_hwloc_topology appears not to have been initialized\n");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
@ -292,7 +292,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
|
||||
my_socket_index=-1;
|
||||
/*debug print*/
|
||||
/* */
|
||||
BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank));
|
||||
BASESMSOCKET_VERBOSE(10, "[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank);
|
||||
/*end debug*/
|
||||
goto NoLocalPeers;
|
||||
} else {
|
||||
@ -303,7 +303,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
|
||||
* by the hwloc API are unique.
|
||||
*/
|
||||
if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)){
|
||||
BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group !!!\n",my_rank));
|
||||
BASESMSOCKET_VERBOSE(10, "[%d] FAILED to set basesmsocket group !!!\n",my_rank);
|
||||
|
||||
goto NoLocalPeers;
|
||||
}
|
||||
@ -394,7 +394,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
|
||||
ret=comm_allgather_pml(&my_socket_info, socket_info, 1,
|
||||
MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm);
|
||||
if (OMPI_SUCCESS != ret ) {
|
||||
BASESMSOCKET_VERBOSE(10, ("comm_allgather_pml returned error %d\n",ret));
|
||||
BASESMSOCKET_VERBOSE(10, "comm_allgather_pml returned error %d\n",ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user