coll/ml: fix an assertion failure that occurs when level 0 of the hierarchy
fails to select any processes on any nodes. Also modify basesmsocket to print debugging info only to the framework output. cmr=v1.7.5:reviewer=jsquyres. This commit was SVN r31071.
Этот коммит содержится в:
родитель
fbc5e3b773
Коммит
1911d97044
@ -1762,7 +1762,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|||||||
|
|
||||||
i_hier = 0;
|
i_hier = 0;
|
||||||
while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){
|
while ((opal_list_item_t *) sbgp_cli != opal_list_get_end(&mca_sbgp_base_components_in_use)){
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** obtain the list of ranks in the current level
|
** obtain the list of ranks in the current level
|
||||||
*/
|
*/
|
||||||
@ -1935,12 +1934,6 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|||||||
&cum_number_ranks_in_all_subgroups,
|
&cum_number_ranks_in_all_subgroups,
|
||||||
&num_total_subgroups, map_to_comm_ranks,i_hier);
|
&num_total_subgroups, map_to_comm_ranks,i_hier);
|
||||||
|
|
||||||
/* The way initialization is currently written *all* ranks MUST appear
|
|
||||||
* in the first level (0) of the hierarchy. If any rank is not in the first
|
|
||||||
* level then the calculation of gather/scatter offsets will be wrong.
|
|
||||||
* NTH: DO NOT REMOVE this assert until this changes! */
|
|
||||||
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
|
|
||||||
|
|
||||||
if( OMPI_SUCCESS != ret ) {
|
if( OMPI_SUCCESS != ret ) {
|
||||||
ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d \n",ret));
|
ML_VERBOSE(10, (" Error: get_new_subgroup_data returned %d \n",ret));
|
||||||
goto exit_ERROR;
|
goto exit_ERROR;
|
||||||
@ -2058,13 +2051,21 @@ static int mca_coll_ml_tree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|||||||
goto SelectionDone;
|
goto SelectionDone;
|
||||||
}
|
}
|
||||||
|
|
||||||
n_procs_in = n_remain;
|
|
||||||
|
|
||||||
/* take the next element */
|
/* take the next element */
|
||||||
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
|
sbgp_cli = (sbgp_base_component_keyval_t *) opal_list_get_next((opal_list_item_t *) sbgp_cli);
|
||||||
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
|
bcol_cli = (mca_base_component_list_item_t *) opal_list_get_next((opal_list_item_t *) bcol_cli);
|
||||||
|
|
||||||
i_hier++;
|
if (n_remain != n_procs_in) {
|
||||||
|
i_hier++;
|
||||||
|
|
||||||
|
/* The way initialization is currently written *all* ranks MUST appear
|
||||||
|
* in the first level (0) of the hierarchy. If any rank is not in the first
|
||||||
|
* level then the calculation of gather/scatter offsets will be wrong.
|
||||||
|
* NTH: DO NOT REMOVE this assert until this changes! */
|
||||||
|
assert (i_hier || cum_number_ranks_in_all_subgroups == n_procs_in);
|
||||||
|
}
|
||||||
|
|
||||||
|
n_procs_in = n_remain;
|
||||||
}
|
}
|
||||||
|
|
||||||
SelectionDone:
|
SelectionDone:
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
||||||
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
||||||
|
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||||
|
* reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -16,9 +19,11 @@
|
|||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
#include "ompi/mca/sbgp/sbgp.h"
|
#include "ompi/mca/sbgp/sbgp.h"
|
||||||
|
#include "ompi/mca/sbgp/base/base.h"
|
||||||
#include "ompi/mca/mpool/mpool.h"
|
#include "ompi/mca/mpool/mpool.h"
|
||||||
#include "ompi/request/request.h"
|
#include "ompi/request/request.h"
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
@ -29,33 +34,11 @@ BEGIN_C_DECLS
|
|||||||
# define SPIN
|
# define SPIN
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define BASESMSOCKET_VERBOSE(level, ...) \
|
||||||
static inline int mca_sbgp_basesmsocket_err(const char* fmt, ...)
|
do { \
|
||||||
{
|
OPAL_OUTPUT_VERBOSE((ompi_sbgp_base_framework.framework_output, level, \
|
||||||
va_list list;
|
__VA_ARGS__)); \
|
||||||
int ret;
|
|
||||||
|
|
||||||
va_start(list, fmt);
|
|
||||||
ret = vfprintf(stderr, fmt, list);
|
|
||||||
va_end(list);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_DEBUG
|
|
||||||
#define BASESMSOCKET_VERBOSE(level, args) \
|
|
||||||
do { \
|
|
||||||
if(10 >= level) { \
|
|
||||||
mca_sbgp_basesmsocket_err("[%s]%s[%s:%d:%s] BASESMSOCKET ", \
|
|
||||||
ompi_process_info.nodename, \
|
|
||||||
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
|
|
||||||
__FILE__, __LINE__, __func__); \
|
|
||||||
mca_sbgp_basesmsocket_err args; \
|
|
||||||
mca_sbgp_basesmsocket_err("\n"); \
|
|
||||||
} \
|
|
||||||
} while(0);
|
} while(0);
|
||||||
#else
|
|
||||||
#define BASESMSOCKET_VERBOSE(level, args)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Structure to hold the basic shared memory coll component. First it holds the
|
* Structure to hold the basic shared memory coll component. First it holds the
|
||||||
@ -66,7 +49,6 @@ static inline int mca_sbgp_basesmsocket_err(const char* fmt, ...)
|
|||||||
struct mca_sbgp_basesmsocket_component_t {
|
struct mca_sbgp_basesmsocket_component_t {
|
||||||
/** Base coll component */
|
/** Base coll component */
|
||||||
mca_sbgp_base_component_2_0_0_t super;
|
mca_sbgp_base_component_2_0_0_t super;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -172,7 +172,7 @@ static int mca_sbgp_map_to_logical_socket_id(int *socket)
|
|||||||
/* get this process' CPU binding */
|
/* get this process' CPU binding */
|
||||||
if( 0 != hwloc_get_cpubind(opal_hwloc_topology,good, 0)){
|
if( 0 != hwloc_get_cpubind(opal_hwloc_topology,good, 0)){
|
||||||
/* report some error */
|
/* report some error */
|
||||||
BASESMSOCKET_VERBOSE(10, ("The global variable opal_hwloc_topology appears not to have been initialized\n"));
|
BASESMSOCKET_VERBOSE(10, "The global variable opal_hwloc_topology appears not to have been initialized\n");
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -292,7 +292,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
|
|||||||
my_socket_index=-1;
|
my_socket_index=-1;
|
||||||
/*debug print*/
|
/*debug print*/
|
||||||
/* */
|
/* */
|
||||||
BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank));
|
BASESMSOCKET_VERBOSE(10, "[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank);
|
||||||
/*end debug*/
|
/*end debug*/
|
||||||
goto NoLocalPeers;
|
goto NoLocalPeers;
|
||||||
} else {
|
} else {
|
||||||
@ -303,7 +303,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
|
|||||||
* by the hwloc API are unique.
|
* by the hwloc API are unique.
|
||||||
*/
|
*/
|
||||||
if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)){
|
if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)){
|
||||||
BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group !!!\n",my_rank));
|
BASESMSOCKET_VERBOSE(10, "[%d] FAILED to set basesmsocket group !!!\n",my_rank);
|
||||||
|
|
||||||
goto NoLocalPeers;
|
goto NoLocalPeers;
|
||||||
}
|
}
|
||||||
@ -394,7 +394,7 @@ static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_pr
|
|||||||
ret=comm_allgather_pml(&my_socket_info, socket_info, 1,
|
ret=comm_allgather_pml(&my_socket_info, socket_info, 1,
|
||||||
MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm);
|
MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm);
|
||||||
if (OMPI_SUCCESS != ret ) {
|
if (OMPI_SUCCESS != ret ) {
|
||||||
BASESMSOCKET_VERBOSE(10, ("comm_allgather_pml returned error %d\n",ret));
|
BASESMSOCKET_VERBOSE(10, "comm_allgather_pml returned error %d\n",ret);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user