1
1
openmpi/ompi/mca/coll/hcoll/coll_hcoll_component.c
Mike Dubman b8550a55a7 HCOLL: many fixes
Adds the coll_hcoll_np MCA parameter, similar to that of the fca component (defaults to 32). Those who use hcoll should be aware that from now on communicators with fewer than 32 procs will run w/o hcoll by default. - Resolves the fallback issue in case libhcoll runs out of allowed contexts. The solution is moving hcoll_context_create from comm_enable to comm_query. In short, comm_enable should never return OMPI_ERROR in the coll component with the highest priority (hcoll). Otherwise the ompi coll_base_select will unselect the coll function pointers and module references, leaving the communicator w/o coll pointers. This will cause a failure. The same behavior can be reproduced even with tuned if one hardcodes some "return OMPI_ERROR" into its module_enable function. - Additionally, removed all the dead code under #if 0; removed unused variables (path for library, active_modules list) and classes (module list wrapper).

Fixed by Val, Reviewed by Devendar/Josh/Miked

cmr=v1.7.4:reviewer=ompi-rm1.7

This commit was SVN r30341.
2014-01-21 12:19:47 +00:00

248 строки
6.2 KiB
C

/**
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
$COPYRIGHT$
Additional copyrights may follow
$HEADER$
*/
#include "ompi_config.h"
#include <stdio.h>
#include <dlfcn.h>
#include <libgen.h>
#include "coll_hcoll.h"
#include "opal/mca/installdirs/installdirs.h"
/*
 * Public string showing the coll ompi_hcol component version number.
 * Built at compile time by concatenating a fixed prefix with OMPI_VERSION.
 */
const char *mca_coll_hcoll_component_version_string =
"Open MPI HCOL collective MCA component version " OMPI_VERSION;
/*
 * Forward declarations of the component hooks defined later in this file;
 * they are referenced by the mca_coll_hcoll_component initializer.
 */
static int hcoll_open(void);
static int hcoll_close(void);
static int hcoll_register(void);
/*
 * Output stream id used by this component. Starts at -1 and is replaced
 * by the value returned from opal_output_open() in hcoll_open().
 */
int mca_coll_hcoll_output = -1;
/*
 * The single instance of the hcoll collective component descriptor.
 * It is initialized positionally, so the order of the entries below must
 * match the field order of mca_coll_hcoll_component_t (declared elsewhere).
 */
mca_coll_hcoll_component_t mca_coll_hcoll_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
{
{
MCA_COLL_BASE_VERSION_2_0_0,
/* Component name and version */
"hcoll",
OMPI_MAJOR_VERSION,
OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION,
/* Component open and close functions */
hcoll_open,
hcoll_close,
/* NOTE(review): slot between close and register is left NULL --
presumably the component-level query hook; confirm against the
base component struct definition */
NULL,
/* Registers this component's MCA variables (see hcoll_register) */
hcoll_register
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
},
/* Initialization / querying functions */
mca_coll_hcoll_init_query,
mca_coll_hcoll_comm_query,
},
/* The three values below equal the defaults that hcoll_register()
passes to reg_int() for "priority", "verbose" and "enable" */
90, /* priority */
0, /* verbose level */
1 /* hcoll_enable */
};
/*
 * Clear the hcoll_ops member of the component descriptor by filling it
 * with zero bytes.
 *
 * Returns OMPI_SUCCESS unconditionally.
 */
int mca_coll_hcoll_get_lib(void)
{
    void *ops = &mca_coll_hcoll_component.hcoll_ops;
    const size_t ops_bytes = sizeof(mca_coll_hcoll_component.hcoll_ops);

    memset(ops, 0, ops_bytes);

    return OMPI_SUCCESS;
}
/*
 * Local flags accepted by reg_int(); individual bits may be OR-ed together.
 */
enum {
    REGINT_NEG_ONE_OK = 1 << 0, /* -1 is accepted before any other check */
    REGINT_GE_ZERO    = 1 << 1, /* value must be >= 0 */
    REGINT_GE_ONE     = 1 << 2, /* value must be >= 1 */
    REGINT_NONZERO    = 1 << 3, /* value must not be 0 */
    REGINT_MAX        = 0x88    /* kept as-is; not referenced by reg_int() */
};
/*
 * Local flags accepted by reg_string().
 */
enum {
    REGSTR_EMPTY_OK = 1 << 0, /* consulted when the value is NULL or empty */
    REGSTR_MAX      = 0x88    /* kept as-is; not referenced by reg_string() */
};
/*
 * Utility routine for string parameter registration.
 *
 * Stores default_value into *storage, registers *storage as a string MCA
 * variable of the hcoll component (scope MCA_BASE_VAR_SCOPE_READONLY,
 * info level OPAL_INFO_LVL_9) and, when deprecated_param_name is not
 * NULL, registers that name as a deprecated synonym.
 *
 * param_name             name of the variable to register
 * deprecated_param_name  deprecated synonym to register, or NULL for none
 * param_desc             help text for the variable
 * default_value          value placed in *storage before registration
 * storage                location handed to the variable system
 * flags                  bitwise OR of REGSTR_* flags
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_BAD_PARAM when the resulting value is
 * NULL or empty and REGSTR_EMPTY_OK was not requested.
 */
static int reg_string(const char* param_name,
                      const char* deprecated_param_name,
                      const char* param_desc,
                      const char* default_value, char **storage,
                      int flags)
{
    /* Not called "index" so it does not shadow the POSIX index() function */
    int var_index;

    *storage = (char *) default_value;
    var_index = mca_base_component_var_register(
        &mca_coll_hcoll_component.super.collm_version,
        param_name, param_desc, MCA_BASE_VAR_TYPE_STRING,
        NULL, 0, 0, OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY, storage);
    if (NULL != deprecated_param_name) {
        (void) mca_base_var_register_synonym(var_index,
            "ompi", "coll", "hcoll", deprecated_param_name,
            MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    /*
     * An empty value is an error only when the caller did NOT pass
     * REGSTR_EMPTY_OK.  The test used to be inverted (it rejected empty
     * values precisely when the flag was set).
     */
    if (0 == (flags & REGSTR_EMPTY_OK) &&
        (NULL == *storage || '\0' == (*storage)[0])) {
        opal_output(0, "Bad parameter value for parameter \"%s\"",
                    param_name);
        return OMPI_ERR_BAD_PARAM;
    }

    return OMPI_SUCCESS;
}
/*
 * Utility routine for integer parameter registration.
 *
 * Writes default_value to *storage, registers *storage as an integer MCA
 * variable of the hcoll component and, if deprecated_param_name is given,
 * registers it as a deprecated synonym.  The value found in *storage after
 * registration is then checked against the REGINT_* bits in flags.
 *
 * Returns OMPI_SUCCESS when the value passes every requested check, or
 * OMPI_ERR_BAD_PARAM (after logging through opal_output) otherwise.
 */
static int reg_int(const char* param_name,
                   const char* deprecated_param_name,
                   const char* param_desc,
                   int default_value, int *storage, int flags)
{
    int var_index;
    int value;
    int rejected = 0;

    *storage = default_value;
    var_index = mca_base_component_var_register(
        &mca_coll_hcoll_component.super.collm_version,
        param_name, param_desc, MCA_BASE_VAR_TYPE_INT,
        NULL, 0, 0, OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY, storage);

    if (deprecated_param_name != NULL) {
        (void) mca_base_var_register_synonym(var_index,
            "ompi", "coll", "hcoll", deprecated_param_name,
            MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    /* Validate whatever ended up in the backing store after registration */
    value = *storage;

    /* -1 short-circuits every other check when explicitly allowed */
    if ((flags & REGINT_NEG_ONE_OK) != 0 && value == -1) {
        return OMPI_SUCCESS;
    }

    if ((flags & REGINT_GE_ZERO) != 0 && value < 0) {
        rejected = 1;
    } else if ((flags & REGINT_GE_ONE) != 0 && value < 1) {
        rejected = 1;
    } else if ((flags & REGINT_NONZERO) != 0 && value == 0) {
        rejected = 1;
    }

    if (!rejected) {
        return OMPI_SUCCESS;
    }

    opal_output(0, "Bad parameter value for parameter \"%s\"",
                param_name);
    return OMPI_ERR_BAD_PARAM;
}
/*
 * Component register hook: registers the hcoll MCA variables
 * ("priority", "verbose", "enable", "np", "datatype_fallback") through
 * reg_int().
 *
 * Every registration is attempted even if an earlier one failed.
 * Returns OMPI_SUCCESS when all succeeded, otherwise the status of the
 * last registration that failed.
 */
static int hcoll_register(void)
{
    int ret, tmp;

    ret = OMPI_SUCCESS;

/* Run one registration and remember its status if it is a failure */
#define CHECK(expr) do {                        \
        tmp = (expr);                           \
        if (OMPI_SUCCESS != tmp) ret = tmp;     \
    } while (0)

    CHECK(reg_int("priority", NULL,
                  "Priority of the hcol coll component",
                  90,
                  &mca_coll_hcoll_component.hcoll_priority,
                  0));

    CHECK(reg_int("verbose", NULL,
                  "Verbose level of the hcol coll component",
                  0,
                  &mca_coll_hcoll_component.hcoll_verbose,
                  0));

    CHECK(reg_int("enable", NULL,
                  "[1|0|] Enable/Disable HCOL",
                  1 /*enable by default*/,
                  &mca_coll_hcoll_component.hcoll_enable,
                  0));

    /*
     * NOTE(review): the help text used to advertise "default: 32" while
     * the value actually registered is 2.  The text now states the real
     * default so the runtime behaviour is unchanged; confirm which of the
     * two values was intended.
     */
    CHECK(reg_int("np", NULL,
                  "Minimal number of processes in the communicator"
                  " for the corresponding hcoll context to be created (default: 2)",
                  2 /*minimal communicator size*/,
                  &mca_coll_hcoll_component.hcoll_np,
                  0));

    CHECK(reg_int("datatype_fallback", NULL,
                  "[1|0|] Enable/Disable user defined dattypes fallback",
                  1 /*enable by default*/,
                  &mca_coll_hcoll_component.hcoll_datatype_fallback,
                  0));

/* Keep the helper macro local to this function */
#undef CHECK

    return ret;
}
/*
 * Component open hook.
 *
 * Opens the component's output stream, applies the configured verbosity
 * (hcoll_verbose) to it, calls hcoll_rte_fns_setup() and marks libhcoll
 * as not yet initialized.
 *
 * Returns OMPI_SUCCESS unconditionally.
 */
static int hcoll_open(void)
{
    mca_coll_hcoll_component_t *cm = &mca_coll_hcoll_component;

    mca_coll_hcoll_output = opal_output_open(NULL);
    opal_output_set_verbosity(mca_coll_hcoll_output, cm->hcoll_verbose);

    hcoll_rte_fns_setup();

    /* hcoll_close() skips finalization while this flag is false */
    cm->libhcoll_initialized = false;

    return OMPI_SUCCESS;
}
/*
 * Component close hook.
 *
 * Does nothing when libhcoll was never initialized.  Otherwise finalizes
 * the library, unregisters mca_coll_hcoll_progress from opal_progress and
 * reports whether finalization succeeded.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if hcoll_finalize() did not return
 * HCOLL_SUCCESS.
 */
static int hcoll_close(void)
{
    mca_coll_hcoll_component_t *cm = &mca_coll_hcoll_component;
    int finalize_rc;

    if (!cm->libhcoll_initialized) {
        return OMPI_SUCCESS;
    }

    HCOL_VERBOSE(5,"HCOLL FINALIZE");
    finalize_rc = hcoll_finalize();

    /* The progress callback is removed regardless of the finalize result */
    opal_progress_unregister(mca_coll_hcoll_progress);

    if (HCOLL_SUCCESS == finalize_rc) {
        return OMPI_SUCCESS;
    }

    HCOL_VERBOSE(1,"Hcol library finalize failed");
    return OMPI_ERROR;
}