New flag for MCA parameters that allows them to behave as if they had a default value
of "unset". mtl/psm2: Update some shadow MCA parameters to use the default "unset". mtl/psm2: Add new shadow parameter to allow specifying the service level. Signed-off-by: Matias A Cabral <matias.a.cabral@intel.com>
Этот коммит содержится в:
родитель
97d0469719
Коммит
1fad59465f
@ -92,29 +92,33 @@ struct ompi_mtl_psm2_shadow_variable {
|
||||
mca_base_var_info_lvl_t info_level;
|
||||
const char *mca_name;
|
||||
const char *description;
|
||||
mca_base_var_flag_t flags;
|
||||
};
|
||||
|
||||
struct ompi_mtl_psm2_shadow_variable ompi_mtl_psm2_shadow_variables[] = {
|
||||
{MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_devices, {.stringval = "self,shm,hfi"}, "PSM2_DEVICES", OPAL_INFO_LVL_3,
|
||||
"devices", "Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi)"},
|
||||
"devices",
|
||||
"Comma-delimited list of PSM2 devices. Valid values: self, shm, hfi (default: self,shm,hfi. Reduced to self,shm in single node jobs)",0},
|
||||
{MCA_BASE_VAR_TYPE_STRING, &ompi_mtl_psm2.psm2_memory, {.stringval = "normal"}, "PSM2_MEMORY", OPAL_INFO_LVL_9,
|
||||
"memory_model", "PSM2 memory usage mode (default: normal)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_sendreqs_max, {.ulval = 1048576}, "PSM2_MQ_SENDREQS_MAX", OPAL_INFO_LVL_3,
|
||||
"mq_sendreqs_max", "PSM2 maximum number of isend requests in flight (default: 1M)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_recvreqs_max, {.ulval = 1048576}, "PSM2_MQ_RECVREQS_MAX", OPAL_INFO_LVL_3,
|
||||
"mq_recvreqs_max", "PSM2 maximum number of irecv requests in flight (default: 1M)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_hfi_threshold, {.ulval = 64000}, "PSM2_MQ_RNDV_HFI_THRESH", OPAL_INFO_LVL_3,
|
||||
"hfi_eager_limit", "PSM2 eager to rendezvous threshold (default: 64000)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_shm_threshold, {.ulval = 16000}, "PSM2_MQ_RNDV_SHM_THRESH", OPAL_INFO_LVL_3,
|
||||
"shm_eager_limit", "PSM2 shared memory eager to rendezvous threshold (default: 16000)"},
|
||||
"memory_model", "PSM2 memory usage mode. Valid values: min, normal, large (default: normal)", 0},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_sendreqs_max, {.ulval = 0}, "PSM2_MQ_SENDREQS_MAX", OPAL_INFO_LVL_3,
|
||||
"mq_sendreqs_max", "PSM2 maximum number of isend requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_recvreqs_max, {.ulval = 0}, "PSM2_MQ_RECVREQS_MAX", OPAL_INFO_LVL_3,
|
||||
"mq_recvreqs_max", "PSM2 maximum number of irecv requests in flight (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_hfi_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_HFI_THRESH", OPAL_INFO_LVL_3,
|
||||
"hfi_eager_limit", "PSM2 eager to rendezvous threshold (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_mq_rndv_shm_threshold, {.ulval = 0}, "PSM2_MQ_RNDV_SHM_THRESH", OPAL_INFO_LVL_3,
|
||||
"shm_eager_limit", "PSM2 shared memory eager to rendezvous threshold (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
|
||||
{MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_recvthread, {.boolval = true}, "PSM2_RCVTHREAD", OPAL_INFO_LVL_3,
|
||||
"use_receive_thread", "Use PSM2 progress thread (default: true)"},
|
||||
{MCA_BASE_VAR_TYPE_BOOL, &ompi_mtl_psm2.psm2_shared_contexts, {.boolval = true}, "PSM2_SHAREDCONTEXTS", OPAL_INFO_LVL_6,
|
||||
"use_shared_contexts", "Share PSM contexts between MPI processes (default: true)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_shared_contexts_max, {.ulval = 8}, "PSM2_SHAREDCONTEXTS_MAX", OPAL_INFO_LVL_9,
|
||||
"max_shared_contexts", "Maximum number of contexts available on a node (default: 8, max: 8)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_max_contexts_per_job, {.ulval = 0}, "PSM2_MAX_CONTEXTS_PER_JOB", OPAL_INFO_LVL_9,
|
||||
"max_contexts_per_job", "Maximum number of contexts available on a node (default: unset, let libpsm2 use its default)", MCA_BASE_VAR_FLAG_DEF_UNSET},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_tracemask, {.ulval = 1}, "PSM2_TRACEMASK", OPAL_INFO_LVL_9,
|
||||
"trace_mask", "PSM2 tracemask value. See PSM2 documentation for accepted values (default: 1)"},
|
||||
"trace_mask", "PSM2 tracemask value. See PSM2 documentation for accepted values in 0x (default: 1)"},
|
||||
{MCA_BASE_VAR_TYPE_UNSIGNED_LONG, &ompi_mtl_psm2.psm2_opa_sl, {.ulval = 0}, "HFI_SL", OPAL_INFO_LVL_9,
|
||||
"opa_service_level", "HFI Service Level (default: unset, let libpsm2 use its defaults)", MCA_BASE_VAR_FLAG_DEF_UNSET},
|
||||
{-1},
|
||||
};
|
||||
|
||||
@ -123,10 +127,27 @@ static void ompi_mtl_psm2_set_shadow_env (struct ompi_mtl_psm2_shadow_variable *
|
||||
mca_base_var_storage_t *storage = variable->storage;
|
||||
char *env_value;
|
||||
int ret = 0;
|
||||
int var_index = 0;
|
||||
const mca_base_var_t *mca_base_var;
|
||||
|
||||
var_index = mca_base_var_find("ompi", "mtl", "psm2", variable->mca_name);
|
||||
ret = mca_base_var_get (var_index,&mca_base_var);
|
||||
/* Something is fundamentally broken if registered variables are
|
||||
* not found */
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
fprintf (stderr, "ERROR setting PSM2 environment variable: %s\n", variable->env_name);
|
||||
return;
|
||||
}
|
||||
|
||||
/** Skip setting variables for which the default behavior is "unset" */
|
||||
if ((mca_base_var->mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET) &&
|
||||
(MCA_BASE_VAR_SOURCE_DEFAULT == mca_base_var->mbv_source)){
|
||||
return ;
|
||||
}
|
||||
|
||||
switch (variable->variable_type) {
|
||||
case MCA_BASE_VAR_TYPE_BOOL:
|
||||
ret = asprintf (&env_value, "%s=%s", variable->env_name, storage->boolval ? "YES" : "NO");
|
||||
ret = asprintf (&env_value, "%s=%d", variable->env_name, storage->boolval ? 1 : 0);
|
||||
break;
|
||||
case MCA_BASE_VAR_TYPE_UNSIGNED_LONG:
|
||||
if (0 == strcmp (variable->env_name, "PSM2_TRACEMASK")) {
|
||||
@ -182,7 +203,7 @@ static void ompi_mtl_psm2_register_shadow_env (struct ompi_mtl_psm2_shadow_varia
|
||||
}
|
||||
|
||||
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version, variable->mca_name, variable->description,
|
||||
variable->variable_type, NULL, 0, 0, variable->info_level, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
variable->variable_type, NULL, 0, variable->flags, variable->info_level, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
variable->storage);
|
||||
}
|
||||
|
||||
@ -221,8 +242,10 @@ ompi_mtl_psm2_component_register(void)
|
||||
(void) get_num_total_procs(&num_total_procs);
|
||||
|
||||
/* set priority high enough to beat ob1's default (also set higher than psm) */
|
||||
if (num_local_procs == num_total_procs) {
|
||||
/* disable hfi if all processes are local */
|
||||
if ((num_local_procs == num_total_procs) && (1 < num_total_procs)) {
|
||||
/* Disable hfi if all processes are local. However, if running only one
|
||||
* process assume it is ompi_info or this is most likely going to spawn, for
|
||||
* which all PSM2 devices are needed */
|
||||
setenv("PSM2_DEVICES", "self,shm", 0);
|
||||
/* ob1 is much faster than psm2 with shared memory */
|
||||
param_priority = 10;
|
||||
|
@ -56,10 +56,11 @@ struct mca_mtl_psm2_module_t {
|
||||
unsigned long psm2_mq_recvreqs_max;
|
||||
unsigned long psm2_mq_rndv_hfi_threshold;
|
||||
unsigned long psm2_mq_rndv_shm_threshold;
|
||||
unsigned long psm2_shared_contexts_max;
|
||||
unsigned long psm2_max_contexts_per_job;
|
||||
unsigned long psm2_tracemask;
|
||||
bool psm2_recvthread;
|
||||
bool psm2_shared_contexts;
|
||||
unsigned long psm2_opa_sl;
|
||||
};
|
||||
|
||||
typedef struct mca_mtl_psm2_module_t mca_mtl_psm2_module_t;
|
||||
|
@ -1896,6 +1896,14 @@ static int var_value_string (mca_base_var_t *var, char **value_string)
|
||||
|
||||
assert (MCA_BASE_VAR_TYPE_MAX > var->mbv_type);
|
||||
|
||||
/** Parameters with MCA_BASE_VAR_FLAG_DEF_UNSET flag should be shown
|
||||
* as "unset" by default. */
|
||||
if ((var->mbv_flags & MCA_BASE_VAR_FLAG_DEF_UNSET) &&
|
||||
(MCA_BASE_VAR_SOURCE_DEFAULT == var->mbv_source)){
|
||||
asprintf (value_string, "%s", "unset");
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
ret = mca_base_var_get_value(var->mbv_index, &value, NULL, NULL);
|
||||
if (OPAL_SUCCESS != ret || NULL == value) {
|
||||
return ret;
|
||||
|
@ -191,7 +191,10 @@ typedef enum {
|
||||
manually when you register a variable with
|
||||
mca_base_var_register(). Analogous to the
|
||||
MCA_BASE_PVAR_FLAG_IWG. */
|
||||
MCA_BASE_VAR_FLAG_DWG = 0x0040
|
||||
MCA_BASE_VAR_FLAG_DWG = 0x0040,
|
||||
/** Variable has a default value of "unset". Meaning to only
|
||||
* be set when the user explicitly asks for it */
|
||||
MCA_BASE_VAR_FLAG_DEF_UNSET = 0x0080,
|
||||
} mca_base_var_flag_t;
|
||||
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user