Ensure that ORTE processes such as mpirun and orted never inadvertently bind themselves to cores. Rename the MCA parameter that the rank_file mapper reads for user-supplied slot lists so that it differs from the one MPI processes use to discover their binding. Add a command-line option to orterun so that a user can specify the slot list more easily (in effect, hiding the MCA parameter name).
Discussed and reviewed with Lenny and Jeff. This commit was SVN r19062.
Этот коммит содержится в:
родитель
5096512c3a
Коммит
3107545709
@ -71,6 +71,8 @@ typedef struct {
|
||||
bool display_map;
|
||||
/* balance load across nodes */
|
||||
bool loadbalance;
|
||||
/* slot list, if provided by user */
|
||||
char *slot_list;
|
||||
} orte_rmaps_base_t;
|
||||
|
||||
/**
|
||||
|
@ -127,6 +127,11 @@ int orte_rmaps_base_open(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* did the user provide a slot list? */
|
||||
param = mca_base_param_reg_string_name("rmaps", "base_slot_list",
|
||||
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
|
||||
false, false, NULL, &orte_rmaps_base.slot_list);
|
||||
|
||||
/* Should we schedule on the local node or not? */
|
||||
|
||||
mca_base_param_reg_int_name("rmaps", "base_no_schedule_local",
|
||||
|
@ -190,11 +190,8 @@ static int map_app_by_node(
|
||||
}
|
||||
/* Allocate a slot on this node */
|
||||
node = (orte_node_t*) cur_node_item;
|
||||
if ( NULL != orte_mca_rmaps_rank_file_slot_list){
|
||||
node->slot_list = (char*) malloc(64*sizeof(char));
|
||||
if ( NULL != node->slot_list ) {
|
||||
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
|
||||
}
|
||||
if ( NULL != orte_rmaps_base.slot_list ) {
|
||||
node->slot_list = strdup(orte_rmaps_base.slot_list);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
|
||||
nodes, jdata->map->oversubscribe, true))) {
|
||||
@ -300,11 +297,8 @@ static int map_app_by_slot(
|
||||
++num_alloc;
|
||||
continue;
|
||||
}
|
||||
if ( NULL != orte_mca_rmaps_rank_file_slot_list){
|
||||
node->slot_list = (char*) malloc(64*sizeof(char));
|
||||
if ( NULL != node->slot_list ) {
|
||||
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
|
||||
}
|
||||
if ( NULL != orte_rmaps_base.slot_list ) {
|
||||
node->slot_list = strdup(orte_rmaps_base.slot_list);
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
|
||||
nodes, jdata->map->oversubscribe, true))) {
|
||||
|
@ -23,11 +23,12 @@
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/ras/ras_types.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "orte/mca/rmaps/rank_file/rmaps_rank_file.h"
|
||||
#include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h"
|
||||
@ -74,8 +75,6 @@ orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = {
|
||||
*/
|
||||
static int orte_rmaps_rank_file_open(void)
|
||||
{
|
||||
int index = 0;
|
||||
|
||||
mca_rmaps_rank_file_component.priority = 0;
|
||||
|
||||
mca_base_param_reg_string(&mca_rmaps_rank_file_component.super.base_version,
|
||||
@ -86,15 +85,10 @@ static int orte_rmaps_rank_file_open(void)
|
||||
mca_rmaps_rank_file_component.priority = 100;
|
||||
}
|
||||
|
||||
index = mca_base_param_find("opal", NULL, "paffinity_base_slot_list");
|
||||
if (index >= 0) {
|
||||
if (OPAL_SUCCESS == mca_base_param_lookup_string(index, &orte_mca_rmaps_rank_file_slot_list)) {
|
||||
if (NULL != orte_mca_rmaps_rank_file_slot_list) {
|
||||
mca_rmaps_rank_file_component.priority = 100;
|
||||
}
|
||||
}
|
||||
if (NULL != orte_rmaps_base.slot_list) {
|
||||
mca_rmaps_rank_file_component.priority = 100;
|
||||
}
|
||||
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -222,6 +222,9 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
{ "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Launch n processes per node on all allocated nodes" },
|
||||
{ "rmaps", "base", "slot_list", '\0', "slot-list", "slot-list", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)" },
|
||||
{ "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Nodes are not to be oversubscribed, even if the system supports such operation"},
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user