1
1

Ensure that ORTE processes such as mpirun and orted never inadvertently bind themselves to cores. Rename the MCA parameter that the rank_file mapper reads for user-supplied slot lists so that it differs from the one MPI processes use to discover their binding. Add a command-line option to orterun so that users can specify the slot list more easily (in effect, hiding the MCA parameter name).

Discussed and reviewed with Lenny and Jeff.

This commit was SVN r19062.
Этот коммит содержится в:
Ralph Castain 2008-07-28 14:18:36 +00:00
родитель 5096512c3a
Коммит 3107545709
5 изменённых файлов: 19 добавлений и 21 удалений

Просмотреть файл

@ -71,6 +71,8 @@ typedef struct {
bool display_map;
/* balance load across nodes */
bool loadbalance;
/* slot list, if provided by user */
char *slot_list;
} orte_rmaps_base_t;
/**

Просмотреть файл

@ -127,6 +127,11 @@ int orte_rmaps_base_open(void)
}
}
/* did the user provide a slot list? */
param = mca_base_param_reg_string_name("rmaps", "base_slot_list",
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files) [default=NULL]",
false, false, NULL, &orte_rmaps_base.slot_list);
/* Should we schedule on the local node or not? */
mca_base_param_reg_int_name("rmaps", "base_no_schedule_local",

Просмотреть файл

@ -190,11 +190,8 @@ static int map_app_by_node(
}
/* Allocate a slot on this node */
node = (orte_node_t*) cur_node_item;
if ( NULL != orte_mca_rmaps_rank_file_slot_list){
node->slot_list = (char*) malloc(64*sizeof(char));
if ( NULL != node->slot_list ) {
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
}
if ( NULL != orte_rmaps_base.slot_list ) {
node->slot_list = strdup(orte_rmaps_base.slot_list);
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
nodes, jdata->map->oversubscribe, true))) {
@ -300,11 +297,8 @@ static int map_app_by_slot(
++num_alloc;
continue;
}
if ( NULL != orte_mca_rmaps_rank_file_slot_list){
node->slot_list = (char*) malloc(64*sizeof(char));
if ( NULL != node->slot_list ) {
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
}
if ( NULL != orte_rmaps_base.slot_list ) {
node->slot_list = strdup(orte_rmaps_base.slot_list);
}
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
nodes, jdata->map->oversubscribe, true))) {

Просмотреть файл

@ -23,11 +23,12 @@
#include "orte/runtime/orte_globals.h"
#include "orte/mca/ras/ras_types.h"
#include "orte/util/show_help.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/util/show_help.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/rank_file/rmaps_rank_file.h"
#include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h"
@ -74,8 +75,6 @@ orte_rmaps_rank_file_component_t mca_rmaps_rank_file_component = {
*/
static int orte_rmaps_rank_file_open(void)
{
int index = 0;
mca_rmaps_rank_file_component.priority = 0;
mca_base_param_reg_string(&mca_rmaps_rank_file_component.super.base_version,
@ -86,15 +85,10 @@ static int orte_rmaps_rank_file_open(void)
mca_rmaps_rank_file_component.priority = 100;
}
index = mca_base_param_find("opal", NULL, "paffinity_base_slot_list");
if (index >= 0) {
if (OPAL_SUCCESS == mca_base_param_lookup_string(index, &orte_mca_rmaps_rank_file_slot_list)) {
if (NULL != orte_mca_rmaps_rank_file_slot_list) {
mca_rmaps_rank_file_component.priority = 100;
}
}
if (NULL != orte_rmaps_base.slot_list) {
mca_rmaps_rank_file_component.priority = 100;
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -222,6 +222,9 @@ static opal_cmd_line_init_t cmd_line_init[] = {
{ "rmaps", "base", "n_pernode", '\0', "npernode", "npernode", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Launch n processes per node on all allocated nodes" },
{ "rmaps", "base", "slot_list", '\0', "slot-list", "slot-list", 1,
NULL, OPAL_CMD_LINE_TYPE_STRING,
"List of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)" },
{ "rmaps", "base", "no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Nodes are not to be oversubscribed, even if the system supports such operation"},