1
1

545 строки
20 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/opal_sos.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "rmaps_lb.h"
static int switchyard(orte_job_t *jdata);
orte_rmaps_base_module_t orte_rmaps_load_balance_module = {
switchyard
};
/* Local functions */
static int npernode(orte_job_t *jdata);
static int nperboard(orte_job_t *jdata);
static int npersocket(orte_job_t *jdata);
static int loadbalance(orte_job_t *jdata);
static int switchyard(orte_job_t *jdata)
{
int rc;
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
mca_base_component_t *c = &mca_rmaps_load_balance_component.super.base_version;
/* only handle initial launch of loadbalanced
* or NPERxxx jobs - allow restarting of failed apps
*/
if (ORTE_JOB_STATE_INIT != jdata->state) {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: job %s not in initial state - loadbalance cannot map",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
if (NULL != jdata->map->req_mapper &&
0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
/* a mapper has been specified, and it isn't me */
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lb: job %s not using loadbalance mapper",
ORTE_JOBID_PRINT(jdata->jobid));
return ORTE_ERR_TAKE_NEXT_OPTION;
}
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:loadbalance: mapping job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* flag that I did the mapping */
if (NULL != jdata->map->last_mapper) {
free(jdata->map->last_mapper);
}
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
jdata->map->last_mapper = strdup(c->mca_component_name);
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
if (0 < mca_rmaps_load_balance_component.npernode ||
0 < jdata->map->npernode) {
rc = npernode(jdata);
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
} else if (0 < mca_rmaps_load_balance_component.nperboard ||
0 < jdata->map->nperboard) {
rc = nperboard(jdata);
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
} else if (0 < mca_rmaps_load_balance_component.npersocket ||
0 < jdata->map->npersocket) {
rc = npersocket(jdata);
} else {
rc = loadbalance(jdata);
}
if (ORTE_SUCCESS != rc) {
return rc;
}
/* compute and save local ranks */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* define the daemons that we will use for this job */
if (ORTE_SUCCESS != (rc = orte_rmaps_base_define_daemons(jdata))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* place specified #procs on each node, up to the specified total
* number of procs (if one was given).
*/
static int npernode(orte_job_t *jdata)
{
orte_app_context_t *app;
int j, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int np, nprocs;
int num_nodes;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* can only have one app_context here */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
num_nodes = opal_list_get_size(&node_list);
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
for (j=0; j < mca_rmaps_load_balance_component.npernode && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
j < mca_rmaps_load_balance_component.npernode-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
nprocs++;
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of nodes", num_nodes,
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
"npernode", mca_rmaps_load_balance_component.npernode);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
static int nperboard(orte_job_t *jdata)
{
orte_app_context_t *app;
int j, k, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int np, nprocs;
int num_boards=orte_default_num_boards;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* can only have one app_context here */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
num_boards = node->boards;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && nprocs < np; k++) {
/* put the specified number of procs on each board */
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
for (j=0; j < mca_rmaps_load_balance_component.nperboard && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
j < mca_rmaps_load_balance_component.nperboard-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
nprocs++;
}
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of boards", num_boards,
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
"nperboard", mca_rmaps_load_balance_component.nperboard);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
static int npersocket(orte_job_t *jdata)
{
orte_app_context_t *app;
int j, k, n, rc=ORTE_SUCCESS;
opal_list_t node_list;
opal_list_item_t *item;
orte_std_cntr_t num_slots;
orte_node_t *node;
int np, nprocs;
int num_sockets=orte_default_num_sockets_per_board;
/* setup the node list */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* can only have one app_context here */
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* use the number of procs if one was given */
if (0 < app->num_procs) {
np = app->num_procs;
} else {
np = INT_MAX;
}
/* for each app_context, we have to get the list of nodes that it can
* use since that can now be modified with a hostfile and/or -host
* option
*/
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
/* loop through the list of nodes */
nprocs = 0;
while (NULL != (item = opal_list_remove_first(&node_list))) {
node = (orte_node_t*)item;
num_sockets = node->sockets_per_board;
/* loop through the number of boards in this node */
for (k=0; k < node->boards && nprocs < np; k++) {
/* loop through the number of sockets/board */
for (n=0; n < node->sockets_per_board && nprocs < np; n++) {
/* put the specified number of procs on each socket */
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
for (j=0; j < mca_rmaps_load_balance_component.npersocket && nprocs < np; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
j < mca_rmaps_load_balance_component.npersocket-1) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(node);
goto error;
}
}
/* track the number of procs */
nprocs++;
}
}
}
OBJ_RELEASE(node);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of sockets", num_sockets,
George raised some valid concerns about the extensibility of the revised rmaps framework. Address those by: 1. removing the enum of mapper values 2. change the req_mapper and last_mapper fields to char* so they can hold the component name instead of a mapper flag 3. revise the selection logic in the mapper components to reflect the change. Components now look for their name in the req_mapper field, or to see if other criteria (e.g., npernode) are set that mandate their doing the mapping Several MCA params resided in the rmaps base for historical reasons - they have been in the base since at least the original 1.2 release (and perhaps earlier). However, George correctly pointed out that they really should reside in their respective components. Accordingly, move them to the components, but register synonyms to the old names to avoid breaking backward compatibility. These revisions retain the current functionality of allowing comm_spawn'd jobs to use different mappers than the original job, and for the errmgr to utilize the resilient mapper to recover processes regardless of how they were originally mapped. Given the large number of possible combinations, I am sure that someone will find a corner-case combination of values and selection criteria that cause either no mapper to be selected, or one other than the intended to be used. No one can test all the ways people will use this system, so I expect debugging to continue for awhile. The ability of comm_spawn'd jobs to exploit this functionality relies on changes to the orte_dpm component - this will be committed separately. This commit was SVN r24520.
2011-03-12 05:30:09 +00:00
"npersocket", mca_rmaps_load_balance_component.npersocket);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
error:
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}
/*
* Create a load balanced mapping for the job by assigning a constant #procs/node, with
* leftovers being spread one/node starting from the first node.
*/
static int loadbalance(orte_job_t *jdata)
{
orte_app_context_t *app;
int i, j;
opal_list_t node_list;
orte_std_cntr_t num_nodes, num_slots;
int rc=ORTE_SUCCESS, np, nprocs;
int ppn = 0;
opal_list_item_t *item, *start;
orte_node_t *node;
/* setup */
OBJ_CONSTRUCT(&node_list, opal_list_t);
/* compute total #procs we are going to add and the total number of nodes available */
for(i=0; i < jdata->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
continue;
}
/* get the nodes and #slots available for this app_context */
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
jdata->map->policy))) {
ORTE_ERROR_LOG(rc);
goto error;
}
if (0 < app->num_procs) {
np = app->num_procs;
} else {
/* set the num_procs to the #slots */
np = num_slots;
}
num_nodes = opal_list_get_size(&node_list);
/* compute the base ppn */
ppn = np / num_nodes;
/* if a bookmark exists from some prior mapping, set us to start there */
start = orte_rmaps_base_get_starting_point(&node_list, jdata);
/* loop through the list of nodes until we either assign all the procs
* or return to the starting point
*/
item = start;
nprocs = 0;
do {
node = (orte_node_t*)item;
/* put the specified number of procs on each node */
for (j=0; j < ppn; j++) {
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
* more procs to place, then that is an error
*/
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc) ||
j < ppn-1) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
nprocs++;
}
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
}
else {
item = opal_list_get_next(item);
}
} while (item != start && nprocs < np);
/* save the bookmark */
jdata->bookmark = node;
/* if we haven't assigned all the procs, then loop through the list
* again, assigning 1 per node until all are assigned
*/
item = start;
while (nprocs < np) {
node = (orte_node_t*)item;
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
jdata->map->cpus_per_rank, app->idx,
&node_list, jdata->map->oversubscribe,
false, NULL))) {
/* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
if (ORTE_ERR_NODE_FULLY_USED != OPAL_SOS_GET_ERROR_CODE(rc)) {
ORTE_ERROR_LOG(rc);
goto error;
}
}
nprocs++;
/* move to next node */
if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
item = opal_list_get_first(&node_list);
}
else {
item = opal_list_get_next(item);
}
}
/* save the bookmark */
jdata->bookmark = node;
/* cleanup */
while (NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
/* if the user requested a specific number of procs and
* the total number of procs we were able to assign
* doesn't equal the number requested, then we have a
* problem
*/
if (0 < app->num_procs && nprocs < app->num_procs) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
app->app, app->num_procs,
"number of slots", nprocs,
"number of nodes", num_nodes);
return ORTE_ERR_SILENT;
}
/* update the number of procs in the job */
jdata->num_procs += nprocs;
/* compute vpids and add proc objects to the job - this has to be
* done after each app_context is mapped in order to keep the
* vpids contiguous within an app_context
*/
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
error:
while(NULL != (item = opal_list_remove_first(&node_list))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&node_list);
return rc;
}