1. Fixing Possible strdup of NULL
2. Fixing num_alloc when combined mapping policies ( rankfile & byslot or bynode ) This commit was SVN r18073.
Этот коммит содержится в:
родитель
f115b4aed2
Коммит
2be4e32c79
@ -31,20 +31,17 @@ example: cat hosfile
|
|||||||
rank 3=host3 slot=0:1,1:0-2
|
rank 3=host3 slot=0:1,1:0-2
|
||||||
|
|
||||||
[parse_error_string]
|
[parse_error_string]
|
||||||
Open RTE detected a parse error in the rankfile:
|
Open RTE detected a parse error in the rankfile (%s)
|
||||||
%s
|
|
||||||
It occured on line number %d on token %d:
|
It occured on line number %d on token %d:
|
||||||
%s
|
%s
|
||||||
|
|
||||||
[parse_error_int]
|
[parse_error_int]
|
||||||
Open RTE detected a parse error in the rankfile:
|
Open RTE detected a parse error in the rankfile (%s)
|
||||||
%s
|
|
||||||
It occured on line number %d on token %d:
|
It occured on line number %d on token %d:
|
||||||
%d
|
%d
|
||||||
|
|
||||||
[parse_error]
|
[parse_error]
|
||||||
Open RTE detected a parse error in the rankfile:
|
Open RTE detected a parse error in the rankfile (%s)
|
||||||
%s
|
|
||||||
It occured on line number %d on token %d.
|
It occured on line number %d on token %d.
|
||||||
|
|
||||||
[not-all-mapped-alloc]
|
[not-all-mapped-alloc]
|
||||||
@ -66,13 +63,13 @@ Either request fewer slots for your application, or make more slots available
|
|||||||
for use.
|
for use.
|
||||||
|
|
||||||
[bad-rankfile]
|
[bad-rankfile]
|
||||||
Error, Invalid rank (%d) in the rankfile (%s)
|
Error, invalid rank (%d) in the rankfile (%s)
|
||||||
|
|
||||||
[bad-assign]
|
[bad-assign]
|
||||||
Error, rank %d is already assigned to %s, check %s
|
Error, rank %d is already assigned to %s, check %s
|
||||||
|
|
||||||
[bad-syntax]
|
[bad-syntax]
|
||||||
Error, invalid syntax in the rankfile
|
Error, invalid syntax in the rankfile (%s)
|
||||||
syntax must be the fallowing
|
syntax must be the fallowing
|
||||||
rank i=host_i slot=string
|
rank i=host_i slot=string
|
||||||
ex: rank 1=host1 slot=1:0,1
|
ex: rank 1=host1 slot=1:0,1
|
||||||
|
@ -54,6 +54,7 @@ char *orte_rmaps_rank_file_path = NULL;
|
|||||||
static const char *orte_rmaps_rank_file_name_cur = NULL;
|
static const char *orte_rmaps_rank_file_name_cur = NULL;
|
||||||
static opal_mutex_t orte_rmaps_rank_file_mutex;
|
static opal_mutex_t orte_rmaps_rank_file_mutex;
|
||||||
char *orte_rmaps_rank_file_slot_list;
|
char *orte_rmaps_rank_file_slot_list;
|
||||||
|
static orte_std_cntr_t orte_rmaps_rank_file_num_alloc = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local variable
|
* Local variable
|
||||||
@ -72,11 +73,11 @@ static int map_app_by_user_map(
|
|||||||
int rc = ORTE_SUCCESS;
|
int rc = ORTE_SUCCESS;
|
||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
orte_std_cntr_t num_alloc=0, round_cnt;
|
orte_std_cntr_t round_cnt;
|
||||||
|
|
||||||
OPAL_TRACE(2);
|
OPAL_TRACE(2);
|
||||||
|
|
||||||
while (num_alloc < app->num_procs) {
|
while (orte_rmaps_rank_file_num_alloc < app->num_procs) {
|
||||||
/** see if any nodes remain unused and available. We need to do this check
|
/** see if any nodes remain unused and available. We need to do this check
|
||||||
* each time since we may remove nodes from the list (as they become fully
|
* each time since we may remove nodes from the list (as they become fully
|
||||||
* used) as we cycle through the loop */
|
* used) as we cycle through the loop */
|
||||||
@ -91,7 +92,7 @@ static int map_app_by_user_map(
|
|||||||
* we may need to prune the nodes list removing overused nodes.
|
* we may need to prune the nodes list removing overused nodes.
|
||||||
* Wrap around to beginning if we are at the end of the list */
|
* Wrap around to beginning if we are at the end of the list */
|
||||||
round_cnt=0;
|
round_cnt=0;
|
||||||
if ( -1 != rankmap[vpid_start + num_alloc].rank) {
|
if ( -1 != rankmap[vpid_start + orte_rmaps_rank_file_num_alloc].rank) {
|
||||||
do {
|
do {
|
||||||
if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
|
if (opal_list_get_end(nodes) == opal_list_get_next(cur_node_item)) {
|
||||||
next = opal_list_get_first(nodes);
|
next = opal_list_get_first(nodes);
|
||||||
@ -103,17 +104,17 @@ static int map_app_by_user_map(
|
|||||||
node = (orte_node_t*) cur_node_item;
|
node = (orte_node_t*) cur_node_item;
|
||||||
cur_node_item = next;
|
cur_node_item = next;
|
||||||
if ( round_cnt == 2 ) {
|
if ( round_cnt == 2 ) {
|
||||||
opal_show_help("help-rmaps_rank_file.txt","bad-host", true,rankmap[num_alloc+vpid_start].node_name);
|
opal_show_help("help-rmaps_rank_file.txt","bad-host", true,rankmap[orte_rmaps_rank_file_num_alloc+vpid_start].node_name);
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||||
return ORTE_ERR_BAD_PARAM;
|
return ORTE_ERR_BAD_PARAM;
|
||||||
}
|
}
|
||||||
} while ( strcmp(node->name, rankmap[num_alloc + vpid_start].node_name));
|
} while ( strcmp(node->name, rankmap[orte_rmaps_rank_file_num_alloc + vpid_start].node_name));
|
||||||
node->slot_list = strdup(rankmap[num_alloc+vpid_start].slot_list);
|
node->slot_list = strdup(rankmap[orte_rmaps_rank_file_num_alloc+vpid_start].slot_list);
|
||||||
if (mca_rmaps_rank_file_component.debug) {
|
if (mca_rmaps_rank_file_component.debug) {
|
||||||
opal_output(0, "rank_file RMAPS component: [%s:%d]->slot_list=%s\n",
|
opal_output(0, "rank_file RMAPS component: [%s:%d]->slot_list=%s\n",
|
||||||
rankmap[num_alloc + vpid_start].node_name,rankmap[num_alloc+vpid_start].rank, node->slot_list);
|
rankmap[orte_rmaps_rank_file_num_alloc + vpid_start].node_name,rankmap[orte_rmaps_rank_file_num_alloc+vpid_start].rank, node->slot_list);
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rankmap[num_alloc+vpid_start].rank, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, rankmap[orte_rmaps_rank_file_num_alloc+vpid_start].rank, app->idx,
|
||||||
nodes, jdata->map->oversubscribe))) {
|
nodes, jdata->map->oversubscribe))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
@ -126,7 +127,7 @@ static int map_app_by_user_map(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
++num_alloc;
|
++orte_rmaps_rank_file_num_alloc;
|
||||||
}
|
}
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -146,7 +147,6 @@ static int map_app_by_node(
|
|||||||
int rc = ORTE_SUCCESS;
|
int rc = ORTE_SUCCESS;
|
||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
orte_std_cntr_t num_alloc = 0;
|
|
||||||
|
|
||||||
OPAL_TRACE(2);
|
OPAL_TRACE(2);
|
||||||
|
|
||||||
@ -167,9 +167,9 @@ static int map_app_by_node(
|
|||||||
list, oversubscription is automatically taken care of via this logic.
|
list, oversubscription is automatically taken care of via this logic.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
while (num_alloc < app->num_procs) {
|
while (orte_rmaps_rank_file_num_alloc < app->num_procs) {
|
||||||
if ( -1 != rankmap[num_alloc + vpid_start].rank) {
|
if ( -1 != rankmap[orte_rmaps_rank_file_num_alloc + vpid_start].rank) {
|
||||||
++num_alloc;
|
++orte_rmaps_rank_file_num_alloc;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/** see if any nodes remain unused and available. We need to do this check
|
/** see if any nodes remain unused and available. We need to do this check
|
||||||
@ -199,7 +199,7 @@ static int map_app_by_node(
|
|||||||
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
|
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + orte_rmaps_rank_file_num_alloc, app->idx,
|
||||||
nodes, jdata->map->oversubscribe))) {
|
nodes, jdata->map->oversubscribe))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
@ -211,7 +211,7 @@ static int map_app_by_node(
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
++num_alloc;
|
++orte_rmaps_rank_file_num_alloc;
|
||||||
cur_node_item = next;
|
cur_node_item = next;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -229,7 +229,7 @@ static int map_app_by_slot(
|
|||||||
opal_list_t* nodes )
|
opal_list_t* nodes )
|
||||||
{
|
{
|
||||||
int rc = ORTE_SUCCESS;
|
int rc = ORTE_SUCCESS;
|
||||||
orte_std_cntr_t i, num_slots_to_take, num_alloc = 0;
|
orte_std_cntr_t i, num_slots_to_take;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
opal_list_item_t *next;
|
opal_list_item_t *next;
|
||||||
|
|
||||||
@ -241,8 +241,7 @@ static int map_app_by_slot(
|
|||||||
have reached their soft limit and the user directed us to "no oversubscribe".
|
have reached their soft limit and the user directed us to "no oversubscribe".
|
||||||
If we still have processes that haven't been mapped yet, then it's an
|
If we still have processes that haven't been mapped yet, then it's an
|
||||||
"out of resources" error. */
|
"out of resources" error. */
|
||||||
|
while ( orte_rmaps_rank_file_num_alloc < app->num_procs) {
|
||||||
while ( num_alloc < app->num_procs) {
|
|
||||||
/** see if any nodes remain unused and available. We need to do this check
|
/** see if any nodes remain unused and available. We need to do this check
|
||||||
* each time since we may remove nodes from the list (as they become fully
|
* each time since we may remove nodes from the list (as they become fully
|
||||||
* used) as we cycle through the loop */
|
* used) as we cycle through the loop */
|
||||||
@ -300,8 +299,8 @@ static int map_app_by_slot(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < num_slots_to_take; ++i) {
|
for( i = 0; i < num_slots_to_take; ++i) {
|
||||||
if ( -1 != rankmap[num_alloc + vpid_start].rank) {
|
if ( -1 != rankmap[orte_rmaps_rank_file_num_alloc + vpid_start].rank) {
|
||||||
++num_alloc;
|
++orte_rmaps_rank_file_num_alloc;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if ( NULL != orte_mca_rmaps_rank_file_slot_list){
|
if ( NULL != orte_mca_rmaps_rank_file_slot_list){
|
||||||
@ -310,7 +309,7 @@ static int map_app_by_slot(
|
|||||||
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
|
strcpy(node->slot_list, orte_mca_rmaps_rank_file_slot_list);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + num_alloc, app->idx,
|
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, vpid_start + orte_rmaps_rank_file_num_alloc, app->idx,
|
||||||
nodes, jdata->map->oversubscribe))) {
|
nodes, jdata->map->oversubscribe))) {
|
||||||
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
/** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
|
||||||
* really isn't an error - we just need to break from the loop
|
* really isn't an error - we just need to break from the loop
|
||||||
@ -323,12 +322,12 @@ static int map_app_by_slot(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Update the number of procs allocated */
|
/* Update the number of procs allocated */
|
||||||
++num_alloc;
|
++orte_rmaps_rank_file_num_alloc;
|
||||||
|
|
||||||
/** if all the procs have been mapped OR we have fully used up this node, then
|
/** if all the procs have been mapped OR we have fully used up this node, then
|
||||||
* break from the loop
|
* break from the loop
|
||||||
*/
|
*/
|
||||||
if(num_alloc == app->num_procs || ORTE_ERR_NODE_FULLY_USED == rc) {
|
if(orte_rmaps_rank_file_num_alloc == app->num_procs || ORTE_ERR_NODE_FULLY_USED == rc) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -626,7 +625,7 @@ static int orte_rmaps_rank_file_parse(const char *rankfile, int np)
|
|||||||
node_name = strdup(argv[1]);
|
node_name = strdup(argv[1]);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
opal_show_help("help-rmaps_rank_file.txt", "bad-syntax", true);
|
opal_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
|
||||||
rc = ORTE_ERR_BAD_PARAM;
|
rc = ORTE_ERR_BAD_PARAM;
|
||||||
goto unlock;
|
goto unlock;
|
||||||
}
|
}
|
||||||
@ -641,7 +640,12 @@ static int orte_rmaps_rank_file_parse(const char *rankfile, int np)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case ORTE_RANKFILE_SLOT:
|
case ORTE_RANKFILE_SLOT:
|
||||||
rankmap[ival].slot_list = strdup(orte_rmaps_rank_file_parse_string_or_int());
|
if ( NULL == (value = orte_rmaps_rank_file_parse_string_or_int())) {
|
||||||
|
opal_show_help("help-rmaps_rank_file.txt", "bad-syntax", true, rankfile);
|
||||||
|
rc = ORTE_ERR_BAD_PARAM;
|
||||||
|
goto unlock;
|
||||||
|
}
|
||||||
|
rankmap[ival].slot_list = strdup(value);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user