1
1

Fix comm_spawn in oversubscribed conditions. If oversubscription is allowed, let nodes flow into the mapper even if they are oversubscribed, constrained by the slots_max absolute ceiling. Clean up error messages when comm_spawn fails so it correctly and succinctly reports the error.

This commit was SVN r25659.
Этот коммит содержится в:
Ralph Castain 2011-12-15 18:04:48 +00:00
родитель 71352453af
Коммит 2dd2694f25
9 изменённых файлов: 81 добавлений и 24 удалений

Просмотреть файл

@ -391,6 +391,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
failed_job = jdata->jobid;
goto cleanup;
}
failed_job = jdata->jobid;
@ -435,9 +436,15 @@ static int plm_alps_launch_job(orte_job_t *jdata)
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
if (ORTE_ERR_SILENT == rc) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
return rc;

Просмотреть файл

@ -255,7 +255,8 @@ static void process_msg(int fd, short event, void *data)
OPAL_RELEASE_THREAD(&lock, &cond, &processing);
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
ORTE_ERROR_LOG(rc);
goto ANSWER_LAUNCH;
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
goto DEPART;
}
OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);

Просмотреть файл

@ -516,6 +516,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
failed_job = jdata->jobid;
goto cleanup;
}
failed_job = jdata->jobid;
@ -575,9 +576,15 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
if (ORTE_ERR_SILENT == rc) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
/* check for timing request - get stop time and process if so */

Просмотреть файл

@ -345,6 +345,7 @@ launch_apps:
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
failed_job = jdata->jobid;
goto cleanup;
}
/* daemons succeeded - any failure now would be from apps */
@ -388,9 +389,15 @@ cleanup:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
if (ORTE_ERR_SILENT == rc) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
return rc;

Просмотреть файл

@ -1162,6 +1162,7 @@ static int rsh_launch(orte_job_t *jdata)
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
failed_job = jdata->jobid;
goto cleanup;
}
failed_job = jdata->jobid;
@ -1205,9 +1206,15 @@ static int rsh_launch(orte_job_t *jdata)
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
if (ORTE_ERR_SILENT == rc) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.update_state(failed_job, job_state,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
return rc;

Просмотреть файл

@ -405,6 +405,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
failed_job = jdata->jobid;
goto cleanup;
}
failed_job = jdata->jobid;
@ -449,9 +450,15 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
if (ORTE_ERR_SILENT == rc) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
return rc;

Просмотреть файл

@ -413,6 +413,7 @@ launch_apps:
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
failed_job = jdata->jobid;
goto cleanup;
}
/* since the daemons have launched, any failures now will be for the
@ -458,9 +459,15 @@ launch_apps:
/* check for failed launch - if so, force terminate */
if (failed_launch) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
if (ORTE_ERR_SILENT == rc) {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
} else {
orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
NULL, ORTE_PROC_STATE_UNDEF,
0, ORTE_ERROR_DEFAULT_EXIT_CODE);
}
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,

Просмотреть файл

@ -70,6 +70,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
/* a map has not been defined yet for this job, so set one
* up here
*/
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: creating new map for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
/* create a map object where we will store the results */
map = OBJ_NEW(orte_job_map_t);
if (NULL == map) {
@ -90,6 +94,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
/* assign the map object to this job */
jdata->map = map;
} else {
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps: setting mapping policies for job %s",
ORTE_JOBID_PRINT(jdata->jobid));
if (!jdata->map->display_map) {
jdata->map->display_map = orte_rmaps_base.display_map;
}

Просмотреть файл

@ -134,7 +134,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
*total_num_slots = 0;
/* if the hnp was allocated, include it unless flagged not to */
if (orte_hnp_is_allocated && !(policy & ORTE_MAPPING_NO_USE_LOCAL)) {
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
/* clear this for future use, but don't include it */
@ -264,12 +264,18 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
if (0 != node->slots_max && node->slots_inuse > node->slots_max) {
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
} else { /** otherwise, add the slots for our job to the total */
if (0 == node->slots_alloc) {
} else if (node->slots_alloc <= node->slots_inuse &&
(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
/* remove the node as fully used */
opal_list_remove_item(allocated_nodes, item);
OBJ_RELEASE(item); /* "un-retain" it */
} else {
if (node->slots_alloc > node->slots_inuse) {
/* add the available slots */
num_slots += node->slots_alloc - node->slots_inuse;
} else {
/* always allocate at least one */
num_slots++;
} else if (node->slots_alloc > node->slots_inuse) {
num_slots += node->slots_alloc - node->slots_inuse;
}
}