Fix comm_spawn in oversubscribed conditions. If oversubscription is allowed, let nodes flow into the mapper even when they are already oversubscribed, constrained only by the slots_max absolute ceiling. Clean up the error messages emitted when comm_spawn fails so the failure is reported correctly and succinctly.
This commit was SVN r25659.
parent 71352453af
commit 2dd2694f25
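
The mapper-side change is the slot-accounting rule in orte_rmaps_base_get_target_nodes(), shown in the final hunk below: a node is dropped outright only when it exceeds the slots_max hard ceiling; a full node is dropped when oversubscription is forbidden, but otherwise stays in the candidate list and still contributes at least one slot. A minimal standalone sketch of that rule, using a simplified node type rather than the real orte_node_t (the real code walks an opal_list_t and tests ORTE_MAPPING_NO_OVERSUBSCRIBE via ORTE_GET_MAPPING_DIRECTIVE(policy)):

#include <stdbool.h>
#include <stdio.h>

/* pared-down stand-in for orte_node_t (illustration only) */
typedef struct {
    int slots_alloc;  /* slots allocated to this job */
    int slots_inuse;  /* slots already occupied */
    int slots_max;    /* absolute ceiling; 0 means no ceiling */
} node_t;

/* Returns the number of slots the node contributes to the map,
 * or -1 if the node must be dropped from the candidate list. */
static int usable_slots(const node_t *n, bool no_oversubscribe)
{
    /* hard ceiling: a node past slots_max is never usable */
    if (0 != n->slots_max && n->slots_inuse > n->slots_max) {
        return -1;
    }
    /* no allocation for this job: keep the node, contribute nothing */
    if (0 == n->slots_alloc) {
        return 0;
    }
    /* node is full and oversubscription is forbidden: drop it */
    if (n->slots_alloc <= n->slots_inuse && no_oversubscribe) {
        return -1;
    }
    /* free slots remain: contribute them */
    if (n->slots_alloc > n->slots_inuse) {
        return n->slots_alloc - n->slots_inuse;
    }
    /* oversubscribed but allowed: always contribute at least one */
    return 1;
}

int main(void)
{
    node_t full   = { .slots_alloc = 4, .slots_inuse = 4, .slots_max = 0 };
    node_t capped = { .slots_alloc = 4, .slots_inuse = 9, .slots_max = 8 };
    printf("full, oversubscribe allowed: %d\n", usable_slots(&full, false));   /* 1  */
    printf("full, oversubscribe denied:  %d\n", usable_slots(&full, true));    /* -1 */
    printf("past slots_max ceiling:      %d\n", usable_slots(&capped, false)); /* -1 */
    return 0;
}
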
@@ -391,6 +391,7 @@ static int plm_alps_launch_job(orte_job_t *jdata)
     /* setup the job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
         ORTE_ERROR_LOG(rc);
+        failed_job = jdata->jobid;
         goto cleanup;
     }
     failed_job = jdata->jobid;

@@ -435,9 +436,15 @@ static int plm_alps_launch_job(orte_job_t *jdata)
 
     /* check for failed launch - if so, force terminate */
     if (failed_launch) {
-        orte_errmgr.update_state(failed_job, job_state,
-                                 NULL, ORTE_PROC_STATE_UNDEF,
-                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        if (ORTE_ERR_SILENT == rc) {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        } else {
+            orte_errmgr.update_state(failed_job, job_state,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        }
     }
 
     return rc;

@@ -255,7 +255,8 @@ static void process_msg(int fd, short event, void *data)
     OPAL_RELEASE_THREAD(&lock, &cond, &processing);
     if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
         ORTE_ERROR_LOG(rc);
-        goto ANSWER_LAUNCH;
+        OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);
+        goto DEPART;
     }
     OPAL_ACQUIRE_THREAD(&lock, &cond, &processing);

@@ -516,6 +516,7 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
     /* setup the job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
         ORTE_ERROR_LOG(rc);
+        failed_job = jdata->jobid;
         goto cleanup;
     }
     failed_job = jdata->jobid;

@@ -575,9 +576,15 @@ static int plm_ccp_launch_job(orte_job_t *jdata)
 
     /* check for failed launch - if so, force terminate */
     if (failed_launch) {
-        orte_errmgr.update_state(failed_job, job_state,
-                                 NULL, ORTE_PROC_STATE_UNDEF,
-                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        if (ORTE_ERR_SILENT == rc) {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        } else {
+            orte_errmgr.update_state(failed_job, job_state,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        }
     }
 
     /* check for timing request - get stop time and process if so */

@@ -345,6 +345,7 @@ launch_apps:
     /* setup the job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
         ORTE_ERROR_LOG(rc);
+        failed_job = jdata->jobid;
         goto cleanup;
     }
     /* daemons succeeded - any failure now would be from apps */

@@ -388,9 +389,15 @@ cleanup:
 
     /* check for failed launch - if so, force terminate */
     if (failed_launch) {
-        orte_errmgr.update_state(failed_job, job_state,
-                                 NULL, ORTE_PROC_STATE_UNDEF,
-                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        if (ORTE_ERR_SILENT == rc) {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        } else {
+            orte_errmgr.update_state(failed_job, job_state,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        }
     }
 
     return rc;

@@ -1162,6 +1162,7 @@ static int rsh_launch(orte_job_t *jdata)
     /* setup the job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
         ORTE_ERROR_LOG(rc);
+        failed_job = jdata->jobid;
         goto cleanup;
     }
     failed_job = jdata->jobid;

@@ -1205,9 +1206,15 @@ static int rsh_launch(orte_job_t *jdata)
 
     /* check for failed launch - if so, force terminate */
     if (failed_launch) {
-        orte_errmgr.update_state(failed_job, job_state,
-                                 NULL, ORTE_PROC_STATE_UNDEF,
-                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        if (ORTE_ERR_SILENT == rc) {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        } else {
+            orte_errmgr.update_state(failed_job, job_state,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        }
     }
 
     return rc;

@@ -405,6 +405,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
     /* setup the job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
         ORTE_ERROR_LOG(rc);
+        failed_job = jdata->jobid;
         goto cleanup;
     }
     failed_job = jdata->jobid;

@@ -449,9 +450,15 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
 
     /* check for failed launch - if so, force terminate */
     if (failed_launch) {
-        orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
-                                 NULL, ORTE_PROC_STATE_UNDEF,
-                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        if (ORTE_ERR_SILENT == rc) {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        } else {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        }
     }
 
     return rc;

@@ -413,6 +413,7 @@ launch_apps:
     /* setup the job */
     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
         ORTE_ERROR_LOG(rc);
+        failed_job = jdata->jobid;
         goto cleanup;
     }
     /* since the daemons have launched, any failures now will be for the

@@ -458,9 +459,15 @@ launch_apps:
 
     /* check for failed launch - if so, force terminate */
     if (failed_launch) {
-        orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
-                                 NULL, ORTE_PROC_STATE_UNDEF,
-                                 0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        if (ORTE_ERR_SILENT == rc) {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_SILENT_ABORT,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        } else {
+            orte_errmgr.update_state(failed_job, ORTE_JOB_STATE_FAILED_TO_START,
+                                     NULL, ORTE_PROC_STATE_UNDEF,
+                                     0, ORTE_ERROR_DEFAULT_EXIT_CODE);
+        }
     }
 
     OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,

@@ -70,6 +70,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
         /* a map has not been defined yet for this job, so set one
          * up here
          */
+        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
+                            "mca:rmaps: creating new map for job %s",
+                            ORTE_JOBID_PRINT(jdata->jobid));
+
         /* create a map object where we will store the results */
         map = OBJ_NEW(orte_job_map_t);
         if (NULL == map) {

@@ -90,6 +94,10 @@ int orte_rmaps_base_map_job(orte_job_t *jdata)
         /* assign the map object to this job */
         jdata->map = map;
     } else {
+        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
+                            "mca:rmaps: setting mapping policies for job %s",
+                            ORTE_JOBID_PRINT(jdata->jobid));
+
         if (!jdata->map->display_map) {
             jdata->map->display_map = orte_rmaps_base.display_map;
         }

@@ -134,7 +134,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
     *total_num_slots = 0;
 
     /* if the hnp was allocated, include it unless flagged not to */
-    if (orte_hnp_is_allocated && !(policy & ORTE_MAPPING_NO_USE_LOCAL)) {
+    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
         if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
             if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                 /* clear this for future use, but don't include it */

@@ -264,12 +264,18 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
         if (0 != node->slots_max && node->slots_inuse > node->slots_max) {
             opal_list_remove_item(allocated_nodes, item);
             OBJ_RELEASE(item);  /* "un-retain" it */
         } else { /** otherwise, add the slots for our job to the total */
             if (0 == node->slots_alloc) {
             } else if (node->slots_alloc <= node->slots_inuse &&
                        (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
                 /* remove the node as fully used */
                 opal_list_remove_item(allocated_nodes, item);
                 OBJ_RELEASE(item);  /* "un-retain" it */
-            } else if (node->slots_alloc > node->slots_inuse) {
-                num_slots += node->slots_alloc - node->slots_inuse;
+            } else {
+                if (node->slots_alloc > node->slots_inuse) {
+                    /* add the available slots */
+                    num_slots += node->slots_alloc - node->slots_inuse;
+                } else {
+                    /* always allocate at least one */
+                    num_slots++;
+                }
             }
         }