The ability to add procs to a running job was inadvertently broken when we added detection of a proc exiting before calling init. Re-enable it here, ensuring that procs that are being restarted and/or added to a job do -not- call the barrier during orte_init.
This commit was SVN r22404.
Этот коммит содержится в:
родитель
370b1c75c4
Коммит
cec840f6b9
@ -224,7 +224,7 @@ int orte_ess_base_app_setup(void)
|
||||
* Cannot do this on a restart as the rest of the processes
|
||||
* in the job won't be executing this step, so we would hang
|
||||
*/
|
||||
if (0 == orte_process_info.num_restarts && ORTE_PROC_IS_NON_MPI) {
|
||||
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte barrier";
|
||||
|
@ -896,7 +896,7 @@ find_my_procs:
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:constructing child list - checking proc %s on daemon %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(j),
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc),
|
||||
ORTE_VPID_PRINT(host_daemon)));
|
||||
|
||||
/* does this proc belong to us? */
|
||||
@ -904,7 +904,7 @@ find_my_procs:
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:constructing child list - found proc %s for me!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(j)));
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc)));
|
||||
|
||||
add_child = true;
|
||||
/* if this job is restarting procs, then we need to treat things
|
||||
@ -927,6 +927,7 @@ find_my_procs:
|
||||
(child->alive) ? "ALIVE" : "DEAD"));
|
||||
add_child = false;
|
||||
child->restarts = restarts[j];
|
||||
child->do_not_barrier = true;
|
||||
/* mark that this app_context is being used on this node */
|
||||
jobdat->apps[app_idx[j]]->used_on_node = true;
|
||||
break;
|
||||
@ -936,6 +937,9 @@ find_my_procs:
|
||||
|
||||
/* if we need to add the child, do so */
|
||||
if (add_child) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"adding proc %s to my local list",
|
||||
ORTE_NAME_PRINT(&proc)));
|
||||
/* keep tabs of the number of local procs */
|
||||
jobdat->num_local_procs++;
|
||||
/* add this proc to our child list */
|
||||
@ -947,7 +951,11 @@ find_my_procs:
|
||||
}
|
||||
child->app_idx = app_idx[j]; /* save the index into the app_context objects */
|
||||
child->restarts = restarts[j];
|
||||
if (NULL != slot_str && NULL != slot_str[j]) {
|
||||
/* if the job is in restart mode, the child must not barrier when launched */
|
||||
if (ORTE_JOB_STATE_RESTART == jobdat->state) {
|
||||
child->do_not_barrier = true;
|
||||
}
|
||||
if (NULL != slot_str && NULL != slot_str[j]) {
|
||||
child->slot_list = strdup(slot_str[j]);
|
||||
}
|
||||
/* mark that this app_context is being used on this node */
|
||||
@ -1556,6 +1564,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:launch working child %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
|
||||
/* does this child belong to this app? */
|
||||
if (i != child->app_idx) {
|
||||
continue;
|
||||
@ -1789,6 +1802,17 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* if the proc should not barrier in orte_init, tell it */
|
||||
if (child->do_not_barrier || 0 < child->restarts) {
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
opal_setenv(param, "1", true, &app->env);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if the proc isn't going to forward IO, then we need to flag that
|
||||
* it has "completed" iof termination as otherwise it will never fire
|
||||
*/
|
||||
|
@ -92,6 +92,7 @@ static void orte_odls_child_constructor(orte_odls_child_t *ptr)
|
||||
ptr->slot_list = NULL;
|
||||
ptr->waitpid_recvd = false;
|
||||
ptr->iof_complete = false;
|
||||
ptr->do_not_barrier = false;
|
||||
}
|
||||
static void orte_odls_child_destructor(orte_odls_child_t *ptr)
|
||||
{
|
||||
|
@ -99,6 +99,7 @@ typedef struct {
|
||||
bool waitpid_recvd; /* waitpid has detected proc termination */
|
||||
bool iof_complete; /* IOF has noted proc terminating all channels */
|
||||
struct timeval starttime; /* when the proc was started - for timing purposes only */
|
||||
bool do_not_barrier; /* the proc should not barrier in orte_init */
|
||||
} orte_odls_child_t;
|
||||
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_odls_child_t);
|
||||
|
||||
|
@ -174,6 +174,9 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
* so check to see if any nodes are in the map - this will be our
|
||||
* indicator that this is the prior map for a failed job that
|
||||
* needs to be re-mapped
|
||||
*
|
||||
* NOTE: if a proc is being ADDED to an existing job, then its
|
||||
* node field will be NULL.
|
||||
*/
|
||||
if (0 < jdata->map->num_nodes) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
@ -197,7 +200,8 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: proc %s from node %s is to be restarted",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), proc->node->name));
|
||||
ORTE_NAME_PRINT(&proc->name),
|
||||
(NULL == proc->node) ? "NULL" : proc->node->name));
|
||||
/* if we have fault groups, flag all the fault groups that
|
||||
* include this node so we don't reuse them
|
||||
*/
|
||||
@ -214,7 +218,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(&ftgrp->nodes, k))) {
|
||||
continue;
|
||||
}
|
||||
if (0 == strcmp(node->name, proc->node->name)) {
|
||||
if (NULL != proc->node && 0 == strcmp(node->name, proc->node->name)) {
|
||||
/* yes - mark it to not be included */
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output,
|
||||
"%s rmaps:resilient: node %s is in fault group %d, which will be excluded",
|
||||
@ -315,7 +319,9 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata)
|
||||
"%s rmaps:resilient: placing proc %s into fault group %d node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc->name), target->ftgrp, nd->name));
|
||||
OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */
|
||||
if (NULL != proc->node) {
|
||||
OBJ_RELEASE(proc->node); /* required to maintain bookkeeping */
|
||||
}
|
||||
/* put proc on the found node */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx,
|
||||
NULL, jdata->map->oversubscribe, false, &proc))) {
|
||||
|
@ -159,6 +159,9 @@ char *orte_report_events_uri = NULL;
|
||||
/* report bindings */
|
||||
bool orte_report_bindings = false;
|
||||
|
||||
/* barrier control */
|
||||
bool orte_do_not_barrier = false;
|
||||
|
||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||
|
||||
int orte_debug_output = -1;
|
||||
|
@ -634,6 +634,9 @@ ORTE_DECLSPEC extern char *orte_report_events_uri;
|
||||
/* report bindings */
|
||||
ORTE_DECLSPEC extern bool orte_report_bindings;
|
||||
|
||||
/* barrier control */
|
||||
ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -417,6 +417,13 @@ int orte_register_params(void)
|
||||
orte_report_events = true;
|
||||
}
|
||||
|
||||
/* barrier control */
|
||||
mca_base_param_reg_int_name("orte", "do_not_barrier",
|
||||
"Do not barrier in orte_init",
|
||||
true, false,
|
||||
(int) false, &value);
|
||||
orte_do_not_barrier = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user