Merge pull request #2861 from jjhursey/topic/ibm/master/orted-timeout-improv
orterun: Add parameter to control when we give up on stack traces
Этот коммит содержится в:
Коммит
31faf0a950
@ -17,6 +17,7 @@
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -3071,8 +3072,10 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,
|
||||
}
|
||||
++ntraces;
|
||||
if (orte_process_info.num_procs == ntraces) {
|
||||
/* cancel the timeout */
|
||||
OBJ_DESTRUCT(&stack_trace_timer);
|
||||
if( orte_stack_trace_wait_timeout > 0 ) {
|
||||
/* cancel the timeout */
|
||||
OBJ_DESTRUCT(&stack_trace_timer);
|
||||
}
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
|
||||
/* set the global abnormal exit flag */
|
||||
@ -3173,12 +3176,14 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
|
||||
OBJ_RELEASE(sig);
|
||||
/* we will terminate after we get the stack_traces, but set a timeout
|
||||
* just in case we never hear back from everyone */
|
||||
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
|
||||
opal_event_evtimer_set(orte_event_base,
|
||||
stack_trace_timer.ev, stack_trace_timeout, NULL);
|
||||
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
|
||||
stack_trace_timer.tv.tv_sec = 30;
|
||||
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
|
||||
if( orte_stack_trace_wait_timeout > 0 ) {
|
||||
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
|
||||
opal_event_evtimer_set(orte_event_base,
|
||||
stack_trace_timer.ev, stack_trace_timeout, NULL);
|
||||
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
|
||||
stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout;
|
||||
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
|
||||
}
|
||||
return;
|
||||
}
|
||||
giveup:
|
||||
|
@ -133,6 +133,8 @@ orte_timer_t *orte_mpiexec_timeout = NULL;
|
||||
|
||||
opal_buffer_t *orte_tree_launch_cmd = NULL;
|
||||
|
||||
int orte_stack_trace_wait_timeout = 30;
|
||||
|
||||
/* global arrays for data storage */
|
||||
opal_hash_table_t *orte_job_data = NULL;
|
||||
opal_pointer_array_t *orte_node_pool = NULL;
|
||||
|
@ -578,6 +578,9 @@ ORTE_DECLSPEC extern char *orte_base_user_debugger;
|
||||
*/
|
||||
ORTE_DECLSPEC extern char *orte_daemon_cores;
|
||||
|
||||
/* Max time (in seconds) to wait for stack traces to return */
|
||||
ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */
|
||||
|
@ -765,5 +765,13 @@ int orte_register_params(void)
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &orte_mgmt_transport);
|
||||
|
||||
/* Amount of time to wait for a stack trace to return from the daemons */
|
||||
orte_stack_trace_wait_timeout = 30;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "timeout_for_stack_trace",
|
||||
"Seconds to wait for stack traces to return before terminating the job (<= 0 wait forever)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_stack_trace_wait_timeout);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user