Clean up an error in r24371: a const parameter cannot be used as the target in asprintf, because asprintf changes the value of the address.
Add some new proc/job states. Rename a constant to reflect a coming change: remove the arbitrary difference between restarting a proc locally and relocating it to another node in terms of the number of restarts allowed. Add pretty-printing of signals for "proc aborted due to signal" reports. This commit was SVN r24378. The following SVN revision numbers were found above: r24371 --> open-mpi/ompi@93d28a5792
Этот коммит содержится в:
родитель
e8c2519280
Коммит
b5de068533
@ -67,7 +67,7 @@ opal_strerror_int(int errnum, const char **str)
|
||||
|
||||
/* caller must free string */
|
||||
static int
|
||||
opal_strerror_unknown(int errnum, const char **str)
|
||||
opal_strerror_unknown(int errnum, char **str)
|
||||
{
|
||||
int i;
|
||||
*str = NULL;
|
||||
@ -105,7 +105,7 @@ opal_perror(int errnum, const char *msg)
|
||||
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
|
||||
perror(msg);
|
||||
} else {
|
||||
const char *ue_msg;
|
||||
char *ue_msg;
|
||||
ret = opal_strerror_unknown(errnum, &ue_msg);
|
||||
fprintf(stderr, "%s\n", ue_msg);
|
||||
free(ue_msg);
|
||||
@ -134,7 +134,7 @@ opal_strerror(int errnum)
|
||||
ret = opal_strerror_int(errnum, &errmsg);
|
||||
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
const char *ue_msg;
|
||||
char *ue_msg;
|
||||
ret = opal_strerror_unknown(errnum, &ue_msg);
|
||||
snprintf(unknown_retbuf, UNKNOWN_RETBUF_LEN, "%s", ue_msg);
|
||||
free(ue_msg);
|
||||
@ -159,7 +159,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen)
|
||||
strncpy(strerrbuf, tmp, buflen);
|
||||
return OPAL_SUCCESS;
|
||||
} else {
|
||||
const char *ue_msg;
|
||||
char *ue_msg;
|
||||
ret = opal_strerror_unknown(errnum, &ue_msg);
|
||||
len = snprintf(strerrbuf, buflen, "%s", ue_msg);
|
||||
free(ue_msg);
|
||||
|
@ -111,7 +111,7 @@ enum {
|
||||
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29),
|
||||
ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30),
|
||||
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
|
||||
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
|
||||
ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
|
||||
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
|
||||
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
|
||||
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35),
|
||||
|
@ -1584,7 +1584,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
pdata->relocates++;
|
||||
/* have we exceeded the number of relocates for this proc? */
|
||||
if (app->max_global_restarts < pdata->relocates) {
|
||||
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
|
||||
return ORTE_ERR_RESTART_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
/* reset the job params for restart */
|
||||
|
@ -44,6 +44,7 @@ typedef uint32_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_INIT 0x00000001 /* process entry has been created by rmaps */
|
||||
#define ORTE_PROC_STATE_RESTART 0x00000002 /* the proc is ready for restart */
|
||||
#define ORTE_PROC_STATE_LAUNCHED 0x00000004 /* process has been launched */
|
||||
#define ORTE_PROC_STATE_TERMINATE 0x00000008 /* process is marked for termination */
|
||||
#define ORTE_PROC_STATE_RUNNING 0x00000010 /* daemon has locally fork'd process */
|
||||
#define ORTE_PROC_STATE_REGISTERED 0x00000020 /* process has registered for sync */
|
||||
/*
|
||||
@ -64,6 +65,7 @@ typedef uint32_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */
|
||||
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
|
||||
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
|
||||
|
||||
/*
|
||||
* Job state codes
|
||||
@ -96,6 +98,7 @@ typedef uint32_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */
|
||||
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
||||
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
|
||||
|
||||
/* the job never even attempted to launch due to an error earlier in the
|
||||
* launch procedure
|
||||
@ -128,6 +131,8 @@ orte_node_state_t) */
|
||||
#define ORTE_NODE_STATE_REBOOT 3
|
||||
/** Node is up, but not available for use for the next mapping */
|
||||
#define ORTE_NODE_STATE_DO_NOT_USE 4
|
||||
/** Node is up, but not part of the node pool for jobs */
|
||||
#define ORTE_NODE_STATE_NOT_INCLUDED 5
|
||||
|
||||
/*
|
||||
* PLM commands
|
||||
|
@ -23,6 +23,13 @@
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_SYS_SIGNAL_H
|
||||
#include <sys/signal.h>
|
||||
#else
|
||||
#ifdef HAVE_SIGNAL_H
|
||||
#include <signal.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
@ -132,8 +139,8 @@ int orte_err2str(int errnum, const char **errmsg)
|
||||
case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP:
|
||||
retval = "System will determine resources during bootstrap of daemons";
|
||||
break;
|
||||
case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED:
|
||||
retval = "Limit on number of process relocations was exceeded";
|
||||
case ORTE_ERR_RESTART_LIMIT_EXCEEDED:
|
||||
retval = "Limit on number of process restarts was exceeded";
|
||||
break;
|
||||
case ORTE_ERR_UNRECOVERABLE:
|
||||
retval = "Unrecoverable error";
|
||||
@ -202,6 +209,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
return "ABORT IN PROGRESS";
|
||||
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
||||
return "HEARTBEAT FAILED";
|
||||
case ORTE_JOB_STATE_PROCS_MIGRATING:
|
||||
return "PROCS MIGRATING";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
@ -244,8 +253,80 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||
return "HEARTBEAT FAILED";
|
||||
break;
|
||||
case ORTE_PROC_STATE_MIGRATING:
|
||||
return "MIGRATING";
|
||||
case ORTE_PROC_STATE_CANNOT_RESTART:
|
||||
return "CANNOT BE RESTARTED";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Translate a process exit code into the name of a signal.
 *
 * The signal number is taken to be (exit_code - 128); that number is then
 * looked up in a table of known signals.
 *
 * @param exit_code  Exit code to translate.
 *
 * @return Pointer to a string literal holding the signal name (for example
 *         "SIGSEGV"), or "UNRECOGNIZED" when (exit_code - 128) does not
 *         match any signal in the table.  The returned string has static
 *         storage duration and must not be freed or modified.
 */
const char *orte_proc_exit_code_to_signal(int exit_code)
{
    /*
     * Only SIGABRT, SIGFPE, SIGILL, SIGINT, SIGSEGV and SIGTERM are
     * guaranteed by ISO C; every other entry is guarded so the file still
     * compiles on platforms that lack it.  A table is used instead of a
     * switch so that two macros sharing one value on some platform do not
     * produce a duplicate-case compile error (the first entry wins).
     */
    static const struct {
        int signum;
        const char *name;
    } known_signals[] = {
#ifdef SIGHUP
        { SIGHUP, "SIGHUP" },
#endif
        { SIGINT, "SIGINT" },
#ifdef SIGQUIT
        { SIGQUIT, "SIGQUIT" },
#endif
        { SIGILL, "SIGILL" },
#ifdef SIGTRAP
        { SIGTRAP, "SIGTRAP" },
#endif
        { SIGABRT, "SIGABRT" },
        { SIGFPE, "SIGFPE" },
#ifdef SIGKILL
        { SIGKILL, "SIGKILL" },
#endif
#ifdef SIGBUS
        { SIGBUS, "SIGBUS" },
#endif
        { SIGSEGV, "SIGSEGV" },
#ifdef SIGPIPE
        { SIGPIPE, "SIGPIPE" },
#endif
#ifdef SIGALRM
        { SIGALRM, "SIGALRM" },
#endif
        { SIGTERM, "SIGTERM" },
#ifdef SIGURG
        { SIGURG, "SIGURG" },
#endif
#ifdef SIGSTOP
        { SIGSTOP, "SIGSTOP" },
#endif
#ifdef SIGTSTP
        { SIGTSTP, "SIGTSTP" },
#endif
#ifdef SIGCONT
        { SIGCONT, "SIGCONT" },
#endif
#ifdef SIGCHLD
        { SIGCHLD, "SIGCHLD" },
#endif
#ifdef SIGTTIN
        { SIGTTIN, "SIGTTIN" },
#endif
#ifdef SIGTTOU
        { SIGTTOU, "SIGTTOU" },
#endif
#ifdef SIGIO
        { SIGIO, "SIGIO" },
#endif
#ifdef SIGXCPU
        { SIGXCPU, "SIGXCPU" },
#endif
#ifdef SIGXFSZ
        { SIGXFSZ, "SIGXFSZ" },
#endif
#ifdef SIGVTALRM
        { SIGVTALRM, "SIGVTALRM" },
#endif
#ifdef SIGPROF
        { SIGPROF, "SIGPROF" },
#endif
#ifdef SIGWINCH
        { SIGWINCH, "SIGWINCH" },
#endif
#ifdef SIGUSR1
        { SIGUSR1, "SIGUSR1" },
#endif
#ifdef SIGUSR2
        { SIGUSR2, "SIGUSR2" },
#endif
    };
    size_t idx;
    int signum;

    /*
     * Signal numbers are positive, so nothing at or below 128 can match.
     * Rejecting these first also keeps the subtraction below from
     * overflowing for exit codes near INT_MIN.
     */
    if (exit_code <= 128) {
        return "UNRECOGNIZED";
    }

    /* named signum rather than signal to avoid shadowing signal() */
    signum = exit_code - 128;

    for (idx = 0; idx < sizeof(known_signals) / sizeof(known_signals[0]); idx++) {
        if (known_signals[idx].signum == signum) {
            return known_signals[idx].name;
        }
    }

    return "UNRECOGNIZED";
}
|
||||
|
||||
|
@ -39,6 +39,7 @@ ORTE_DECLSPEC const char *orte_job_state_to_str(orte_job_state_t state);
|
||||
|
||||
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
|
||||
|
||||
ORTE_DECLSPEC const char *orte_proc_exit_code_to_signal(int exit_code);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user