Clean up an error in r24371 - can't use a const parameter as target in asprintf as it changes the value of the address.
Add some new proc/job states. Rename a constant to reflect a coming change: removing the arbitrary difference between restarting a proc locally and relocating it to another node in terms of the number of restarts allowed. Add pretty-printing of signals for "proc aborted due to signal" reports. This commit was SVN r24378. The following SVN revision numbers were found above: r24371 --> open-mpi/ompi@93d28a5792
Этот коммит содержится в:
родитель
e8c2519280
Коммит
b5de068533
@ -67,7 +67,7 @@ opal_strerror_int(int errnum, const char **str)
|
|||||||
|
|
||||||
/* caller must free string */
|
/* caller must free string */
|
||||||
static int
|
static int
|
||||||
opal_strerror_unknown(int errnum, const char **str)
|
opal_strerror_unknown(int errnum, char **str)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
*str = NULL;
|
*str = NULL;
|
||||||
@ -105,7 +105,7 @@ opal_perror(int errnum, const char *msg)
|
|||||||
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
|
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
|
||||||
perror(msg);
|
perror(msg);
|
||||||
} else {
|
} else {
|
||||||
const char *ue_msg;
|
char *ue_msg;
|
||||||
ret = opal_strerror_unknown(errnum, &ue_msg);
|
ret = opal_strerror_unknown(errnum, &ue_msg);
|
||||||
fprintf(stderr, "%s\n", ue_msg);
|
fprintf(stderr, "%s\n", ue_msg);
|
||||||
free(ue_msg);
|
free(ue_msg);
|
||||||
@ -134,7 +134,7 @@ opal_strerror(int errnum)
|
|||||||
ret = opal_strerror_int(errnum, &errmsg);
|
ret = opal_strerror_int(errnum, &errmsg);
|
||||||
|
|
||||||
if (OPAL_SUCCESS != ret) {
|
if (OPAL_SUCCESS != ret) {
|
||||||
const char *ue_msg;
|
char *ue_msg;
|
||||||
ret = opal_strerror_unknown(errnum, &ue_msg);
|
ret = opal_strerror_unknown(errnum, &ue_msg);
|
||||||
snprintf(unknown_retbuf, UNKNOWN_RETBUF_LEN, "%s", ue_msg);
|
snprintf(unknown_retbuf, UNKNOWN_RETBUF_LEN, "%s", ue_msg);
|
||||||
free(ue_msg);
|
free(ue_msg);
|
||||||
@ -159,7 +159,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen)
|
|||||||
strncpy(strerrbuf, tmp, buflen);
|
strncpy(strerrbuf, tmp, buflen);
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
} else {
|
} else {
|
||||||
const char *ue_msg;
|
char *ue_msg;
|
||||||
ret = opal_strerror_unknown(errnum, &ue_msg);
|
ret = opal_strerror_unknown(errnum, &ue_msg);
|
||||||
len = snprintf(strerrbuf, buflen, "%s", ue_msg);
|
len = snprintf(strerrbuf, buflen, "%s", ue_msg);
|
||||||
free(ue_msg);
|
free(ue_msg);
|
||||||
|
@ -111,7 +111,7 @@ enum {
|
|||||||
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29),
|
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29),
|
||||||
ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30),
|
ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30),
|
||||||
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
|
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
|
||||||
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
|
ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
|
||||||
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
|
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
|
||||||
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
|
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
|
||||||
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35),
|
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35),
|
||||||
|
@ -1584,7 +1584,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
|||||||
pdata->relocates++;
|
pdata->relocates++;
|
||||||
/* have we exceeded the number of relocates for this proc? */
|
/* have we exceeded the number of relocates for this proc? */
|
||||||
if (app->max_global_restarts < pdata->relocates) {
|
if (app->max_global_restarts < pdata->relocates) {
|
||||||
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
|
return ORTE_ERR_RESTART_LIMIT_EXCEEDED;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* reset the job params for restart */
|
/* reset the job params for restart */
|
||||||
|
@ -44,6 +44,7 @@ typedef uint32_t orte_proc_state_t;
|
|||||||
#define ORTE_PROC_STATE_INIT 0x00000001 /* process entry has been created by rmaps */
|
#define ORTE_PROC_STATE_INIT 0x00000001 /* process entry has been created by rmaps */
|
||||||
#define ORTE_PROC_STATE_RESTART 0x00000002 /* the proc is ready for restart */
|
#define ORTE_PROC_STATE_RESTART 0x00000002 /* the proc is ready for restart */
|
||||||
#define ORTE_PROC_STATE_LAUNCHED 0x00000004 /* process has been launched */
|
#define ORTE_PROC_STATE_LAUNCHED 0x00000004 /* process has been launched */
|
||||||
|
#define ORTE_PROC_STATE_TERMINATE 0x00000008 /* process is marked for termination */
|
||||||
#define ORTE_PROC_STATE_RUNNING 0x00000010 /* daemon has locally fork'd process */
|
#define ORTE_PROC_STATE_RUNNING 0x00000010 /* daemon has locally fork'd process */
|
||||||
#define ORTE_PROC_STATE_REGISTERED 0x00000020 /* process has registered for sync */
|
#define ORTE_PROC_STATE_REGISTERED 0x00000020 /* process has registered for sync */
|
||||||
/*
|
/*
|
||||||
@ -64,6 +65,7 @@ typedef uint32_t orte_proc_state_t;
|
|||||||
#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */
|
#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */
|
||||||
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||||
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
|
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
|
||||||
|
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Job state codes
|
* Job state codes
|
||||||
@ -96,6 +98,7 @@ typedef uint32_t orte_job_state_t;
|
|||||||
#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */
|
#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */
|
||||||
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
||||||
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||||
|
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
|
||||||
|
|
||||||
/* the job never even attempted to launch due to an error earlier in the
|
/* the job never even attempted to launch due to an error earlier in the
|
||||||
* launch procedure
|
* launch procedure
|
||||||
@ -128,6 +131,8 @@ orte_node_state_t) */
|
|||||||
#define ORTE_NODE_STATE_REBOOT 3
|
#define ORTE_NODE_STATE_REBOOT 3
|
||||||
/** Node is up, but not available for use for the next mapping */
|
/** Node is up, but not available for use for the next mapping */
|
||||||
#define ORTE_NODE_STATE_DO_NOT_USE 4
|
#define ORTE_NODE_STATE_DO_NOT_USE 4
|
||||||
|
/** Node is up, but not part of the node pool for jobs */
|
||||||
|
#define ORTE_NODE_STATE_NOT_INCLUDED 5
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* PLM commands
|
* PLM commands
|
||||||
|
@ -23,6 +23,13 @@
|
|||||||
#include "orte/constants.h"
|
#include "orte/constants.h"
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#ifdef HAVE_SYS_SIGNAL_H
|
||||||
|
#include <sys/signal.h>
|
||||||
|
#else
|
||||||
|
#ifdef HAVE_SIGNAL_H
|
||||||
|
#include <signal.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "opal/util/opal_sos.h"
|
#include "opal/util/opal_sos.h"
|
||||||
#include "orte/util/error_strings.h"
|
#include "orte/util/error_strings.h"
|
||||||
@ -132,8 +139,8 @@ int orte_err2str(int errnum, const char **errmsg)
|
|||||||
case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP:
|
case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP:
|
||||||
retval = "System will determine resources during bootstrap of daemons";
|
retval = "System will determine resources during bootstrap of daemons";
|
||||||
break;
|
break;
|
||||||
case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED:
|
case ORTE_ERR_RESTART_LIMIT_EXCEEDED:
|
||||||
retval = "Limit on number of process relocations was exceeded";
|
retval = "Limit on number of process restarts was exceeded";
|
||||||
break;
|
break;
|
||||||
case ORTE_ERR_UNRECOVERABLE:
|
case ORTE_ERR_UNRECOVERABLE:
|
||||||
retval = "Unrecoverable error";
|
retval = "Unrecoverable error";
|
||||||
@ -202,6 +209,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
|||||||
return "ABORT IN PROGRESS";
|
return "ABORT IN PROGRESS";
|
||||||
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
||||||
return "HEARTBEAT FAILED";
|
return "HEARTBEAT FAILED";
|
||||||
|
case ORTE_JOB_STATE_PROCS_MIGRATING:
|
||||||
|
return "PROCS MIGRATING";
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN STATE!";
|
return "UNKNOWN STATE!";
|
||||||
}
|
}
|
||||||
@ -244,8 +253,80 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
|||||||
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
|
||||||
return "HEARTBEAT FAILED";
|
return "HEARTBEAT FAILED";
|
||||||
break;
|
break;
|
||||||
|
case ORTE_PROC_STATE_MIGRATING:
|
||||||
|
return "MIGRATING";
|
||||||
|
case ORTE_PROC_STATE_CANNOT_RESTART:
|
||||||
|
return "CANNOT BE RESTARTED";
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN STATE!";
|
return "UNKNOWN STATE!";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char *orte_proc_exit_code_to_signal(int exit_code)
|
||||||
|
{
|
||||||
|
int signal;
|
||||||
|
|
||||||
|
signal = exit_code - 128;
|
||||||
|
|
||||||
|
switch(signal) {
|
||||||
|
case SIGHUP:
|
||||||
|
return "SIGHUP";
|
||||||
|
case SIGINT:
|
||||||
|
return "SIGINT";
|
||||||
|
case SIGQUIT:
|
||||||
|
return "SIGQUIT";
|
||||||
|
case SIGILL:
|
||||||
|
return "SIGILL";
|
||||||
|
case SIGTRAP:
|
||||||
|
return "SIGTRAP";
|
||||||
|
case SIGABRT:
|
||||||
|
return "SIGABRT";
|
||||||
|
case SIGFPE:
|
||||||
|
return "SIGFPE";
|
||||||
|
case SIGKILL:
|
||||||
|
return "SIGKILL";
|
||||||
|
case SIGBUS:
|
||||||
|
return "SIGBUS";
|
||||||
|
case SIGSEGV:
|
||||||
|
return "SIGSEGV";
|
||||||
|
case SIGPIPE:
|
||||||
|
return "SIGPIPE";
|
||||||
|
case SIGALRM:
|
||||||
|
return "SIGALRM";
|
||||||
|
case SIGTERM:
|
||||||
|
return "SIGTERM";
|
||||||
|
case SIGURG:
|
||||||
|
return "SIGURG";
|
||||||
|
case SIGSTOP:
|
||||||
|
return "SIGSTOP";
|
||||||
|
case SIGTSTP:
|
||||||
|
return "SIGTSTP";
|
||||||
|
case SIGCONT:
|
||||||
|
return "SIGCONT";
|
||||||
|
case SIGCHLD:
|
||||||
|
return "SIGCHLD";
|
||||||
|
case SIGTTIN:
|
||||||
|
return "SIGTTIN";
|
||||||
|
case SIGTTOU:
|
||||||
|
return "SIGTTOU";
|
||||||
|
case SIGIO:
|
||||||
|
return "SIGIO";
|
||||||
|
case SIGXCPU:
|
||||||
|
return "SIGXCPU";
|
||||||
|
case SIGXFSZ:
|
||||||
|
return "SIGXFSZ";
|
||||||
|
case SIGVTALRM:
|
||||||
|
return "SIGVTALRM";
|
||||||
|
case SIGPROF:
|
||||||
|
return "SIGPROF";
|
||||||
|
case SIGWINCH:
|
||||||
|
return "SIGWINCH";
|
||||||
|
case SIGUSR1:
|
||||||
|
return "SIGUSR1";
|
||||||
|
case SIGUSR2:
|
||||||
|
return "SIGUSR2";
|
||||||
|
default:
|
||||||
|
return "UNRECOGNIZED";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -39,6 +39,7 @@ ORTE_DECLSPEC const char *orte_job_state_to_str(orte_job_state_t state);
|
|||||||
|
|
||||||
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
|
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
|
||||||
|
|
||||||
|
ORTE_DECLSPEC const char *orte_proc_exit_code_to_signal(int exit_code);
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
#endif
|
#endif
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user