1
1

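Clean up an error introduced in r24371: a const-qualified string pointer cannot be used as the target of asprintf, because asprintf stores a newly allocated address through that pointer (and the caller must later free it).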

Add some new proc/job states

Rename a constant to reflect a coming change: removing the arbitrary difference, in terms of the number of restarts allowed, between restarting a proc locally and relocating it to another node.

Add pretty-print of signals for "proc aborted due to signal" reports.

This commit was SVN r24378.

The following SVN revision numbers were found above:
  r24371 --> open-mpi/ompi@93d28a5792
Этот коммит содержится в:
Ralph Castain 2011-02-14 19:29:09 +00:00
родитель e8c2519280
Коммит b5de068533
6 изменённых файлов: 95 добавлений и 8 удалений

Просмотреть файл

@ -67,7 +67,7 @@ opal_strerror_int(int errnum, const char **str)
/* caller must free string */
static int
opal_strerror_unknown(int errnum, const char **str)
opal_strerror_unknown(int errnum, char **str)
{
int i;
*str = NULL;
@ -105,7 +105,7 @@ opal_perror(int errnum, const char *msg)
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
perror(msg);
} else {
const char *ue_msg;
char *ue_msg;
ret = opal_strerror_unknown(errnum, &ue_msg);
fprintf(stderr, "%s\n", ue_msg);
free(ue_msg);
@ -134,7 +134,7 @@ opal_strerror(int errnum)
ret = opal_strerror_int(errnum, &errmsg);
if (OPAL_SUCCESS != ret) {
const char *ue_msg;
char *ue_msg;
ret = opal_strerror_unknown(errnum, &ue_msg);
snprintf(unknown_retbuf, UNKNOWN_RETBUF_LEN, "%s", ue_msg);
free(ue_msg);
@ -159,7 +159,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen)
strncpy(strerrbuf, tmp, buflen);
return OPAL_SUCCESS;
} else {
const char *ue_msg;
char *ue_msg;
ret = opal_strerror_unknown(errnum, &ue_msg);
len = snprintf(strerrbuf, buflen, "%s", ue_msg);
free(ue_msg);

Просмотреть файл

@ -111,7 +111,7 @@ enum {
ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29),
ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30),
ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31),
ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32),
ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33),
ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34),
ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35),

Просмотреть файл

@ -1584,7 +1584,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
pdata->relocates++;
/* have we exceeded the number of relocates for this proc? */
if (app->max_global_restarts < pdata->relocates) {
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
return ORTE_ERR_RESTART_LIMIT_EXCEEDED;
}
/* reset the job params for restart */

Просмотреть файл

@ -44,6 +44,7 @@ typedef uint32_t orte_proc_state_t;
#define ORTE_PROC_STATE_INIT 0x00000001 /* process entry has been created by rmaps */
#define ORTE_PROC_STATE_RESTART 0x00000002 /* the proc is ready for restart */
#define ORTE_PROC_STATE_LAUNCHED 0x00000004 /* process has been launched */
#define ORTE_PROC_STATE_TERMINATE 0x00000008 /* process is marked for termination */
#define ORTE_PROC_STATE_RUNNING 0x00000010 /* daemon has locally fork'd process */
#define ORTE_PROC_STATE_REGISTERED 0x00000020 /* process has registered for sync */
/*
@ -64,6 +65,7 @@ typedef uint32_t orte_proc_state_t;
#define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
/*
* Job state codes
@ -96,6 +98,7 @@ typedef uint32_t orte_job_state_t;
#define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
/* the job never even attempted to launch due to an error earlier in the
* launch procedure
@ -128,6 +131,8 @@ orte_node_state_t) */
#define ORTE_NODE_STATE_REBOOT 3
/** Node is up, but not available for use for the next mapping */
#define ORTE_NODE_STATE_DO_NOT_USE 4
/** Node is up, but not part of the node pool for jobs */
#define ORTE_NODE_STATE_NOT_INCLUDED 5
/*
* PLM commands

Просмотреть файл

@ -23,6 +23,13 @@
#include "orte/constants.h"
#include <stdio.h>
#ifdef HAVE_SYS_SIGNAL_H
#include <sys/signal.h>
#else
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#endif
#include "opal/util/opal_sos.h"
#include "orte/util/error_strings.h"
@ -132,8 +139,8 @@ int orte_err2str(int errnum, const char **errmsg)
case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP:
retval = "System will determine resources during bootstrap of daemons";
break;
case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED:
retval = "Limit on number of process relocations was exceeded";
case ORTE_ERR_RESTART_LIMIT_EXCEEDED:
retval = "Limit on number of process restarts was exceeded";
break;
case ORTE_ERR_UNRECOVERABLE:
retval = "Unrecoverable error";
@ -202,6 +209,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "ABORT IN PROGRESS";
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
return "HEARTBEAT FAILED";
case ORTE_JOB_STATE_PROCS_MIGRATING:
return "PROCS MIGRATING";
default:
return "UNKNOWN STATE!";
}
@ -244,8 +253,80 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
case ORTE_PROC_STATE_HEARTBEAT_FAILED:
return "HEARTBEAT FAILED";
break;
case ORTE_PROC_STATE_MIGRATING:
return "MIGRATING";
case ORTE_PROC_STATE_CANNOT_RESTART:
return "CANNOT BE RESTARTED";
default:
return "UNKNOWN STATE!";
}
}
/*
 * Translate a process exit code into the name of the signal it encodes.
 *
 * The exit code is interpreted as (128 + signal number); the signal number
 * is therefore recovered as (exit_code - 128).
 *
 * @param exit_code  exit code to decode
 *
 * @return pointer to a statically allocated string holding the signal name
 *         (e.g. "SIGSEGV"), or "UNRECOGNIZED" when the value does not map
 *         to one of the signals listed below. The returned string must not
 *         be modified or freed by the caller.
 */
const char *orte_proc_exit_code_to_signal(int exit_code)
{
    /* Lookup table rather than a switch: a platform that aliases two of
     * these names to the same number would make a switch fail to compile
     * (duplicate case value), whereas here the first match simply wins.
     * Every entry is guarded because only a handful of these macros are
     * guaranteed by ISO C; the rest are POSIX/BSD extensions that may be
     * missing on some platforms.
     */
    static const struct {
        int signo;
        const char *name;
    } known_signals[] = {
#ifdef SIGHUP
        { SIGHUP, "SIGHUP" },
#endif
#ifdef SIGINT
        { SIGINT, "SIGINT" },
#endif
#ifdef SIGQUIT
        { SIGQUIT, "SIGQUIT" },
#endif
#ifdef SIGILL
        { SIGILL, "SIGILL" },
#endif
#ifdef SIGTRAP
        { SIGTRAP, "SIGTRAP" },
#endif
#ifdef SIGABRT
        { SIGABRT, "SIGABRT" },
#endif
#ifdef SIGFPE
        { SIGFPE, "SIGFPE" },
#endif
#ifdef SIGKILL
        { SIGKILL, "SIGKILL" },
#endif
#ifdef SIGBUS
        { SIGBUS, "SIGBUS" },
#endif
#ifdef SIGSEGV
        { SIGSEGV, "SIGSEGV" },
#endif
#ifdef SIGPIPE
        { SIGPIPE, "SIGPIPE" },
#endif
#ifdef SIGALRM
        { SIGALRM, "SIGALRM" },
#endif
#ifdef SIGTERM
        { SIGTERM, "SIGTERM" },
#endif
#ifdef SIGURG
        { SIGURG, "SIGURG" },
#endif
#ifdef SIGSTOP
        { SIGSTOP, "SIGSTOP" },
#endif
#ifdef SIGTSTP
        { SIGTSTP, "SIGTSTP" },
#endif
#ifdef SIGCONT
        { SIGCONT, "SIGCONT" },
#endif
#ifdef SIGCHLD
        { SIGCHLD, "SIGCHLD" },
#endif
#ifdef SIGTTIN
        { SIGTTIN, "SIGTTIN" },
#endif
#ifdef SIGTTOU
        { SIGTTOU, "SIGTTOU" },
#endif
#ifdef SIGIO
        { SIGIO, "SIGIO" },
#endif
#ifdef SIGXCPU
        { SIGXCPU, "SIGXCPU" },
#endif
#ifdef SIGXFSZ
        { SIGXFSZ, "SIGXFSZ" },
#endif
#ifdef SIGVTALRM
        { SIGVTALRM, "SIGVTALRM" },
#endif
#ifdef SIGPROF
        { SIGPROF, "SIGPROF" },
#endif
#ifdef SIGWINCH
        { SIGWINCH, "SIGWINCH" },
#endif
#ifdef SIGUSR1
        { SIGUSR1, "SIGUSR1" },
#endif
#ifdef SIGUSR2
        { SIGUSR2, "SIGUSR2" },
#endif
        /* terminator: 0 is never a valid signal number, and keeping one
         * entry here means the array is never empty even if no signal
         * macro is available
         */
        { 0, "UNRECOGNIZED" }
    };
    int signo;
    int i;

    /* Signal numbers are strictly positive, so anything at or below 128
     * cannot encode a signal. Rejecting it up front also keeps the
     * subtraction below from overflowing for very negative exit codes.
     */
    if (exit_code <= 128) {
        return "UNRECOGNIZED";
    }

    /* named signo (not "signal") to avoid shadowing signal() from <signal.h> */
    signo = exit_code - 128;

    for (i = 0; 0 != known_signals[i].signo; i++) {
        if (signo == known_signals[i].signo) {
            return known_signals[i].name;
        }
    }

    return "UNRECOGNIZED";
}

Просмотреть файл

@ -39,6 +39,7 @@ ORTE_DECLSPEC const char *orte_job_state_to_str(orte_job_state_t state);
ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state);
ORTE_DECLSPEC const char *orte_proc_exit_code_to_signal(int exit_code);
END_C_DECLS
#endif