From b5de068533e114ecb08282d1ce2e86a8fa0092cb Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 14 Feb 2011 19:29:09 +0000 Subject: [PATCH] Clean up an error in r24371 - can't use a const parameter as target in asprintf as it changes the value of the address. Add some new proc/job states Rename a constant to reflect coming change - remove the arbitrary difference between restarting a proc locally and relocating it to another node in terms of the number of restarts allowed. Add pretty-print of signals for "proc aborted due to signal" reports. This commit was SVN r24378. The following SVN revision numbers were found above: r24371 --> open-mpi/ompi@93d28a57923aad6123ffc44675e4abb183524b73 --- opal/util/error.c | 8 +-- orte/include/orte/constants.h | 2 +- orte/mca/errmgr/hnp/errmgr_hnp.c | 2 +- orte/mca/plm/plm_types.h | 5 ++ orte/util/error_strings.c | 85 +++++++++++++++++++++++++++++++- orte/util/error_strings.h | 1 + 6 files changed, 95 insertions(+), 8 deletions(-) diff --git a/opal/util/error.c b/opal/util/error.c index 310609a799..f073df1c20 100644 --- a/opal/util/error.c +++ b/opal/util/error.c @@ -67,7 +67,7 @@ opal_strerror_int(int errnum, const char **str) /* caller must free string */ static int -opal_strerror_unknown(int errnum, const char **str) +opal_strerror_unknown(int errnum, char **str) { int i; *str = NULL; @@ -105,7 +105,7 @@ opal_perror(int errnum, const char *msg) if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) { perror(msg); } else { - const char *ue_msg; + char *ue_msg; ret = opal_strerror_unknown(errnum, &ue_msg); fprintf(stderr, "%s\n", ue_msg); free(ue_msg); @@ -134,7 +134,7 @@ opal_strerror(int errnum) ret = opal_strerror_int(errnum, &errmsg); if (OPAL_SUCCESS != ret) { - const char *ue_msg; + char *ue_msg; ret = opal_strerror_unknown(errnum, &ue_msg); snprintf(unknown_retbuf, UNKNOWN_RETBUF_LEN, "%s", ue_msg); free(ue_msg); @@ -159,7 +159,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen) strncpy(strerrbuf, 
tmp, buflen); return OPAL_SUCCESS; } else { - const char *ue_msg; + char *ue_msg; ret = opal_strerror_unknown(errnum, &ue_msg); len = snprintf(strerrbuf, buflen, "%s", ue_msg); free(ue_msg); diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h index 952f010385..8994dae31d 100644 --- a/orte/include/orte/constants.h +++ b/orte/include/orte/constants.h @@ -111,7 +111,7 @@ enum { ORTE_ERR_SYS_LIMITS_SOCKETS = (ORTE_ERR_BASE - 29), ORTE_ERR_SOCKET_NOT_AVAILABLE = (ORTE_ERR_BASE - 30), ORTE_ERR_SYSTEM_WILL_BOOTSTRAP = (ORTE_ERR_BASE - 31), - ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32), + ORTE_ERR_RESTART_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32), ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33), ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34), ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35), diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 374f079bb5..f93d86db7c 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -1584,7 +1584,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, pdata->relocates++; /* have we exceeded the number of relocates for this proc? 
*/ if (app->max_global_restarts < pdata->relocates) { - return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; + return ORTE_ERR_RESTART_LIMIT_EXCEEDED; } /* reset the job params for restart */ diff --git a/orte/mca/plm/plm_types.h b/orte/mca/plm/plm_types.h index 38a5d71c73..994c2cc2b8 100644 --- a/orte/mca/plm/plm_types.h +++ b/orte/mca/plm/plm_types.h @@ -44,6 +44,7 @@ typedef uint32_t orte_proc_state_t; #define ORTE_PROC_STATE_INIT 0x00000001 /* process entry has been created by rmaps */ #define ORTE_PROC_STATE_RESTART 0x00000002 /* the proc is ready for restart */ #define ORTE_PROC_STATE_LAUNCHED 0x00000004 /* process has been launched */ +#define ORTE_PROC_STATE_TERMINATE 0x00000008 /* process is marked for termination */ #define ORTE_PROC_STATE_RUNNING 0x00000010 /* daemon has locally fork'd process */ #define ORTE_PROC_STATE_REGISTERED 0x00000020 /* process has registered for sync */ /* @@ -64,6 +65,7 @@ typedef uint32_t orte_proc_state_t; #define ORTE_PROC_STATE_CALLED_ABORT 0x00008000 /* process called "errmgr.abort" */ #define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */ #define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */ +#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */ /* * Job state codes @@ -96,6 +98,7 @@ typedef uint32_t orte_job_state_t; #define ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED 0x00004000 /* job had a process that exceeded a sensor limit */ #define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */ #define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */ +#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */ /* the job never even attempted to launch due to an error earlier in the * launch procedure @@ -128,6 +131,8 @@ orte_node_state_t) */ #define ORTE_NODE_STATE_REBOOT 3 /** Node is up, but not available for use for the next mapping */ #define ORTE_NODE_STATE_DO_NOT_USE 4 
+/** Node is up, but not part of the node pool for jobs */ +#define ORTE_NODE_STATE_NOT_INCLUDED 5 /* * PLM commands diff --git a/orte/util/error_strings.c b/orte/util/error_strings.c index d1476b37fe..e9aec4d626 100644 --- a/orte/util/error_strings.c +++ b/orte/util/error_strings.c @@ -23,6 +23,13 @@ #include "orte/constants.h" #include +#ifdef HAVE_SYS_SIGNAL_H +#include <sys/signal.h> +#else +#ifdef HAVE_SIGNAL_H +#include <signal.h> +#endif +#endif #include "opal/util/opal_sos.h" #include "orte/util/error_strings.h" @@ -132,8 +139,8 @@ int orte_err2str(int errnum, const char **errmsg) case ORTE_ERR_SYSTEM_WILL_BOOTSTRAP: retval = "System will determine resources during bootstrap of daemons"; break; - case ORTE_ERR_RELOCATE_LIMIT_EXCEEDED: - retval = "Limit on number of process relocations was exceeded"; + case ORTE_ERR_RESTART_LIMIT_EXCEEDED: + retval = "Limit on number of process restarts was exceeded"; break; case ORTE_ERR_UNRECOVERABLE: retval = "Unrecoverable error"; @@ -202,6 +209,8 @@ const char *orte_job_state_to_str(orte_job_state_t state) return "ABORT IN PROGRESS"; case ORTE_JOB_STATE_HEARTBEAT_FAILED: return "HEARTBEAT FAILED"; + case ORTE_JOB_STATE_PROCS_MIGRATING: + return "PROCS MIGRATING"; default: return "UNKNOWN STATE!"; } @@ -244,8 +253,80 @@ const char *orte_proc_state_to_str(orte_proc_state_t state) case ORTE_PROC_STATE_HEARTBEAT_FAILED: return "HEARTBEAT FAILED"; break; + case ORTE_PROC_STATE_MIGRATING: + return "MIGRATING"; + case ORTE_PROC_STATE_CANNOT_RESTART: + return "CANNOT BE RESTARTED"; default: return "UNKNOWN STATE!"; } } +const char *orte_proc_exit_code_to_signal(int exit_code) +{ + int signal; + + signal = exit_code - 128; + + switch(signal) { + case SIGHUP: + return "SIGHUP"; + case SIGINT: + return "SIGINT"; + case SIGQUIT: + return "SIGQUIT"; + case SIGILL: + return "SIGILL"; + case SIGTRAP: + return "SIGTRAP"; + case SIGABRT: + return "SIGABRT"; + case SIGFPE: + return "SIGFPE"; + case SIGKILL: + return "SIGKILL"; + case SIGBUS: + return "SIGBUS"; + 
case SIGSEGV: + return "SIGSEGV"; + case SIGPIPE: + return "SIGPIPE"; + case SIGALRM: + return "SIGALRM"; + case SIGTERM: + return "SIGTERM"; + case SIGURG: + return "SIGURG"; + case SIGSTOP: + return "SIGSTOP"; + case SIGTSTP: + return "SIGTSTP"; + case SIGCONT: + return "SIGCONT"; + case SIGCHLD: + return "SIGCHLD"; + case SIGTTIN: + return "SIGTTIN"; + case SIGTTOU: + return "SIGTTOU"; + case SIGIO: + return "SIGIO"; + case SIGXCPU: + return "SIGXCPU"; + case SIGXFSZ: + return "SIGXFSZ"; + case SIGVTALRM: + return "SIGVTALRM"; + case SIGPROF: + return "SIGPROF"; + case SIGWINCH: + return "SIGWINCH"; + case SIGUSR1: + return "SIGUSR1"; + case SIGUSR2: + return "SIGUSR2"; + default: + return "UNRECOGNIZED"; + } +} + diff --git a/orte/util/error_strings.h b/orte/util/error_strings.h index a0520352b2..ce8c9868ba 100644 --- a/orte/util/error_strings.h +++ b/orte/util/error_strings.h @@ -39,6 +39,7 @@ ORTE_DECLSPEC const char *orte_job_state_to_str(orte_job_state_t state); ORTE_DECLSPEC const char *orte_proc_state_to_str(orte_proc_state_t state); +ORTE_DECLSPEC const char *orte_proc_exit_code_to_signal(int exit_code); END_C_DECLS #endif