Merge pull request #2043 from rhc54/topic/notifycomplete
Implement notification of completion on comm_spawn'd child jobs.
Этот коммит содержится в:
Коммит
fde6e6c6f8
@ -89,7 +89,12 @@ enum {
|
||||
OPAL_ERR_PROC_REQUESTED_ABORT = (OPAL_ERR_BASE - 58),
|
||||
OPAL_ERR_PROC_ABORTING = (OPAL_ERR_BASE - 59),
|
||||
OPAL_ERR_NODE_DOWN = (OPAL_ERR_BASE - 60),
|
||||
OPAL_ERR_NODE_OFFLINE = (OPAL_ERR_BASE - 61)
|
||||
OPAL_ERR_NODE_OFFLINE = (OPAL_ERR_BASE - 61),
|
||||
OPAL_ERR_JOB_TERMINATED = (OPAL_ERR_BASE - 62),
|
||||
OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63),
|
||||
OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64),
|
||||
OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65),
|
||||
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66)
|
||||
};
|
||||
|
||||
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)
|
||||
|
@ -28,7 +28,7 @@
|
||||
AC_DEFUN([MCA_opal_pmix_pmix3x_CONFIG],[
|
||||
AC_CONFIG_FILES([opal/mca/pmix/pmix3x/Makefile])
|
||||
|
||||
OPAL_VAR_SCOPE_PUSH([PMIX_VERSION opal_pmix_pmix3x_save_CPPFLAGS opal_pmix_pmix3x_save_LDFLAGS opal_pmix_pmix3x_save_LIBS opal_pmix_pmix3x_basedir opal_pmix_pmix3x_save_cflags])
|
||||
OPAL_VAR_SCOPE_PUSH([PMIX_VERSION opal_pmix_pmix3x_save_CPPFLAGS opal_pmix_pmix3x_save_LDFLAGS opal_pmix_pmix3x_save_LIBS opal_pmix_pmix3x_basedir opal_pmix_pmix3x_save_cflags opal_pmix_pmix3x_sm_flag])
|
||||
|
||||
opal_pmix_pmix3x_basedir=opal/mca/pmix/pmix3x
|
||||
|
||||
@ -37,7 +37,19 @@ AC_DEFUN([MCA_opal_pmix_pmix3x_CONFIG],[
|
||||
opal_pmix_pmix3x_save_LDFLAGS=$LDFLAGS
|
||||
opal_pmix_pmix3x_save_LIBS=$LIBS
|
||||
|
||||
opal_pmix_pmix3x_args="--without-tests-examples --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\" --enable-embedded-hwloc --with-hwloc-header=\\\"$opal_hwloc_base_include\\\""
|
||||
AC_ARG_ENABLE([pmix3-dstore],
|
||||
[AC_HELP_STRING([--enable-pmix3-dstore],
|
||||
[Enable PMIx shared memory data store (default: disabled)])])
|
||||
AC_MSG_CHECKING([if PMIx3 shared memory data store is enabled])
|
||||
if test "$enable_pmix3_dstore" = "yes"; then
|
||||
AC_MSG_RESULT([yes])
|
||||
opal_pmix_pmix3x_sm_flag=--enable-dstore
|
||||
else
|
||||
AC_MSG_RESULT([no (disabled)])
|
||||
opal_pmix_pmix3x_sm_flag=--disable-dstore
|
||||
fi
|
||||
|
||||
opal_pmix_pmix3x_args="$opal_pmix_pmix3x_sm_flag --without-tests-examples --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\" --enable-embedded-hwloc --with-hwloc-header=\\\"$opal_hwloc_base_include\\\""
|
||||
AS_IF([test "$enable_debug" = "yes"],
|
||||
[opal_pmix_pmix3x_args="--enable-debug $opal_pmix_pmix3x_args"
|
||||
CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"],
|
||||
|
@ -256,7 +256,8 @@ typedef uint32_t pmix_rank_t;
|
||||
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
|
||||
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
|
||||
// procs in job on same node
|
||||
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform"
|
||||
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform
|
||||
#define PMIX_QUERY_MEMPROFILE "pmix.qry.mempfle" // (pmix_usage_t) return an array of memory usage stats
|
||||
|
||||
/* log attributes */
|
||||
#define PMIX_LOG_STDERR "pmix.log.stderr" // (bool) log data to stderr
|
||||
@ -558,6 +559,29 @@ typedef struct pmix_proc_info {
|
||||
} while (0)
|
||||
|
||||
|
||||
/**** PMIX USAGE STRUCT ****/
|
||||
/*
 * A single usage record. The PMIX_QUERY_MEMPROFILE attribute comment tags
 * its result as (pmix_usage_t), "an array of memory usage stats".
 */
typedef struct pmix_usage {
|
||||
pmix_proc_t proc; /* presumably the process this record describes -- TODO confirm */
|
||||
char key[PMIX_MAX_KEYLEN+1]; /* fixed-size key buffer; the +1 presumably leaves room for a NUL terminator */
|
||||
float usage; /* recorded value; units are not stated in the visible code */
|
||||
} pmix_usage_t;
|
||||
/*
 * Support macros for pmix_usage_t.
 *
 * PMIX_USAGE_CREATE(m, n)  - set m to a zero-filled array of n
 *                            pmix_usage_t entries obtained from calloc();
 *                            m is NULL if the allocation fails.
 * PMIX_USAGE_RELEASE(m)    - free a single entry; shorthand for
 *                            PMIX_USAGE_FREE(m, 1).
 * PMIX_USAGE_CONSTRUCT(m)  - zero the entry that m points to.
 * PMIX_USAGE_DESTRUCT(m)   - expands to nothing.
 * PMIX_USAGE_FREE(m, n)    - free the array m; n is accepted but not used
 *                            by the expansion.
 */
#define PMIX_USAGE_CREATE(m, n)                                     \
    do {                                                            \
        (m) = (pmix_usage_t*)calloc((n) , sizeof(pmix_usage_t));    \
    } while (0)

/* PMIX_USAGE_FREE takes two arguments, so the count must be supplied here;
 * invoking it with only (m) is a preprocessing error. */
#define PMIX_USAGE_RELEASE(m) PMIX_USAGE_FREE((m), 1)

#define PMIX_USAGE_CONSTRUCT(m)                     \
    do {                                            \
        memset((m), 0, sizeof(pmix_usage_t));       \
    } while (0)

#define PMIX_USAGE_DESTRUCT(m)

#define PMIX_USAGE_FREE(m, n) free((m))
|
||||
|
||||
|
||||
/**** PMIX VALUE STRUCT ****/
|
||||
typedef struct pmix_data_array {
|
||||
pmix_data_type_t type;
|
||||
|
@ -474,6 +474,21 @@ pmix_status_t pmix3x_convert_opalrc(int rc)
|
||||
case OPAL_ERR_NODE_OFFLINE:
|
||||
return PMIX_ERR_NODE_OFFLINE;
|
||||
|
||||
case OPAL_ERR_JOB_TERMINATED:
|
||||
return PMIX_ERR_JOB_TERMINATED;
|
||||
|
||||
case OPAL_ERR_PROC_RESTART:
|
||||
return PMIX_ERR_PROC_RESTART;
|
||||
|
||||
case OPAL_ERR_PROC_CHECKPOINT:
|
||||
return PMIX_ERR_PROC_CHECKPOINT;
|
||||
|
||||
case OPAL_ERR_PROC_MIGRATE:
|
||||
return PMIX_ERR_PROC_MIGRATE;
|
||||
|
||||
case OPAL_ERR_EVENT_REGISTRATION:
|
||||
return PMIX_ERR_EVENT_REGISTRATION;
|
||||
|
||||
case OPAL_ERR_NOT_IMPLEMENTED:
|
||||
case OPAL_ERR_NOT_SUPPORTED:
|
||||
return PMIX_ERR_NOT_SUPPORTED;
|
||||
@ -540,6 +555,21 @@ int pmix3x_convert_rc(pmix_status_t rc)
|
||||
case PMIX_ERR_NODE_OFFLINE:
|
||||
return OPAL_ERR_NODE_OFFLINE;
|
||||
|
||||
case PMIX_ERR_JOB_TERMINATED:
|
||||
return OPAL_ERR_JOB_TERMINATED;
|
||||
|
||||
case PMIX_ERR_PROC_RESTART:
|
||||
return OPAL_ERR_PROC_RESTART;
|
||||
|
||||
case PMIX_ERR_PROC_CHECKPOINT:
|
||||
return OPAL_ERR_PROC_CHECKPOINT;
|
||||
|
||||
case PMIX_ERR_PROC_MIGRATE:
|
||||
return OPAL_ERR_PROC_MIGRATE;
|
||||
|
||||
case PMIX_ERR_EVENT_REGISTRATION:
|
||||
return OPAL_ERR_EVENT_REGISTRATION;
|
||||
|
||||
case PMIX_ERR_NOT_SUPPORTED:
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
|
||||
|
@ -277,6 +277,21 @@ opal_err2str(int errnum, const char **errmsg)
|
||||
case OPAL_ERR_NODE_OFFLINE:
|
||||
retval = "Node has gone offline";
|
||||
break;
|
||||
case OPAL_ERR_JOB_TERMINATED:
|
||||
retval = "Job terminated";
|
||||
break;
|
||||
case OPAL_ERR_PROC_RESTART:
|
||||
retval = "Process restarted";
|
||||
break;
|
||||
case OPAL_ERR_PROC_CHECKPOINT:
|
||||
retval = "Process checkpoint";
|
||||
break;
|
||||
case OPAL_ERR_PROC_MIGRATE:
|
||||
retval = "Process migrate";
|
||||
break;
|
||||
case OPAL_ERR_EVENT_REGISTRATION:
|
||||
retval = "Event registration";
|
||||
break;
|
||||
default:
|
||||
retval = "UNRECOGNIZED";
|
||||
}
|
||||
|
@ -521,6 +521,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_process_name_t *proc = &caddy->name;
|
||||
orte_process_name_t wildcard_rank;
|
||||
orte_proc_state_t state = caddy->proc_state;
|
||||
orte_job_t *jdata;
|
||||
orte_proc_t *pdata;
|
||||
@ -628,6 +629,12 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
||||
jdata->num_terminated++;
|
||||
if (jdata->num_terminated == jdata->num_procs) {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
/* if they requested notification upon completion, provide it */
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
|
||||
wildcard_rank.jobid = jdata->jobid;
|
||||
wildcard_rank.vpid = ORTE_VPID_WILDCARD;
|
||||
_send_notification(OPAL_ERR_JOB_TERMINATED, &wildcard_rank);
|
||||
}
|
||||
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
|
||||
!orte_job_term_ordered) {
|
||||
/* if this was an abnormal term, notify the other procs of the termination */
|
||||
@ -756,7 +763,7 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
||||
* we call the errmgr so that any attempt to restart the job will
|
||||
* avoid doing so in the exact same place as the current job
|
||||
*/
|
||||
if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
|
||||
if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
|
||||
map = jdata->map;
|
||||
for (index = 0; index < map->nodes->size; index++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
|
||||
|
@ -240,6 +240,11 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
|
||||
} else {
|
||||
jdata->stdin_target = strtoul(info->data.string, NULL, 10);
|
||||
}
|
||||
} else if (0 == strcmp(info->key, OPAL_PMIX_NOTIFY_COMPLETION)) {
|
||||
if (OPAL_UNDEF == info->type || info->data.flag) {
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION,
|
||||
ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
|
||||
}
|
||||
} else {
|
||||
/* unrecognized key */
|
||||
orte_show_help("help-orted.txt", "bad-key",
|
||||
|
@ -275,6 +275,10 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key)
|
||||
return "ORTE-JOB-TAG-OUTPUT";
|
||||
case ORTE_JOB_TIMESTAMP_OUTPUT:
|
||||
return "ORTE-JOB-TIMESTAMP-OUTPUT";
|
||||
case ORTE_JOB_MULTI_DAEMON_SIM:
|
||||
return "ORTE_JOB_MULTI_DAEMON_SIM";
|
||||
case ORTE_JOB_NOTIFY_COMPLETION:
|
||||
return "ORTE_JOB_NOTIFY_COMPLETION";
|
||||
|
||||
case ORTE_PROC_NOBARRIER:
|
||||
return "PROC-NOBARRIER";
|
||||
|
@ -138,6 +138,7 @@ typedef uint16_t orte_job_flags_t;
|
||||
#define ORTE_JOB_TAG_OUTPUT (ORTE_JOB_START_KEY + 47) // bool - tag stdout/stderr
|
||||
#define ORTE_JOB_TIMESTAMP_OUTPUT (ORTE_JOB_START_KEY + 48) // bool - timestamp stdout/stderr
|
||||
#define ORTE_JOB_MULTI_DAEMON_SIM (ORTE_JOB_START_KEY + 49) // bool - multiple daemons/node to simulate large cluster
|
||||
#define ORTE_JOB_NOTIFY_COMPLETION (ORTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates
|
||||
|
||||
#define ORTE_JOB_MAX_KEY 300
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user