1
1

Merge pull request #2043 from rhc54/topic/notifycomplete

Implement notification of completion on comm_spawn'd child jobs.
Этот коммит содержится в:
rhc54 2016-09-01 16:42:30 -05:00 коммит произвёл GitHub
родитель 43b2e3a844 0ea1cff733
Коммит fde6e6c6f8
9 изменённых файлов: 108 добавлений и 5 удалений

Просмотреть файл

@ -89,7 +89,12 @@ enum {
OPAL_ERR_PROC_REQUESTED_ABORT = (OPAL_ERR_BASE - 58),
OPAL_ERR_PROC_ABORTING = (OPAL_ERR_BASE - 59),
OPAL_ERR_NODE_DOWN = (OPAL_ERR_BASE - 60),
OPAL_ERR_NODE_OFFLINE = (OPAL_ERR_BASE - 61)
OPAL_ERR_NODE_OFFLINE = (OPAL_ERR_BASE - 61),
OPAL_ERR_JOB_TERMINATED = (OPAL_ERR_BASE - 62),
OPAL_ERR_PROC_RESTART = (OPAL_ERR_BASE - 63),
OPAL_ERR_PROC_CHECKPOINT = (OPAL_ERR_BASE - 64),
OPAL_ERR_PROC_MIGRATE = (OPAL_ERR_BASE - 65),
OPAL_ERR_EVENT_REGISTRATION = (OPAL_ERR_BASE - 66)
};
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

Просмотреть файл

@ -28,7 +28,7 @@
AC_DEFUN([MCA_opal_pmix_pmix3x_CONFIG],[
AC_CONFIG_FILES([opal/mca/pmix/pmix3x/Makefile])
OPAL_VAR_SCOPE_PUSH([PMIX_VERSION opal_pmix_pmix3x_save_CPPFLAGS opal_pmix_pmix3x_save_LDFLAGS opal_pmix_pmix3x_save_LIBS opal_pmix_pmix3x_basedir opal_pmix_pmix3x_save_cflags])
OPAL_VAR_SCOPE_PUSH([PMIX_VERSION opal_pmix_pmix3x_save_CPPFLAGS opal_pmix_pmix3x_save_LDFLAGS opal_pmix_pmix3x_save_LIBS opal_pmix_pmix3x_basedir opal_pmix_pmix3x_save_cflags opal_pmix_pmix3x_sm_flag])
opal_pmix_pmix3x_basedir=opal/mca/pmix/pmix3x
@ -37,7 +37,19 @@ AC_DEFUN([MCA_opal_pmix_pmix3x_CONFIG],[
opal_pmix_pmix3x_save_LDFLAGS=$LDFLAGS
opal_pmix_pmix3x_save_LIBS=$LIBS
opal_pmix_pmix3x_args="--without-tests-examples --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\" --enable-embedded-hwloc --with-hwloc-header=\\\"$opal_hwloc_base_include\\\""
AC_ARG_ENABLE([pmix3-dstore],
[AC_HELP_STRING([--enable-pmix3-dstore],
[Enable PMIx shared memory data store (default: disabled)])])
AC_MSG_CHECKING([if PMIx3 shared memory data store is enabled])
if test "$enable_pmix3_dstore" = "yes"; then
AC_MSG_RESULT([yes])
opal_pmix_pmix3x_sm_flag=--enable-dstore
else
AC_MSG_RESULT([no (disabled)])
opal_pmix_pmix3x_sm_flag=--disable-dstore
fi
opal_pmix_pmix3x_args="$opal_pmix_pmix3x_sm_flag --without-tests-examples --disable-visibility --enable-embedded-libevent --with-libevent-header=\\\"opal/mca/event/$opal_event_base_include\\\" --enable-embedded-hwloc --with-hwloc-header=\\\"$opal_hwloc_base_include\\\""
AS_IF([test "$enable_debug" = "yes"],
[opal_pmix_pmix3x_args="--enable-debug $opal_pmix_pmix3x_args"
CFLAGS="$OPAL_CFLAGS_BEFORE_PICKY $OPAL_VISIBILITY_CFLAGS -g"],

Просмотреть файл

@ -256,7 +256,8 @@ typedef uint32_t pmix_rank_t;
#define PMIX_QUERY_LOCAL_PROC_TABLE "pmix.qry.lptable" // (char*) input nspace of job whose info is being requested
// returns (pmix_data_array_t) an array of pmix_proc_info_t for
// procs in job on same node
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform"
#define PMIX_QUERY_AUTHORIZATIONS "pmix.qry.auths" // return operations tool is authorized to perform
#define PMIX_QUERY_MEMPROFILE "pmix.qry.mempfle" // (pmix_usage_t) return an array of memory usage stats
/* log attributes */
#define PMIX_LOG_STDERR "pmix.log.stderr" // (bool) log data to stderr
@ -558,6 +559,29 @@ typedef struct pmix_proc_info {
} while (0)
/**** PMIX USAGE STRUCT ****/
/* One usage record: a process identifier, a key buffer holding up to
 * PMIX_MAX_KEYLEN characters plus the terminating NUL, and a float value.
 * NOTE(review): presumably "key" names the statistic being reported and
 * "usage" carries its measured value - confirm against the code that
 * produces these records. */
typedef struct pmix_usage {
    pmix_proc_t proc;
    char key[PMIX_MAX_KEYLEN+1];
    float usage;
} pmix_usage_t;
/* Allocate a zero-filled array of n records and store its address in m.
 * m is NULL afterwards if calloc failed, so callers must check it. */
#define PMIX_USAGE_CREATE(m, n) \
    do { \
        (m) = (pmix_usage_t*)calloc((n) , sizeof(pmix_usage_t)); \
    } while (0)
/* Free a heap-allocated record. PMIX_USAGE_FREE is declared with two
 * parameters, so a count has to be passed here; invoking it with a single
 * argument does not expand and breaks the build at every use site. */
#define PMIX_USAGE_RELEASE(m) PMIX_USAGE_FREE((m), 1)
/* Zero every byte of the record that m points to. */
#define PMIX_USAGE_CONSTRUCT(m) \
    do { \
        memset((m), 0, sizeof(pmix_usage_t)); \
    } while (0)
/* Expands to nothing. */
#define PMIX_USAGE_DESTRUCT(m)
/* Free the memory m points to. The count n is accepted but not used by
 * the expansion. */
#define PMIX_USAGE_FREE(m, n) free((m))
/**** PMIX VALUE STRUCT ****/
typedef struct pmix_data_array {
pmix_data_type_t type;

Просмотреть файл

@ -474,6 +474,21 @@ pmix_status_t pmix3x_convert_opalrc(int rc)
case OPAL_ERR_NODE_OFFLINE:
return PMIX_ERR_NODE_OFFLINE;
case OPAL_ERR_JOB_TERMINATED:
return PMIX_ERR_JOB_TERMINATED;
case OPAL_ERR_PROC_RESTART:
return PMIX_ERR_PROC_RESTART;
case OPAL_ERR_PROC_CHECKPOINT:
return PMIX_ERR_PROC_CHECKPOINT;
case OPAL_ERR_PROC_MIGRATE:
return PMIX_ERR_PROC_MIGRATE;
case OPAL_ERR_EVENT_REGISTRATION:
return PMIX_ERR_EVENT_REGISTRATION;
case OPAL_ERR_NOT_IMPLEMENTED:
case OPAL_ERR_NOT_SUPPORTED:
return PMIX_ERR_NOT_SUPPORTED;
@ -540,6 +555,21 @@ int pmix3x_convert_rc(pmix_status_t rc)
case PMIX_ERR_NODE_OFFLINE:
return OPAL_ERR_NODE_OFFLINE;
case PMIX_ERR_JOB_TERMINATED:
return OPAL_ERR_JOB_TERMINATED;
case PMIX_ERR_PROC_RESTART:
return OPAL_ERR_PROC_RESTART;
case PMIX_ERR_PROC_CHECKPOINT:
return OPAL_ERR_PROC_CHECKPOINT;
case PMIX_ERR_PROC_MIGRATE:
return OPAL_ERR_PROC_MIGRATE;
case PMIX_ERR_EVENT_REGISTRATION:
return OPAL_ERR_EVENT_REGISTRATION;
case PMIX_ERR_NOT_SUPPORTED:
return OPAL_ERR_NOT_SUPPORTED;

Просмотреть файл

@ -277,6 +277,21 @@ opal_err2str(int errnum, const char **errmsg)
case OPAL_ERR_NODE_OFFLINE:
retval = "Node has gone offline";
break;
case OPAL_ERR_JOB_TERMINATED:
retval = "Job terminated";
break;
case OPAL_ERR_PROC_RESTART:
retval = "Process restarted";
break;
case OPAL_ERR_PROC_CHECKPOINT:
retval = "Process checkpoint";
break;
case OPAL_ERR_PROC_MIGRATE:
retval = "Process migrate";
break;
case OPAL_ERR_EVENT_REGISTRATION:
retval = "Event registration";
break;
default:
retval = "UNRECOGNIZED";
}

Просмотреть файл

@ -521,6 +521,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_process_name_t *proc = &caddy->name;
orte_process_name_t wildcard_rank;
orte_proc_state_t state = caddy->proc_state;
orte_job_t *jdata;
orte_proc_t *pdata;
@ -628,6 +629,12 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
jdata->num_terminated++;
if (jdata->num_terminated == jdata->num_procs) {
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
/* if they requested notification upon completion, provide it */
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
wildcard_rank.jobid = jdata->jobid;
wildcard_rank.vpid = ORTE_VPID_WILDCARD;
_send_notification(OPAL_ERR_JOB_TERMINATED, &wildcard_rank);
}
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
!orte_job_term_ordered) {
/* if this was an abnormal term, notify the other procs of the termination */

Просмотреть файл

@ -240,6 +240,11 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
} else {
jdata->stdin_target = strtoul(info->data.string, NULL, 10);
}
} else if (0 == strcmp(info->key, OPAL_PMIX_NOTIFY_COMPLETION)) {
if (OPAL_UNDEF == info->type || info->data.flag) {
orte_set_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION,
ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
}
} else {
/* unrecognized key */
orte_show_help("help-orted.txt", "bad-key",

Просмотреть файл

@ -275,6 +275,10 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key)
return "ORTE-JOB-TAG-OUTPUT";
case ORTE_JOB_TIMESTAMP_OUTPUT:
return "ORTE-JOB-TIMESTAMP-OUTPUT";
case ORTE_JOB_MULTI_DAEMON_SIM:
return "ORTE_JOB_MULTI_DAEMON_SIM";
case ORTE_JOB_NOTIFY_COMPLETION:
return "ORTE_JOB_NOTIFY_COMPLETION";
case ORTE_PROC_NOBARRIER:
return "PROC-NOBARRIER";

Просмотреть файл

@ -138,6 +138,7 @@ typedef uint16_t orte_job_flags_t;
#define ORTE_JOB_TAG_OUTPUT (ORTE_JOB_START_KEY + 47) // bool - tag stdout/stderr
#define ORTE_JOB_TIMESTAMP_OUTPUT (ORTE_JOB_START_KEY + 48) // bool - timestamp stdout/stderr
#define ORTE_JOB_MULTI_DAEMON_SIM (ORTE_JOB_START_KEY + 49) // bool - multiple daemons/node to simulate large cluster
#define ORTE_JOB_NOTIFY_COMPLETION (ORTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates
#define ORTE_JOB_MAX_KEY 300