Merge pull request #1602 from rhc54/topic/psm
Enable PSM to support dynamic processes
Этот коммит содержится в:
Коммит
9c496f767b
@ -70,6 +70,7 @@
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
@ -272,6 +273,9 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
|
||||
int i;
|
||||
orte_app_context_t *app;
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
char *key;
|
||||
orte_job_t *parent;
|
||||
orte_process_name_t name, *nptr;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:setup_job",
|
||||
@ -308,6 +312,50 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
|
||||
ORTE_FLAG_SET(caddy->jdata, ORTE_JOB_FLAG_RECOVERABLE);
|
||||
}
|
||||
|
||||
/* set up transport keys in case the MPI layer needs them. If
|
||||
* this is a dynamic spawn, then use the same keys as the
|
||||
* parent process had so the new/old procs can communicate.
|
||||
* Otherwise we can use the jobfam and stepid as unique keys
|
||||
* because they are unique values assigned by the RM
|
||||
*/
|
||||
nptr = &name;
|
||||
if (orte_get_attribute(&caddy->jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&nptr, OPAL_NAME)) {
|
||||
/* get the parent jdata */
|
||||
if (NULL == (parent = orte_get_job_data_object(name.jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
key = NULL;
|
||||
if (!orte_get_attribute(&parent->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&key, OPAL_STRING) ||
|
||||
NULL == key) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* record it */
|
||||
orte_set_attribute(&caddy->jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, key, OPAL_STRING);
|
||||
/* add the transport key envar to each app */
|
||||
for (i=0; i < caddy->jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &app->env);
|
||||
}
|
||||
free(key);
|
||||
} else {
|
||||
/* this will also record the transport key attribute in the job object, and
|
||||
* add the key envar to each app */
|
||||
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* if app recovery is not defined, set apps to defaults */
|
||||
for (i=0; i < caddy->jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(caddy->jdata->apps, i))) {
|
||||
|
@ -127,7 +127,7 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
orte_exit_code_t exit_code;
|
||||
int32_t rc=ORTE_SUCCESS, ret;
|
||||
orte_app_context_t *app, *child_app;
|
||||
orte_process_name_t name;
|
||||
orte_process_name_t name, *nptr;
|
||||
pid_t pid;
|
||||
bool running;
|
||||
int i, room;
|
||||
@ -162,8 +162,17 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
jdata->originator.jobid = sender->jobid;
|
||||
jdata->originator.vpid = sender->vpid;
|
||||
|
||||
/* get the name of the actual spawn parent - i.e., the proc that actually
|
||||
* requested the spawn */
|
||||
nptr = &name;
|
||||
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&nptr, OPAL_NAME)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
rc = ORTE_ERR_NOT_FOUND;
|
||||
goto ANSWER_LAUNCH;
|
||||
}
|
||||
|
||||
/* get the parent's job object */
|
||||
if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
|
||||
if (NULL != (parent = orte_get_job_data_object(name.jobid))) {
|
||||
/* if the prefix was set in the parent's job, we need to transfer
|
||||
* that prefix to the child's app_context so any further launch of
|
||||
* orteds can find the correct binary. There always has to be at
|
||||
|
@ -1010,14 +1010,6 @@ int orte_submit_job(char *argv[], int *index,
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* pre-condition any network transports that require it */
|
||||
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
orte_show_help("help-orterun.txt", "orterun:precondition", false,
|
||||
orte_basename, NULL, NULL, rc);
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
return rc;
|
||||
}
|
||||
/* setup for debugging */
|
||||
orte_debugger_init_before_spawn(jdata);
|
||||
|
||||
|
@ -187,21 +187,21 @@ int orterun(int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
|
||||
// wait for response and unpack the status, jobid
|
||||
while (orte_event_base_active && launchst.active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Job %s has launched",
|
||||
(NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
|
||||
}
|
||||
if (!orte_event_base_active || ORTE_SUCCESS != launchst.status) {
|
||||
goto DONE;
|
||||
}
|
||||
// wait for response and unpack the status, jobid
|
||||
while (orte_event_base_active && launchst.active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
if (orte_debug_flag) {
|
||||
opal_output(0, "Job %s has launched",
|
||||
(NULL == launchst.jdata) ? "UNKNOWN" : ORTE_JOBID_PRINT(launchst.jdata->jobid));
|
||||
}
|
||||
if (!orte_event_base_active || ORTE_SUCCESS != launchst.status) {
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
while (orte_event_base_active && completest.active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
while (orte_event_base_active && completest.active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* ensure all local procs are dead */
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
@ -279,6 +280,8 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key)
|
||||
return "ORTE_JOB_MULTI_DAEMON_SIM";
|
||||
case ORTE_JOB_NOTIFY_COMPLETION:
|
||||
return "ORTE_JOB_NOTIFY_COMPLETION";
|
||||
case ORTE_JOB_TRANSPORT_KEY:
|
||||
return "ORTE_JOB_TRANSPORT_KEY";
|
||||
|
||||
case ORTE_PROC_NOBARRIER:
|
||||
return "PROC-NOBARRIER";
|
||||
|
@ -139,6 +139,7 @@ typedef uint16_t orte_job_flags_t;
|
||||
#define ORTE_JOB_TIMESTAMP_OUTPUT (ORTE_JOB_START_KEY + 48) // bool - timestamp stdout/stderr
|
||||
#define ORTE_JOB_MULTI_DAEMON_SIM (ORTE_JOB_START_KEY + 49) // bool - multiple daemons/node to simulate large cluster
|
||||
#define ORTE_JOB_NOTIFY_COMPLETION (ORTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates
|
||||
#define ORTE_JOB_TRANSPORT_KEY (ORTE_JOB_START_KEY + 51) // string - transport keys assigned to this job
|
||||
|
||||
#define ORTE_JOB_MAX_KEY 300
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -43,6 +44,7 @@
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/attr.h"
|
||||
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
|
||||
@ -161,6 +163,9 @@ int orte_pre_condition_transports(orte_job_t *jdata)
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* record it in case this job executes a dynamic spawn */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, ORTE_ATTR_LOCAL, string_key, OPAL_STRING);
|
||||
|
||||
if (OPAL_SUCCESS != mca_base_var_env_name ("orte_precondition_transports", &cs_env)) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
free(string_key);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user