Fix tool connection logic so we properly search for default session server, perform specified number of retries, etc.
Signed-off-by: Ralph Castain <rhc@open-mpi.org> (cherry picked from commit 7c755e01004f8b86c71f1729662979ea45ab1adb)
Этот коммит содержится в:
родитель
16de607607
Коммит
e575c4d6f9
@ -125,6 +125,7 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
||||
char myhost[PMIX_MAXHOSTNAMELEN];
|
||||
bool system_level = false;
|
||||
bool system_level_only = false;
|
||||
pid_t pid = 0;
|
||||
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"ptl:tcp: connecting to server");
|
||||
@ -224,12 +225,17 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
||||
system_level = info[n].value.data.flag;
|
||||
}
|
||||
} else if (0 == strcmp(info[n].key, PMIX_SERVER_PIDINFO)) {
|
||||
mca_ptl_tcp_component.tool_pid = info[n].value.data.pid;
|
||||
pid = info[n].value.data.pid;
|
||||
pmix_output(0, "GOT PID %d", (int)pid);
|
||||
} else if (0 == strcmp(info[n].key, PMIX_SERVER_URI)) {
|
||||
if (NULL == mca_ptl_tcp_component.super.uri) {
|
||||
free(mca_ptl_tcp_component.super.uri);
|
||||
}
|
||||
mca_ptl_tcp_component.super.uri = strdup(info[n].value.data.string);
|
||||
} else if (0 == strcmp(info[n].key, PMIX_CONNECT_RETRY_DELAY)) {
|
||||
mca_ptl_tcp_component.wait_to_connect = info[n].value.data.uint32;
|
||||
} else if (0 == strcmp(info[n].key, PMIX_CONNECT_MAX_RETRIES)) {
|
||||
mca_ptl_tcp_component.max_retries = info[n].value.data.uint32;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -263,6 +269,29 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
||||
goto complete;
|
||||
}
|
||||
|
||||
/* if they gave us a pid, then look for it */
|
||||
if (0 != pid) {
|
||||
if (0 > asprintf(&filename, "pmix.%s.tool.%d", myhost, pid)) {
|
||||
return PMIX_ERR_NOMEM;
|
||||
}
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"ptl:tcp:tool searching for given session server %s",
|
||||
filename);
|
||||
nspace = NULL;
|
||||
rc = df_search(mca_ptl_tcp_component.system_tmpdir,
|
||||
filename, &sd, &nspace, &rank);
|
||||
free(filename);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
goto complete;
|
||||
}
|
||||
if (NULL != nspace) {
|
||||
free(nspace);
|
||||
}
|
||||
/* since they gave us a specific pid and we couldn't
|
||||
* connect to it, return an error */
|
||||
return PMIX_ERR_UNREACH;
|
||||
}
|
||||
|
||||
|
||||
/* if they asked for system-level, we start there */
|
||||
if (system_level || system_level_only) {
|
||||
@ -297,31 +326,6 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
||||
return PMIX_ERR_UNREACH;
|
||||
}
|
||||
|
||||
/* now try the session-level connection - if they gave us a pid, then
|
||||
* look for it */
|
||||
if (0 != mca_ptl_tcp_component.tool_pid) {
|
||||
if (0 > asprintf(&filename, "pmix.%s.tool.%d",
|
||||
myhost, mca_ptl_tcp_component.tool_pid)) {
|
||||
return PMIX_ERR_NOMEM;
|
||||
}
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"ptl:tcp:tool searching for given session server %s",
|
||||
filename);
|
||||
nspace = NULL;
|
||||
rc = df_search(mca_ptl_tcp_component.system_tmpdir,
|
||||
filename, &sd, &nspace, &rank);
|
||||
free(filename);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
goto complete;
|
||||
}
|
||||
if (NULL != nspace) {
|
||||
free(nspace);
|
||||
}
|
||||
/* since they gave us a specific pid and we couldn't
|
||||
* connect to it, return an error */
|
||||
return PMIX_ERR_UNREACH;
|
||||
}
|
||||
|
||||
/* they didn't give us a pid, so we will search to see what session-level
|
||||
* tools are available to this user. We will take the first connection
|
||||
* that succeeds - this is based on the likelihood that there is only
|
||||
@ -441,6 +445,11 @@ static pmix_status_t send_oneway(struct pmix_peer_t *peer,
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
static void timeout(int sd, short args, void *cbdata)
|
||||
{
|
||||
pmix_lock_t *lock = (pmix_lock_t*)cbdata;
|
||||
PMIX_WAKEUP_THREAD(lock);
|
||||
}
|
||||
|
||||
/**** SUPPORTING FUNCTIONS ****/
|
||||
static pmix_status_t parse_uri_file(char *filename,
|
||||
@ -450,14 +459,48 @@ static pmix_status_t parse_uri_file(char *filename,
|
||||
{
|
||||
FILE *fp;
|
||||
char *srvr, *p, *p2;
|
||||
pmix_lock_t lock;
|
||||
pmix_event_t ev;
|
||||
struct timeval tv;
|
||||
int retries;
|
||||
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
/* if we cannot open the file, then the server must not
|
||||
* be configured to support tool connections, or this
|
||||
* user isn't authorized to access it */
|
||||
* user isn't authorized to access it - or it may just
|
||||
* not exist yet! Check for existence */
|
||||
if (0 != access(filename, R_OK)) {
|
||||
if (ENOENT == errno) {
|
||||
/* the file does not exist, so give it
|
||||
* a little time to see if the server
|
||||
* is still starting up */
|
||||
retries = 0;
|
||||
do {
|
||||
++retries;
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"WAITING FOR CONNECTION FILE");
|
||||
PMIX_CONSTRUCT_LOCK(&lock);
|
||||
tv.tv_sec = mca_ptl_tcp_component.wait_to_connect;
|
||||
tv.tv_usec = 0;
|
||||
pmix_event_evtimer_set(pmix_globals.evbase, &ev,
|
||||
timeout, &lock);
|
||||
pmix_event_evtimer_add(&ev, &tv);
|
||||
PMIX_WAIT_THREAD(&lock);
|
||||
PMIX_DESTRUCT_LOCK(&lock);
|
||||
fp = fopen(filename, "r");
|
||||
if (NULL != fp) {
|
||||
/* we found it! */
|
||||
goto process;
|
||||
}
|
||||
} while (retries < mca_ptl_tcp_component.max_retries);
|
||||
/* otherwise, mark it as unreachable */
|
||||
}
|
||||
}
|
||||
return PMIX_ERR_UNREACH;
|
||||
}
|
||||
|
||||
process:
|
||||
/* get the URI */
|
||||
srvr = pmix_getline(fp);
|
||||
if (NULL == srvr) {
|
||||
@ -916,6 +959,7 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
||||
char *suri, *nsp, *newdir;
|
||||
pmix_rank_t rk;
|
||||
pmix_status_t rc;
|
||||
struct stat buf;
|
||||
DIR *cur_dirp;
|
||||
struct dirent *dir_entry;
|
||||
|
||||
@ -933,9 +977,12 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
||||
0 == strcmp(dir_entry->d_name, "..")) {
|
||||
continue;
|
||||
}
|
||||
/* if it is a directory, down search */
|
||||
if (DT_DIR == dir_entry->d_type) {
|
||||
newdir = pmix_os_path(false, dirname, dir_entry->d_name, NULL);
|
||||
if (-1 == stat(newdir, &buf)) {
|
||||
continue;
|
||||
}
|
||||
/* if it is a directory, down search */
|
||||
if (S_ISDIR(buf.st_mode)) {
|
||||
rc = df_search(newdir, prefix, sd, nspace, rank);
|
||||
free(newdir);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
@ -944,22 +991,14 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/* if it isn't a regular file, ignore it */
|
||||
if (DT_REG != dir_entry->d_type) {
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"pmix:tcp: ignoring %s", dir_entry->d_name);
|
||||
continue;
|
||||
}
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"pmix:tcp: checking %s vs %s", dir_entry->d_name, prefix);
|
||||
/* see if it starts with our prefix */
|
||||
if (0 == strncmp(dir_entry->d_name, prefix, strlen(prefix))) {
|
||||
/* try to read this file */
|
||||
newdir = pmix_os_path(false, dirname, dir_entry->d_name, NULL);
|
||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||
"pmix:tcp: reading file %s", newdir);
|
||||
rc = parse_uri_file(newdir, &suri, &nsp, &rk);
|
||||
free(newdir);
|
||||
if (PMIX_SUCCESS == rc) {
|
||||
if (NULL != mca_ptl_tcp_component.super.uri) {
|
||||
free(mca_ptl_tcp_component.super.uri);
|
||||
@ -972,11 +1011,13 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
||||
(*nspace) = nsp;
|
||||
*rank = rk;
|
||||
closedir(cur_dirp);
|
||||
free(newdir);
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
free(nsp);
|
||||
}
|
||||
}
|
||||
free(newdir);
|
||||
}
|
||||
closedir(cur_dirp);
|
||||
return PMIX_ERR_NOT_FOUND;
|
||||
|
@ -47,7 +47,8 @@ typedef struct {
|
||||
struct sockaddr_storage connection;
|
||||
char *session_filename;
|
||||
char *system_filename;
|
||||
pid_t tool_pid;
|
||||
int wait_to_connect;
|
||||
int max_retries;
|
||||
} pmix_ptl_tcp_component_t;
|
||||
|
||||
extern pmix_ptl_tcp_component_t mca_ptl_tcp_component;
|
||||
|
@ -113,7 +113,8 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo,
|
||||
.disable_ipv6_family = true,
|
||||
.session_filename = NULL,
|
||||
.system_filename = NULL,
|
||||
.tool_pid = 0
|
||||
.wait_to_connect = 4,
|
||||
.max_retries = 2
|
||||
};
|
||||
|
||||
static char **split_and_resolve(char **orig_str, char *name);
|
||||
@ -132,13 +133,6 @@ static int component_register(void)
|
||||
PMIX_MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_ptl_tcp_component.super.uri);
|
||||
|
||||
(void)pmix_mca_base_component_var_register(component, "tool_pid",
|
||||
"pid of a tool we are to connect to",
|
||||
PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
PMIX_INFO_LVL_2,
|
||||
PMIX_MCA_BASE_VAR_SCOPE_LOCAL,
|
||||
&mca_ptl_tcp_component.tool_pid);
|
||||
|
||||
(void)pmix_mca_base_component_var_register(component, "if_include",
|
||||
"Comma-delimited list of devices and/or CIDR notation of TCP networks (e.g., \"eth0,192.168.0.0/16\"). Mutually exclusive with ptl_tcp_if_exclude.",
|
||||
PMIX_MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
@ -192,6 +186,20 @@ static int component_register(void)
|
||||
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ptl_tcp_component.disable_ipv6_family);
|
||||
|
||||
(void)pmix_mca_base_component_var_register(component, "connection_wait_time",
|
||||
"Number of seconds to wait for the server connection file to appear",
|
||||
PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
PMIX_INFO_LVL_4,
|
||||
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ptl_tcp_component.wait_to_connect);
|
||||
|
||||
(void)pmix_mca_base_component_var_register(component, "max_retries",
|
||||
"Number of times to look for the connection file before quitting",
|
||||
PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
PMIX_INFO_LVL_4,
|
||||
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ptl_tcp_component.max_retries);
|
||||
|
||||
return PMIX_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -111,7 +111,7 @@ static int rte_init(void)
|
||||
|
||||
if (ORTE_PROC_IS_TOOL) {
|
||||
/* otherwise, if I am a tool proc, use that procedure */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_tool_setup";
|
||||
goto fn_fail;
|
||||
|
@ -65,7 +65,7 @@ ORTE_DECLSPEC int orte_ess_base_app_setup(bool db_restrict_local);
|
||||
ORTE_DECLSPEC int orte_ess_base_app_finalize(void);
|
||||
ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report);
|
||||
|
||||
ORTE_DECLSPEC int orte_ess_base_tool_setup(uint8_t flags);
|
||||
ORTE_DECLSPEC int orte_ess_base_tool_setup(opal_list_t *flags);
|
||||
ORTE_DECLSPEC int orte_ess_base_tool_finalize(void);
|
||||
|
||||
ORTE_DECLSPEC int orte_ess_base_orted_setup(void);
|
||||
|
@ -90,7 +90,7 @@ static void infocb(int status,
|
||||
OPAL_PMIX_WAKEUP_THREAD(lock);
|
||||
}
|
||||
|
||||
int orte_ess_base_tool_setup(uint8_t flags)
|
||||
int orte_ess_base_tool_setup(opal_list_t *flags)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
@ -98,7 +98,7 @@ int orte_ess_base_tool_setup(uint8_t flags)
|
||||
orte_jobid_t jobid;
|
||||
orte_vpid_t vpid;
|
||||
opal_list_t info;
|
||||
opal_value_t *kv, val;
|
||||
opal_value_t *kv, *knext, val;
|
||||
opal_pmix_query_t *q;
|
||||
opal_pmix_lock_t lock;
|
||||
opal_buffer_t *buf;
|
||||
@ -181,27 +181,13 @@ int orte_ess_base_tool_setup(uint8_t flags)
|
||||
kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||
kv->type = OPAL_VPID;
|
||||
opal_list_append(&info, &kv->super);
|
||||
if (0 != flags) {
|
||||
/* instruct the PMIx layer on if/how to connect */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
if (0x01 == flags) {
|
||||
kv->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT);
|
||||
} else if (0x02 == flags) {
|
||||
kv->key = strdup(OPAL_PMIX_CONNECT_SYSTEM_FIRST);
|
||||
} else if (0x04 == flags) {
|
||||
kv->key = strdup(OPAL_PMIX_CONNECT_TO_SYSTEM);
|
||||
} else {
|
||||
opal_output(0, "UNKNOWN CONNECTION FLAG %0x", flags);
|
||||
error = "unknown connection flags";
|
||||
ret = ORTE_ERR_BAD_PARAM;
|
||||
OPAL_LIST_DESTRUCT(&info);
|
||||
OBJ_RELEASE(kv);
|
||||
goto error;
|
||||
}
|
||||
kv->data.flag = true;
|
||||
kv->type = OPAL_BOOL;
|
||||
if (NULL != flags) {
|
||||
/* pass along any directives */
|
||||
OPAL_LIST_FOREACH_SAFE(kv, knext, flags, opal_value_t) {
|
||||
opal_list_remove_item(flags, &kv->super);
|
||||
opal_list_append(&info, &kv->super);
|
||||
}
|
||||
}
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "opal_pmix.init";
|
||||
|
@ -92,7 +92,7 @@ static int rte_init(void)
|
||||
|
||||
if (ORTE_PROC_IS_TOOL) {
|
||||
/* otherwise, if I am a tool proc, use that procedure */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_tool_setup";
|
||||
goto error;
|
||||
|
@ -87,7 +87,7 @@ static int rte_init(void)
|
||||
|
||||
if (ORTE_PROC_IS_TOOL) {
|
||||
/* otherwise, if I am a tool proc, use that procedure */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_tool_setup";
|
||||
goto error;
|
||||
|
@ -91,7 +91,7 @@ static int rte_init(void)
|
||||
|
||||
if (ORTE_PROC_IS_TOOL) {
|
||||
/* otherwise, if I am a tool proc, use that procedure */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_tool_setup";
|
||||
goto error;
|
||||
|
@ -35,6 +35,9 @@ typedef struct {
|
||||
bool system_server_first;
|
||||
bool system_server_only;
|
||||
bool do_not_connect;
|
||||
int wait_to_connect;
|
||||
int num_retries;
|
||||
int pid;
|
||||
} orte_ess_tool_component_t;
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_ess_tool_component_t mca_ess_tool_component;
|
||||
|
@ -63,21 +63,24 @@ orte_ess_tool_component_t mca_ess_tool_component = {
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
},
|
||||
.async = false
|
||||
.async = false,
|
||||
.system_server_first = false,
|
||||
.system_server_only = false,
|
||||
.wait_to_connect = 0,
|
||||
.num_retries = 0,
|
||||
.pid = 0
|
||||
};
|
||||
|
||||
static int tool_component_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_ess_tool_component.super.base_version;
|
||||
|
||||
mca_ess_tool_component.async = false;
|
||||
(void) mca_base_component_var_register (c, "async_progress", "Setup an async progress thread",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_2,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.async);
|
||||
|
||||
mca_ess_tool_component.do_not_connect = false;
|
||||
(void) mca_base_component_var_register (c, "do_not_connect",
|
||||
"Do not connect to a PMIx server",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
@ -85,7 +88,6 @@ static int tool_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.do_not_connect);
|
||||
|
||||
mca_ess_tool_component.system_server_first = false;
|
||||
(void) mca_base_component_var_register (c, "system_server_first",
|
||||
"Look for a system PMIx server first",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
@ -93,13 +95,33 @@ static int tool_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.system_server_first);
|
||||
|
||||
mca_ess_tool_component.system_server_only = false;
|
||||
(void) mca_base_component_var_register (c, "system_server_only",
|
||||
"Only connect to a system server (and not an mpirun)",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_2,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.system_server_only);
|
||||
|
||||
(void) mca_base_component_var_register (c, "wait_to_connect",
|
||||
"Time in seconds to wait before retrying connection to server",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_2,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.wait_to_connect);
|
||||
|
||||
(void) mca_base_component_var_register (c, "num_retries",
|
||||
"Number of times to retry connecting to server",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_2,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.num_retries);
|
||||
|
||||
(void) mca_base_component_var_register (c, "server_pid",
|
||||
"PID of the server to which we are to connect",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_2,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_ess_tool_component.pid);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -32,6 +32,7 @@
|
||||
#endif
|
||||
|
||||
#include "opal/runtime/opal_progress_threads.h"
|
||||
#include "opal/mca/pmix/pmix_types.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/plm/base/base.h"
|
||||
@ -63,7 +64,8 @@ static int rte_init(void)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
uint8_t flags;
|
||||
opal_list_t flags;
|
||||
opal_value_t *val;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
@ -81,21 +83,57 @@ static int rte_init(void)
|
||||
}
|
||||
|
||||
/* setup the tool connection flags */
|
||||
flags = 0;
|
||||
OBJ_CONSTRUCT(&flags, opal_list_t);
|
||||
if (mca_ess_tool_component.do_not_connect) {
|
||||
flags = 0x01;
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&flags, &val->super);
|
||||
} else if (mca_ess_tool_component.system_server_first) {
|
||||
flags = 0x02;
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_CONNECT_SYSTEM_FIRST);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&flags, &val->super);
|
||||
} else if (mca_ess_tool_component.system_server_only) {
|
||||
flags = 0x04;
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_CONNECT_TO_SYSTEM);
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&flags, &val->super);
|
||||
}
|
||||
if (0 < mca_ess_tool_component.wait_to_connect) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_CONNECT_RETRY_DELAY);
|
||||
val->type = OPAL_UINT32;
|
||||
val->data.uint32 = mca_ess_tool_component.wait_to_connect;
|
||||
opal_list_append(&flags, &val->super);
|
||||
}
|
||||
if (0 < mca_ess_tool_component.num_retries) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_CONNECT_MAX_RETRIES);
|
||||
val->type = OPAL_UINT32;
|
||||
val->data.uint32 = mca_ess_tool_component.num_retries;
|
||||
opal_list_append(&flags, &val->super);
|
||||
}
|
||||
if (0 < mca_ess_tool_component.pid) {
|
||||
val = OBJ_NEW(opal_value_t);
|
||||
val->key = strdup(OPAL_PMIX_SERVER_PIDINFO);
|
||||
val->type = OPAL_PID;
|
||||
val->data.pid = mca_ess_tool_component.pid;
|
||||
opal_list_append(&flags, &val->super);
|
||||
}
|
||||
|
||||
|
||||
/* do the standard tool init */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(flags))) {
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(&flags))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OPAL_LIST_DESTRUCT(&flags);
|
||||
error = "orte_ess_base_tool_setup";
|
||||
goto error;
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&flags);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
|
@ -105,6 +105,7 @@ static struct {
|
||||
bool run_as_root;
|
||||
bool set_sid;
|
||||
bool daemonize;
|
||||
bool system_server;
|
||||
} myglobals;
|
||||
|
||||
static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
@ -165,6 +166,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"List of hosts to invoke processes on" },
|
||||
|
||||
{ NULL, '\0', "system-server", "system-server", 0,
|
||||
&myglobals.system_server, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Provide a system-level server connection point - only one allowed per node" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -280,10 +285,12 @@ int main(int argc, char *argv[])
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (myglobals.system_server) {
|
||||
/* we should act as system-level PMIx server */
|
||||
opal_setenv("OMPI_MCA_pmix_system_server", "1", true, &environ);
|
||||
/* and as session-level PMIx server */
|
||||
opal_setenv("OMPI_MCA_pmix_session_server", "1", true, &environ);
|
||||
opal_setenv(OPAL_MCA_PREFIX"pmix_system_server", "1", true, &environ);
|
||||
}
|
||||
/* always act as session-level PMIx server */
|
||||
opal_setenv(OPAL_MCA_PREFIX"pmix_session_server", "1", true, &environ);
|
||||
|
||||
/* Setup MCA params */
|
||||
orte_register_params();
|
||||
|
@ -139,6 +139,9 @@ struct orte_cmd_options_t {
|
||||
int timeout;
|
||||
bool report_state_on_timeout;
|
||||
bool get_stack_traces;
|
||||
int pid;
|
||||
bool system_server_only;
|
||||
bool system_server_first;
|
||||
};
|
||||
typedef struct orte_cmd_options_t orte_cmd_options_t;
|
||||
static orte_cmd_options_t orte_cmd_options = {0};
|
||||
@ -471,6 +474,22 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
&orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Terminate the DVM", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* look first for a system server */
|
||||
{ NULL, '\0', "system-server-first", "system-server-first", 0,
|
||||
&orte_cmd_options.system_server_first, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"First look for a system server and connect to it if found", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* connect only to a system server */
|
||||
{ NULL, '\0', "system-server-only", "system-server-only", 0,
|
||||
&orte_cmd_options.system_server_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Connect only to a system-level server", OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* provide a connection PID */
|
||||
{ NULL, '\0', "pid", "pid", 1,
|
||||
&orte_cmd_options.pid, OPAL_CMD_LINE_TYPE_INT,
|
||||
"PID of the session-level daemon to which we should connect",
|
||||
OPAL_CMD_LINE_OTYPE_DVM },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -647,9 +666,38 @@ int prun(int argc, char *argv[])
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* tell the ess/tool component that we want to connect to a system-level
|
||||
/* Check for help request */
|
||||
if (orte_cmd_options.help) {
|
||||
char *str, *args = NULL;
|
||||
args = opal_cmd_line_get_usage_msg(orte_cmd_line);
|
||||
str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
|
||||
"prun", "PSVR", OPAL_VERSION,
|
||||
"prun", args,
|
||||
PACKAGE_BUGREPORT);
|
||||
if (NULL != str) {
|
||||
printf("%s", str);
|
||||
free(str);
|
||||
}
|
||||
free(args);
|
||||
|
||||
/* If someone asks for help, that should be all we do */
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* tell the ess/tool component that we want to connect only to a system-level
|
||||
* PMIx server */
|
||||
opal_setenv("OMPI_MCA_ess_tool_system_server_only", "1", true, &environ);
|
||||
if (orte_cmd_options.system_server_only) {
|
||||
opal_setenv(OPAL_MCA_PREFIX"ess_tool_system_server_only", "1", true, &environ);
|
||||
}
|
||||
if (orte_cmd_options.system_server_first) {
|
||||
opal_setenv(OPAL_MCA_PREFIX"ess_tool_system_server_first", "1", true, &environ);
|
||||
}
|
||||
/* if they specified the DVM's pid, then pass it along */
|
||||
if (0 != orte_cmd_options.pid) {
|
||||
asprintf(&param, "%d", orte_cmd_options.pid);
|
||||
opal_setenv(OPAL_MCA_PREFIX"ess_tool_server_pid", param, true, &environ);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* now initialize ORTE */
|
||||
if (OPAL_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
|
||||
@ -665,7 +713,6 @@ int prun(int argc, char *argv[])
|
||||
val->type = OPAL_BOOL;
|
||||
val->data.flag = true;
|
||||
opal_list_append(&info, &val->super);
|
||||
|
||||
fprintf(stderr, "TERMINATING DVM...");
|
||||
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
||||
rc = opal_pmix.job_control(NULL, &info, infocb, (void*)&lock);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user