Fix tool connection logic so we properly search for default session server, perform specified number of retries, etc.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
(cherry picked from commit 7c755e01004f8b86c71f1729662979ea45ab1adb)
Этот коммит содержится в:
родитель
16de607607
Коммит
e575c4d6f9
@ -125,6 +125,7 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
|||||||
char myhost[PMIX_MAXHOSTNAMELEN];
|
char myhost[PMIX_MAXHOSTNAMELEN];
|
||||||
bool system_level = false;
|
bool system_level = false;
|
||||||
bool system_level_only = false;
|
bool system_level_only = false;
|
||||||
|
pid_t pid = 0;
|
||||||
|
|
||||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||||
"ptl:tcp: connecting to server");
|
"ptl:tcp: connecting to server");
|
||||||
@ -224,12 +225,17 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
|||||||
system_level = info[n].value.data.flag;
|
system_level = info[n].value.data.flag;
|
||||||
}
|
}
|
||||||
} else if (0 == strcmp(info[n].key, PMIX_SERVER_PIDINFO)) {
|
} else if (0 == strcmp(info[n].key, PMIX_SERVER_PIDINFO)) {
|
||||||
mca_ptl_tcp_component.tool_pid = info[n].value.data.pid;
|
pid = info[n].value.data.pid;
|
||||||
|
pmix_output(0, "GOT PID %d", (int)pid);
|
||||||
} else if (0 == strcmp(info[n].key, PMIX_SERVER_URI)) {
|
} else if (0 == strcmp(info[n].key, PMIX_SERVER_URI)) {
|
||||||
if (NULL == mca_ptl_tcp_component.super.uri) {
|
if (NULL == mca_ptl_tcp_component.super.uri) {
|
||||||
free(mca_ptl_tcp_component.super.uri);
|
free(mca_ptl_tcp_component.super.uri);
|
||||||
}
|
}
|
||||||
mca_ptl_tcp_component.super.uri = strdup(info[n].value.data.string);
|
mca_ptl_tcp_component.super.uri = strdup(info[n].value.data.string);
|
||||||
|
} else if (0 == strcmp(info[n].key, PMIX_CONNECT_RETRY_DELAY)) {
|
||||||
|
mca_ptl_tcp_component.wait_to_connect = info[n].value.data.uint32;
|
||||||
|
} else if (0 == strcmp(info[n].key, PMIX_CONNECT_MAX_RETRIES)) {
|
||||||
|
mca_ptl_tcp_component.max_retries = info[n].value.data.uint32;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -263,6 +269,29 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
|||||||
goto complete;
|
goto complete;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* if they gave us a pid, then look for it */
|
||||||
|
if (0 != pid) {
|
||||||
|
if (0 > asprintf(&filename, "pmix.%s.tool.%d", myhost, pid)) {
|
||||||
|
return PMIX_ERR_NOMEM;
|
||||||
|
}
|
||||||
|
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||||
|
"ptl:tcp:tool searching for given session server %s",
|
||||||
|
filename);
|
||||||
|
nspace = NULL;
|
||||||
|
rc = df_search(mca_ptl_tcp_component.system_tmpdir,
|
||||||
|
filename, &sd, &nspace, &rank);
|
||||||
|
free(filename);
|
||||||
|
if (PMIX_SUCCESS == rc) {
|
||||||
|
goto complete;
|
||||||
|
}
|
||||||
|
if (NULL != nspace) {
|
||||||
|
free(nspace);
|
||||||
|
}
|
||||||
|
/* since they gave us a specific pid and we couldn't
|
||||||
|
* connect to it, return an error */
|
||||||
|
return PMIX_ERR_UNREACH;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* if they asked for system-level, we start there */
|
/* if they asked for system-level, we start there */
|
||||||
if (system_level || system_level_only) {
|
if (system_level || system_level_only) {
|
||||||
@ -297,31 +326,6 @@ static pmix_status_t connect_to_peer(struct pmix_peer_t *peer,
|
|||||||
return PMIX_ERR_UNREACH;
|
return PMIX_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* now try the session-level connection - if they gave us a pid, then
|
|
||||||
* look for it */
|
|
||||||
if (0 != mca_ptl_tcp_component.tool_pid) {
|
|
||||||
if (0 > asprintf(&filename, "pmix.%s.tool.%d",
|
|
||||||
myhost, mca_ptl_tcp_component.tool_pid)) {
|
|
||||||
return PMIX_ERR_NOMEM;
|
|
||||||
}
|
|
||||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
|
||||||
"ptl:tcp:tool searching for given session server %s",
|
|
||||||
filename);
|
|
||||||
nspace = NULL;
|
|
||||||
rc = df_search(mca_ptl_tcp_component.system_tmpdir,
|
|
||||||
filename, &sd, &nspace, &rank);
|
|
||||||
free(filename);
|
|
||||||
if (PMIX_SUCCESS == rc) {
|
|
||||||
goto complete;
|
|
||||||
}
|
|
||||||
if (NULL != nspace) {
|
|
||||||
free(nspace);
|
|
||||||
}
|
|
||||||
/* since they gave us a specific pid and we couldn't
|
|
||||||
* connect to it, return an error */
|
|
||||||
return PMIX_ERR_UNREACH;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* they didn't give us a pid, so we will search to see what session-level
|
/* they didn't give us a pid, so we will search to see what session-level
|
||||||
* tools are available to this user. We will take the first connection
|
* tools are available to this user. We will take the first connection
|
||||||
* that succeeds - this is based on the likelihood that there is only
|
* that succeeds - this is based on the likelihood that there is only
|
||||||
@ -441,6 +445,11 @@ static pmix_status_t send_oneway(struct pmix_peer_t *peer,
|
|||||||
return PMIX_SUCCESS;
|
return PMIX_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void timeout(int sd, short args, void *cbdata)
|
||||||
|
{
|
||||||
|
pmix_lock_t *lock = (pmix_lock_t*)cbdata;
|
||||||
|
PMIX_WAKEUP_THREAD(lock);
|
||||||
|
}
|
||||||
|
|
||||||
/**** SUPPORTING FUNCTIONS ****/
|
/**** SUPPORTING FUNCTIONS ****/
|
||||||
static pmix_status_t parse_uri_file(char *filename,
|
static pmix_status_t parse_uri_file(char *filename,
|
||||||
@ -450,14 +459,48 @@ static pmix_status_t parse_uri_file(char *filename,
|
|||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
char *srvr, *p, *p2;
|
char *srvr, *p, *p2;
|
||||||
|
pmix_lock_t lock;
|
||||||
|
pmix_event_t ev;
|
||||||
|
struct timeval tv;
|
||||||
|
int retries;
|
||||||
|
|
||||||
fp = fopen(filename, "r");
|
fp = fopen(filename, "r");
|
||||||
if (NULL == fp) {
|
if (NULL == fp) {
|
||||||
/* if we cannot open the file, then the server must not
|
/* if we cannot open the file, then the server must not
|
||||||
* be configured to support tool connections, or this
|
* be configured to support tool connections, or this
|
||||||
* user isn't authorized to access it */
|
* user isn't authorized to access it - or it may just
|
||||||
|
* not exist yet! Check for existence */
|
||||||
|
if (0 != access(filename, R_OK)) {
|
||||||
|
if (ENOENT == errno) {
|
||||||
|
/* the file does not exist, so give it
|
||||||
|
* a little time to see if the server
|
||||||
|
* is still starting up */
|
||||||
|
retries = 0;
|
||||||
|
do {
|
||||||
|
++retries;
|
||||||
|
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||||
|
"WAITING FOR CONNECTION FILE");
|
||||||
|
PMIX_CONSTRUCT_LOCK(&lock);
|
||||||
|
tv.tv_sec = mca_ptl_tcp_component.wait_to_connect;
|
||||||
|
tv.tv_usec = 0;
|
||||||
|
pmix_event_evtimer_set(pmix_globals.evbase, &ev,
|
||||||
|
timeout, &lock);
|
||||||
|
pmix_event_evtimer_add(&ev, &tv);
|
||||||
|
PMIX_WAIT_THREAD(&lock);
|
||||||
|
PMIX_DESTRUCT_LOCK(&lock);
|
||||||
|
fp = fopen(filename, "r");
|
||||||
|
if (NULL != fp) {
|
||||||
|
/* we found it! */
|
||||||
|
goto process;
|
||||||
|
}
|
||||||
|
} while (retries < mca_ptl_tcp_component.max_retries);
|
||||||
|
/* otherwise, mark it as unreachable */
|
||||||
|
}
|
||||||
|
}
|
||||||
return PMIX_ERR_UNREACH;
|
return PMIX_ERR_UNREACH;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
process:
|
||||||
/* get the URI */
|
/* get the URI */
|
||||||
srvr = pmix_getline(fp);
|
srvr = pmix_getline(fp);
|
||||||
if (NULL == srvr) {
|
if (NULL == srvr) {
|
||||||
@ -916,6 +959,7 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
|||||||
char *suri, *nsp, *newdir;
|
char *suri, *nsp, *newdir;
|
||||||
pmix_rank_t rk;
|
pmix_rank_t rk;
|
||||||
pmix_status_t rc;
|
pmix_status_t rc;
|
||||||
|
struct stat buf;
|
||||||
DIR *cur_dirp;
|
DIR *cur_dirp;
|
||||||
struct dirent *dir_entry;
|
struct dirent *dir_entry;
|
||||||
|
|
||||||
@ -933,9 +977,12 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
|||||||
0 == strcmp(dir_entry->d_name, "..")) {
|
0 == strcmp(dir_entry->d_name, "..")) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* if it is a directory, down search */
|
|
||||||
if (DT_DIR == dir_entry->d_type) {
|
|
||||||
newdir = pmix_os_path(false, dirname, dir_entry->d_name, NULL);
|
newdir = pmix_os_path(false, dirname, dir_entry->d_name, NULL);
|
||||||
|
if (-1 == stat(newdir, &buf)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* if it is a directory, down search */
|
||||||
|
if (S_ISDIR(buf.st_mode)) {
|
||||||
rc = df_search(newdir, prefix, sd, nspace, rank);
|
rc = df_search(newdir, prefix, sd, nspace, rank);
|
||||||
free(newdir);
|
free(newdir);
|
||||||
if (PMIX_SUCCESS == rc) {
|
if (PMIX_SUCCESS == rc) {
|
||||||
@ -944,22 +991,14 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
|||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* if it isn't a regular file, ignore it */
|
|
||||||
if (DT_REG != dir_entry->d_type) {
|
|
||||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
|
||||||
"pmix:tcp: ignoring %s", dir_entry->d_name);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||||
"pmix:tcp: checking %s vs %s", dir_entry->d_name, prefix);
|
"pmix:tcp: checking %s vs %s", dir_entry->d_name, prefix);
|
||||||
/* see if it starts with our prefix */
|
/* see if it starts with our prefix */
|
||||||
if (0 == strncmp(dir_entry->d_name, prefix, strlen(prefix))) {
|
if (0 == strncmp(dir_entry->d_name, prefix, strlen(prefix))) {
|
||||||
/* try to read this file */
|
/* try to read this file */
|
||||||
newdir = pmix_os_path(false, dirname, dir_entry->d_name, NULL);
|
|
||||||
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
pmix_output_verbose(2, pmix_ptl_base_framework.framework_output,
|
||||||
"pmix:tcp: reading file %s", newdir);
|
"pmix:tcp: reading file %s", newdir);
|
||||||
rc = parse_uri_file(newdir, &suri, &nsp, &rk);
|
rc = parse_uri_file(newdir, &suri, &nsp, &rk);
|
||||||
free(newdir);
|
|
||||||
if (PMIX_SUCCESS == rc) {
|
if (PMIX_SUCCESS == rc) {
|
||||||
if (NULL != mca_ptl_tcp_component.super.uri) {
|
if (NULL != mca_ptl_tcp_component.super.uri) {
|
||||||
free(mca_ptl_tcp_component.super.uri);
|
free(mca_ptl_tcp_component.super.uri);
|
||||||
@ -972,11 +1011,13 @@ static pmix_status_t df_search(char *dirname, char *prefix,
|
|||||||
(*nspace) = nsp;
|
(*nspace) = nsp;
|
||||||
*rank = rk;
|
*rank = rk;
|
||||||
closedir(cur_dirp);
|
closedir(cur_dirp);
|
||||||
|
free(newdir);
|
||||||
return PMIX_SUCCESS;
|
return PMIX_SUCCESS;
|
||||||
}
|
}
|
||||||
free(nsp);
|
free(nsp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
free(newdir);
|
||||||
}
|
}
|
||||||
closedir(cur_dirp);
|
closedir(cur_dirp);
|
||||||
return PMIX_ERR_NOT_FOUND;
|
return PMIX_ERR_NOT_FOUND;
|
||||||
|
@ -47,7 +47,8 @@ typedef struct {
|
|||||||
struct sockaddr_storage connection;
|
struct sockaddr_storage connection;
|
||||||
char *session_filename;
|
char *session_filename;
|
||||||
char *system_filename;
|
char *system_filename;
|
||||||
pid_t tool_pid;
|
int wait_to_connect;
|
||||||
|
int max_retries;
|
||||||
} pmix_ptl_tcp_component_t;
|
} pmix_ptl_tcp_component_t;
|
||||||
|
|
||||||
extern pmix_ptl_tcp_component_t mca_ptl_tcp_component;
|
extern pmix_ptl_tcp_component_t mca_ptl_tcp_component;
|
||||||
|
@ -113,7 +113,8 @@ static pmix_status_t setup_listener(pmix_info_t info[], size_t ninfo,
|
|||||||
.disable_ipv6_family = true,
|
.disable_ipv6_family = true,
|
||||||
.session_filename = NULL,
|
.session_filename = NULL,
|
||||||
.system_filename = NULL,
|
.system_filename = NULL,
|
||||||
.tool_pid = 0
|
.wait_to_connect = 4,
|
||||||
|
.max_retries = 2
|
||||||
};
|
};
|
||||||
|
|
||||||
static char **split_and_resolve(char **orig_str, char *name);
|
static char **split_and_resolve(char **orig_str, char *name);
|
||||||
@ -132,13 +133,6 @@ static int component_register(void)
|
|||||||
PMIX_MCA_BASE_VAR_SCOPE_LOCAL,
|
PMIX_MCA_BASE_VAR_SCOPE_LOCAL,
|
||||||
&mca_ptl_tcp_component.super.uri);
|
&mca_ptl_tcp_component.super.uri);
|
||||||
|
|
||||||
(void)pmix_mca_base_component_var_register(component, "tool_pid",
|
|
||||||
"pid of a tool we are to connect to",
|
|
||||||
PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
|
||||||
PMIX_INFO_LVL_2,
|
|
||||||
PMIX_MCA_BASE_VAR_SCOPE_LOCAL,
|
|
||||||
&mca_ptl_tcp_component.tool_pid);
|
|
||||||
|
|
||||||
(void)pmix_mca_base_component_var_register(component, "if_include",
|
(void)pmix_mca_base_component_var_register(component, "if_include",
|
||||||
"Comma-delimited list of devices and/or CIDR notation of TCP networks (e.g., \"eth0,192.168.0.0/16\"). Mutually exclusive with ptl_tcp_if_exclude.",
|
"Comma-delimited list of devices and/or CIDR notation of TCP networks (e.g., \"eth0,192.168.0.0/16\"). Mutually exclusive with ptl_tcp_if_exclude.",
|
||||||
PMIX_MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
PMIX_MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||||
@ -192,6 +186,20 @@ static int component_register(void)
|
|||||||
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&mca_ptl_tcp_component.disable_ipv6_family);
|
&mca_ptl_tcp_component.disable_ipv6_family);
|
||||||
|
|
||||||
|
(void)pmix_mca_base_component_var_register(component, "connection_wait_time",
|
||||||
|
"Number of seconds to wait for the server connection file to appear",
|
||||||
|
PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
PMIX_INFO_LVL_4,
|
||||||
|
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&mca_ptl_tcp_component.wait_to_connect);
|
||||||
|
|
||||||
|
(void)pmix_mca_base_component_var_register(component, "max_retries",
|
||||||
|
"Number of times to look for the connection file before quitting",
|
||||||
|
PMIX_MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
PMIX_INFO_LVL_4,
|
||||||
|
PMIX_MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&mca_ptl_tcp_component.max_retries);
|
||||||
|
|
||||||
return PMIX_SUCCESS;
|
return PMIX_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ static int rte_init(void)
|
|||||||
|
|
||||||
if (ORTE_PROC_IS_TOOL) {
|
if (ORTE_PROC_IS_TOOL) {
|
||||||
/* otherwise, if I am a tool proc, use that procedure */
|
/* otherwise, if I am a tool proc, use that procedure */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_ess_base_tool_setup";
|
error = "orte_ess_base_tool_setup";
|
||||||
goto fn_fail;
|
goto fn_fail;
|
||||||
|
@ -65,7 +65,7 @@ ORTE_DECLSPEC int orte_ess_base_app_setup(bool db_restrict_local);
|
|||||||
ORTE_DECLSPEC int orte_ess_base_app_finalize(void);
|
ORTE_DECLSPEC int orte_ess_base_app_finalize(void);
|
||||||
ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report);
|
ORTE_DECLSPEC void orte_ess_base_app_abort(int status, bool report);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_ess_base_tool_setup(uint8_t flags);
|
ORTE_DECLSPEC int orte_ess_base_tool_setup(opal_list_t *flags);
|
||||||
ORTE_DECLSPEC int orte_ess_base_tool_finalize(void);
|
ORTE_DECLSPEC int orte_ess_base_tool_finalize(void);
|
||||||
|
|
||||||
ORTE_DECLSPEC int orte_ess_base_orted_setup(void);
|
ORTE_DECLSPEC int orte_ess_base_orted_setup(void);
|
||||||
|
@ -90,7 +90,7 @@ static void infocb(int status,
|
|||||||
OPAL_PMIX_WAKEUP_THREAD(lock);
|
OPAL_PMIX_WAKEUP_THREAD(lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_ess_base_tool_setup(uint8_t flags)
|
int orte_ess_base_tool_setup(opal_list_t *flags)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
@ -98,7 +98,7 @@ int orte_ess_base_tool_setup(uint8_t flags)
|
|||||||
orte_jobid_t jobid;
|
orte_jobid_t jobid;
|
||||||
orte_vpid_t vpid;
|
orte_vpid_t vpid;
|
||||||
opal_list_t info;
|
opal_list_t info;
|
||||||
opal_value_t *kv, val;
|
opal_value_t *kv, *knext, val;
|
||||||
opal_pmix_query_t *q;
|
opal_pmix_query_t *q;
|
||||||
opal_pmix_lock_t lock;
|
opal_pmix_lock_t lock;
|
||||||
opal_buffer_t *buf;
|
opal_buffer_t *buf;
|
||||||
@ -181,27 +181,13 @@ int orte_ess_base_tool_setup(uint8_t flags)
|
|||||||
kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid;
|
kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid;
|
||||||
kv->type = OPAL_VPID;
|
kv->type = OPAL_VPID;
|
||||||
opal_list_append(&info, &kv->super);
|
opal_list_append(&info, &kv->super);
|
||||||
if (0 != flags) {
|
if (NULL != flags) {
|
||||||
/* instruct the PMIx layer on if/how to connect */
|
/* pass along any directives */
|
||||||
kv = OBJ_NEW(opal_value_t);
|
OPAL_LIST_FOREACH_SAFE(kv, knext, flags, opal_value_t) {
|
||||||
if (0x01 == flags) {
|
opal_list_remove_item(flags, &kv->super);
|
||||||
kv->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT);
|
|
||||||
} else if (0x02 == flags) {
|
|
||||||
kv->key = strdup(OPAL_PMIX_CONNECT_SYSTEM_FIRST);
|
|
||||||
} else if (0x04 == flags) {
|
|
||||||
kv->key = strdup(OPAL_PMIX_CONNECT_TO_SYSTEM);
|
|
||||||
} else {
|
|
||||||
opal_output(0, "UNKNOWN CONNECTION FLAG %0x", flags);
|
|
||||||
error = "unknown connection flags";
|
|
||||||
ret = ORTE_ERR_BAD_PARAM;
|
|
||||||
OPAL_LIST_DESTRUCT(&info);
|
|
||||||
OBJ_RELEASE(kv);
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
kv->data.flag = true;
|
|
||||||
kv->type = OPAL_BOOL;
|
|
||||||
opal_list_append(&info, &kv->super);
|
opal_list_append(&info, &kv->super);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) {
|
if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "opal_pmix.init";
|
error = "opal_pmix.init";
|
||||||
|
@ -92,7 +92,7 @@ static int rte_init(void)
|
|||||||
|
|
||||||
if (ORTE_PROC_IS_TOOL) {
|
if (ORTE_PROC_IS_TOOL) {
|
||||||
/* otherwise, if I am a tool proc, use that procedure */
|
/* otherwise, if I am a tool proc, use that procedure */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_ess_base_tool_setup";
|
error = "orte_ess_base_tool_setup";
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -87,7 +87,7 @@ static int rte_init(void)
|
|||||||
|
|
||||||
if (ORTE_PROC_IS_TOOL) {
|
if (ORTE_PROC_IS_TOOL) {
|
||||||
/* otherwise, if I am a tool proc, use that procedure */
|
/* otherwise, if I am a tool proc, use that procedure */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_ess_base_tool_setup";
|
error = "orte_ess_base_tool_setup";
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -91,7 +91,7 @@ static int rte_init(void)
|
|||||||
|
|
||||||
if (ORTE_PROC_IS_TOOL) {
|
if (ORTE_PROC_IS_TOOL) {
|
||||||
/* otherwise, if I am a tool proc, use that procedure */
|
/* otherwise, if I am a tool proc, use that procedure */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(0))) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(NULL))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_ess_base_tool_setup";
|
error = "orte_ess_base_tool_setup";
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -35,6 +35,9 @@ typedef struct {
|
|||||||
bool system_server_first;
|
bool system_server_first;
|
||||||
bool system_server_only;
|
bool system_server_only;
|
||||||
bool do_not_connect;
|
bool do_not_connect;
|
||||||
|
int wait_to_connect;
|
||||||
|
int num_retries;
|
||||||
|
int pid;
|
||||||
} orte_ess_tool_component_t;
|
} orte_ess_tool_component_t;
|
||||||
|
|
||||||
ORTE_MODULE_DECLSPEC extern orte_ess_tool_component_t mca_ess_tool_component;
|
ORTE_MODULE_DECLSPEC extern orte_ess_tool_component_t mca_ess_tool_component;
|
||||||
|
@ -63,21 +63,24 @@ orte_ess_tool_component_t mca_ess_tool_component = {
|
|||||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
.async = false
|
.async = false,
|
||||||
|
.system_server_first = false,
|
||||||
|
.system_server_only = false,
|
||||||
|
.wait_to_connect = 0,
|
||||||
|
.num_retries = 0,
|
||||||
|
.pid = 0
|
||||||
};
|
};
|
||||||
|
|
||||||
static int tool_component_register(void)
|
static int tool_component_register(void)
|
||||||
{
|
{
|
||||||
mca_base_component_t *c = &mca_ess_tool_component.super.base_version;
|
mca_base_component_t *c = &mca_ess_tool_component.super.base_version;
|
||||||
|
|
||||||
mca_ess_tool_component.async = false;
|
|
||||||
(void) mca_base_component_var_register (c, "async_progress", "Setup an async progress thread",
|
(void) mca_base_component_var_register (c, "async_progress", "Setup an async progress thread",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_2,
|
OPAL_INFO_LVL_2,
|
||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&mca_ess_tool_component.async);
|
&mca_ess_tool_component.async);
|
||||||
|
|
||||||
mca_ess_tool_component.do_not_connect = false;
|
|
||||||
(void) mca_base_component_var_register (c, "do_not_connect",
|
(void) mca_base_component_var_register (c, "do_not_connect",
|
||||||
"Do not connect to a PMIx server",
|
"Do not connect to a PMIx server",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||||
@ -85,7 +88,6 @@ static int tool_component_register(void)
|
|||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&mca_ess_tool_component.do_not_connect);
|
&mca_ess_tool_component.do_not_connect);
|
||||||
|
|
||||||
mca_ess_tool_component.system_server_first = false;
|
|
||||||
(void) mca_base_component_var_register (c, "system_server_first",
|
(void) mca_base_component_var_register (c, "system_server_first",
|
||||||
"Look for a system PMIx server first",
|
"Look for a system PMIx server first",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||||
@ -93,13 +95,33 @@ static int tool_component_register(void)
|
|||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&mca_ess_tool_component.system_server_first);
|
&mca_ess_tool_component.system_server_first);
|
||||||
|
|
||||||
mca_ess_tool_component.system_server_only = false;
|
|
||||||
(void) mca_base_component_var_register (c, "system_server_only",
|
(void) mca_base_component_var_register (c, "system_server_only",
|
||||||
"Only connect to a system server (and not an mpirun)",
|
"Only connect to a system server (and not an mpirun)",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_2,
|
OPAL_INFO_LVL_2,
|
||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&mca_ess_tool_component.system_server_only);
|
&mca_ess_tool_component.system_server_only);
|
||||||
|
|
||||||
|
(void) mca_base_component_var_register (c, "wait_to_connect",
|
||||||
|
"Time in seconds to wait before retrying connection to server",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_2,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&mca_ess_tool_component.wait_to_connect);
|
||||||
|
|
||||||
|
(void) mca_base_component_var_register (c, "num_retries",
|
||||||
|
"Number of times to retry connecting to server",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_2,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&mca_ess_tool_component.num_retries);
|
||||||
|
|
||||||
|
(void) mca_base_component_var_register (c, "server_pid",
|
||||||
|
"PID of the server to which we are to connect",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_2,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&mca_ess_tool_component.pid);
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,6 +32,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "opal/runtime/opal_progress_threads.h"
|
#include "opal/runtime/opal_progress_threads.h"
|
||||||
|
#include "opal/mca/pmix/pmix_types.h"
|
||||||
|
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/mca/plm/base/base.h"
|
#include "orte/mca/plm/base/base.h"
|
||||||
@ -63,7 +64,8 @@ static int rte_init(void)
|
|||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
char *error = NULL;
|
char *error = NULL;
|
||||||
uint8_t flags;
|
opal_list_t flags;
|
||||||
|
opal_value_t *val;
|
||||||
|
|
||||||
/* run the prolog */
|
/* run the prolog */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||||
@ -81,21 +83,57 @@ static int rte_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* setup the tool connection flags */
|
/* setup the tool connection flags */
|
||||||
flags = 0;
|
OBJ_CONSTRUCT(&flags, opal_list_t);
|
||||||
if (mca_ess_tool_component.do_not_connect) {
|
if (mca_ess_tool_component.do_not_connect) {
|
||||||
flags = 0x01;
|
val = OBJ_NEW(opal_value_t);
|
||||||
|
val->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT);
|
||||||
|
val->type = OPAL_BOOL;
|
||||||
|
val->data.flag = true;
|
||||||
|
opal_list_append(&flags, &val->super);
|
||||||
} else if (mca_ess_tool_component.system_server_first) {
|
} else if (mca_ess_tool_component.system_server_first) {
|
||||||
flags = 0x02;
|
val = OBJ_NEW(opal_value_t);
|
||||||
|
val->key = strdup(OPAL_PMIX_CONNECT_SYSTEM_FIRST);
|
||||||
|
val->type = OPAL_BOOL;
|
||||||
|
val->data.flag = true;
|
||||||
|
opal_list_append(&flags, &val->super);
|
||||||
} else if (mca_ess_tool_component.system_server_only) {
|
} else if (mca_ess_tool_component.system_server_only) {
|
||||||
flags = 0x04;
|
val = OBJ_NEW(opal_value_t);
|
||||||
|
val->key = strdup(OPAL_PMIX_CONNECT_TO_SYSTEM);
|
||||||
|
val->type = OPAL_BOOL;
|
||||||
|
val->data.flag = true;
|
||||||
|
opal_list_append(&flags, &val->super);
|
||||||
|
}
|
||||||
|
if (0 < mca_ess_tool_component.wait_to_connect) {
|
||||||
|
val = OBJ_NEW(opal_value_t);
|
||||||
|
val->key = strdup(OPAL_PMIX_CONNECT_RETRY_DELAY);
|
||||||
|
val->type = OPAL_UINT32;
|
||||||
|
val->data.uint32 = mca_ess_tool_component.wait_to_connect;
|
||||||
|
opal_list_append(&flags, &val->super);
|
||||||
|
}
|
||||||
|
if (0 < mca_ess_tool_component.num_retries) {
|
||||||
|
val = OBJ_NEW(opal_value_t);
|
||||||
|
val->key = strdup(OPAL_PMIX_CONNECT_MAX_RETRIES);
|
||||||
|
val->type = OPAL_UINT32;
|
||||||
|
val->data.uint32 = mca_ess_tool_component.num_retries;
|
||||||
|
opal_list_append(&flags, &val->super);
|
||||||
|
}
|
||||||
|
if (0 < mca_ess_tool_component.pid) {
|
||||||
|
val = OBJ_NEW(opal_value_t);
|
||||||
|
val->key = strdup(OPAL_PMIX_SERVER_PIDINFO);
|
||||||
|
val->type = OPAL_PID;
|
||||||
|
val->data.pid = mca_ess_tool_component.pid;
|
||||||
|
opal_list_append(&flags, &val->super);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* do the standard tool init */
|
/* do the standard tool init */
|
||||||
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(flags))) {
|
if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup(&flags))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
|
OPAL_LIST_DESTRUCT(&flags);
|
||||||
error = "orte_ess_base_tool_setup";
|
error = "orte_ess_base_tool_setup";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
OPAL_LIST_DESTRUCT(&flags);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
|
||||||
|
@ -105,6 +105,7 @@ static struct {
|
|||||||
bool run_as_root;
|
bool run_as_root;
|
||||||
bool set_sid;
|
bool set_sid;
|
||||||
bool daemonize;
|
bool daemonize;
|
||||||
|
bool system_server;
|
||||||
} myglobals;
|
} myglobals;
|
||||||
|
|
||||||
static opal_cmd_line_init_t cmd_line_init[] = {
|
static opal_cmd_line_init_t cmd_line_init[] = {
|
||||||
@ -165,6 +166,10 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
"List of hosts to invoke processes on" },
|
"List of hosts to invoke processes on" },
|
||||||
|
|
||||||
|
{ NULL, '\0', "system-server", "system-server", 0,
|
||||||
|
&myglobals.system_server, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Provide a system-level server connection point - only one allowed per node" },
|
||||||
|
|
||||||
/* End of list */
|
/* End of list */
|
||||||
{ NULL, '\0', NULL, NULL, 0,
|
{ NULL, '\0', NULL, NULL, 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||||
@ -280,10 +285,12 @@ int main(int argc, char *argv[])
|
|||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (myglobals.system_server) {
|
||||||
/* we should act as system-level PMIx server */
|
/* we should act as system-level PMIx server */
|
||||||
opal_setenv("OMPI_MCA_pmix_system_server", "1", true, &environ);
|
opal_setenv(OPAL_MCA_PREFIX"pmix_system_server", "1", true, &environ);
|
||||||
/* and as session-level PMIx server */
|
}
|
||||||
opal_setenv("OMPI_MCA_pmix_session_server", "1", true, &environ);
|
/* always act as session-level PMIx server */
|
||||||
|
opal_setenv(OPAL_MCA_PREFIX"pmix_session_server", "1", true, &environ);
|
||||||
|
|
||||||
/* Setup MCA params */
|
/* Setup MCA params */
|
||||||
orte_register_params();
|
orte_register_params();
|
||||||
|
@ -139,6 +139,9 @@ struct orte_cmd_options_t {
|
|||||||
int timeout;
|
int timeout;
|
||||||
bool report_state_on_timeout;
|
bool report_state_on_timeout;
|
||||||
bool get_stack_traces;
|
bool get_stack_traces;
|
||||||
|
int pid;
|
||||||
|
bool system_server_only;
|
||||||
|
bool system_server_first;
|
||||||
};
|
};
|
||||||
typedef struct orte_cmd_options_t orte_cmd_options_t;
|
typedef struct orte_cmd_options_t orte_cmd_options_t;
|
||||||
static orte_cmd_options_t orte_cmd_options = {0};
|
static orte_cmd_options_t orte_cmd_options = {0};
|
||||||
@ -471,6 +474,22 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
&orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
&orte_cmd_options.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
"Terminate the DVM", OPAL_CMD_LINE_OTYPE_DVM },
|
"Terminate the DVM", OPAL_CMD_LINE_OTYPE_DVM },
|
||||||
|
|
||||||
|
/* look first for a system server */
|
||||||
|
{ NULL, '\0', "system-server-first", "system-server-first", 0,
|
||||||
|
&orte_cmd_options.system_server_first, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"First look for a system server and connect to it if found", OPAL_CMD_LINE_OTYPE_DVM },
|
||||||
|
|
||||||
|
/* connect only to a system server */
|
||||||
|
{ NULL, '\0', "system-server-only", "system-server-only", 0,
|
||||||
|
&orte_cmd_options.system_server_only, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Connect only to a system-level server", OPAL_CMD_LINE_OTYPE_DVM },
|
||||||
|
|
||||||
|
/* provide a connection PID */
|
||||||
|
{ NULL, '\0', "pid", "pid", 1,
|
||||||
|
&orte_cmd_options.pid, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"PID of the session-level daemon to which we should connect",
|
||||||
|
OPAL_CMD_LINE_OTYPE_DVM },
|
||||||
|
|
||||||
/* End of list */
|
/* End of list */
|
||||||
{ NULL, '\0', NULL, NULL, 0,
|
{ NULL, '\0', NULL, NULL, 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||||
@ -647,9 +666,38 @@ int prun(int argc, char *argv[])
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* tell the ess/tool component that we want to connect to a system-level
|
/* Check for help request */
|
||||||
|
if (orte_cmd_options.help) {
|
||||||
|
char *str, *args = NULL;
|
||||||
|
args = opal_cmd_line_get_usage_msg(orte_cmd_line);
|
||||||
|
str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
|
||||||
|
"prun", "PSVR", OPAL_VERSION,
|
||||||
|
"prun", args,
|
||||||
|
PACKAGE_BUGREPORT);
|
||||||
|
if (NULL != str) {
|
||||||
|
printf("%s", str);
|
||||||
|
free(str);
|
||||||
|
}
|
||||||
|
free(args);
|
||||||
|
|
||||||
|
/* If someone asks for help, that should be all we do */
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* tell the ess/tool component that we want to connect only to a system-level
|
||||||
* PMIx server */
|
* PMIx server */
|
||||||
opal_setenv("OMPI_MCA_ess_tool_system_server_only", "1", true, &environ);
|
if (orte_cmd_options.system_server_only) {
|
||||||
|
opal_setenv(OPAL_MCA_PREFIX"ess_tool_system_server_only", "1", true, &environ);
|
||||||
|
}
|
||||||
|
if (orte_cmd_options.system_server_first) {
|
||||||
|
opal_setenv(OPAL_MCA_PREFIX"ess_tool_system_server_first", "1", true, &environ);
|
||||||
|
}
|
||||||
|
/* if they specified the DVM's pid, then pass it along */
|
||||||
|
if (0 != orte_cmd_options.pid) {
|
||||||
|
asprintf(&param, "%d", orte_cmd_options.pid);
|
||||||
|
opal_setenv(OPAL_MCA_PREFIX"ess_tool_server_pid", param, true, &environ);
|
||||||
|
free(param);
|
||||||
|
}
|
||||||
|
|
||||||
/* now initialize ORTE */
|
/* now initialize ORTE */
|
||||||
if (OPAL_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
|
if (OPAL_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
|
||||||
@ -665,7 +713,6 @@ int prun(int argc, char *argv[])
|
|||||||
val->type = OPAL_BOOL;
|
val->type = OPAL_BOOL;
|
||||||
val->data.flag = true;
|
val->data.flag = true;
|
||||||
opal_list_append(&info, &val->super);
|
opal_list_append(&info, &val->super);
|
||||||
|
|
||||||
fprintf(stderr, "TERMINATING DVM...");
|
fprintf(stderr, "TERMINATING DVM...");
|
||||||
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
||||||
rc = opal_pmix.job_control(NULL, &info, infocb, (void*)&lock);
|
rc = opal_pmix.job_control(NULL, &info, infocb, (void*)&lock);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user