orted: fix tree-spawn when the node regex is too long
When the node regex is too long to be sent on the command line, retrieve it first from the parent, and then spawn the remote orted. Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Этот коммит содержится в:
родитель
799152e7fb
Коммит
4527584840
@ -1565,16 +1565,19 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL != orte_node_regex) {
|
||||
free(orte_node_regex);
|
||||
}
|
||||
orte_node_regex = param;
|
||||
/* if this is too long, then we'll have to do it with
|
||||
* a phone home operation instead */
|
||||
if (strlen(param) < orte_plm_globals.node_regex_threshold) {
|
||||
opal_argv_append(argc, argv, "-"OPAL_MCA_CMD_LINE_ID);
|
||||
opal_argv_append(argc, argv, "orte_node_regex");
|
||||
opal_argv_append(argc, argv, param);
|
||||
opal_argv_append(argc, argv, orte_node_regex);
|
||||
/* mark that the nidmap has been communicated */
|
||||
orte_nidmap_communicated = true;
|
||||
}
|
||||
free(param);
|
||||
|
||||
if (!orte_static_ports && !orte_fwd_mpirun_port) {
|
||||
/* if we are using static ports, or we are forwarding
|
||||
|
@ -825,7 +825,6 @@ static int remote_spawn(opal_buffer_t *launch)
|
||||
prefix = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* get the updated routing list */
|
||||
rtmod = orte_rml.get_routed(orte_coll_conduit);
|
||||
OBJ_CONSTRUCT(&coll, opal_list_t);
|
||||
|
@ -13,6 +13,8 @@
|
||||
* Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -172,8 +174,32 @@ void orte_rml_base_process_msg(int fd, short flags, void *cbdata)
|
||||
|
||||
/* if this message is just to warmup the connection, then drop it */
|
||||
if (ORTE_RML_TAG_WARMUP_CONNECTION == msg->tag) {
|
||||
OBJ_RELEASE(msg);
|
||||
return;
|
||||
if (!orte_nidmap_communicated) {
|
||||
opal_buffer_t * buffer = OBJ_NEW(opal_buffer_t);
|
||||
int rc;
|
||||
if (NULL == buffer) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return;
|
||||
}
|
||||
assert (NULL != orte_node_regex);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &orte_node_regex, 1, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
&msg->sender, buffer,
|
||||
ORTE_RML_TAG_NODE_REGEX_REPORT,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(buffer);
|
||||
return;
|
||||
}
|
||||
OBJ_RELEASE(msg);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* see if we have a waiting recv for this message */
|
||||
|
@ -13,6 +13,8 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -175,6 +177,9 @@ BEGIN_C_DECLS
|
||||
/* warmup connection - simply establishes the connection */
|
||||
#define ORTE_RML_TAG_WARMUP_CONNECTION 63
|
||||
|
||||
/* node regex report */
|
||||
#define ORTE_RML_TAG_NODE_REGEX_REPORT 64
|
||||
|
||||
#define ORTE_RML_TAG_MAX 100
|
||||
|
||||
|
||||
|
@ -117,8 +117,14 @@ static void pipe_closed(int fd, short flags, void *arg);
|
||||
static void rollup(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata);
|
||||
static void node_regex_report(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata);
|
||||
static void report_orted(void);
|
||||
|
||||
static opal_buffer_t *bucket, *mybucket = NULL;
|
||||
static int ncollected = 0;
|
||||
static bool node_regex_waiting = false;
|
||||
|
||||
static char *orte_parent_uri = NULL;
|
||||
|
||||
@ -734,6 +740,11 @@ int orte_daemon(int argc, char *argv[])
|
||||
* a little time in the launch phase by "warming up" the
|
||||
* connection to our parent while we wait for our children */
|
||||
buffer = OBJ_NEW(opal_buffer_t); // zero-byte message
|
||||
if (NULL == orte_node_regex) {
|
||||
orte_rml.recv_buffer_nb(ORTE_PROC_MY_PARENT, ORTE_RML_TAG_NODE_REGEX_REPORT,
|
||||
ORTE_RML_PERSISTENT, node_regex_report, &node_regex_waiting);
|
||||
node_regex_waiting = true;
|
||||
}
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
ORTE_PROC_MY_PARENT, buffer,
|
||||
ORTE_RML_TAG_WARMUP_CONNECTION,
|
||||
@ -969,8 +980,10 @@ int orte_daemon(int argc, char *argv[])
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
/* now launch any child daemons of ours */
|
||||
orte_plm.remote_spawn(orte_tree_launch_cmd);
|
||||
if (NULL != orte_node_regex) {
|
||||
/* now launch any child daemons of ours */
|
||||
orte_plm.remote_spawn(orte_tree_launch_cmd);
|
||||
}
|
||||
}
|
||||
|
||||
if (orte_debug_daemons_flag) {
|
||||
@ -1052,8 +1065,6 @@ static void rollup(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
int nreqd;
|
||||
char *rtmod;
|
||||
int ret;
|
||||
orte_process_name_t child;
|
||||
int32_t i, flag, cnt;
|
||||
@ -1095,10 +1106,17 @@ static void rollup(int status, orte_process_name_t* sender,
|
||||
}
|
||||
|
||||
report:
|
||||
report_orted();
|
||||
}
|
||||
|
||||
static void report_orted() {
|
||||
char *rtmod;
|
||||
int nreqd, ret;
|
||||
|
||||
/* get the number of children */
|
||||
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
|
||||
nreqd = orte_routed.num_routes(rtmod) + 1;
|
||||
if (nreqd == ncollected && NULL != mybucket) {
|
||||
if (nreqd == ncollected && NULL != mybucket && !node_regex_waiting) {
|
||||
/* add the collection of our children's buckets to ours */
|
||||
opal_dss.copy_payload(mybucket, bucket);
|
||||
OBJ_RELEASE(bucket);
|
||||
@ -1112,3 +1130,36 @@ static void rollup(int status, orte_process_name_t* sender,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void node_regex_report(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata) {
|
||||
int rc, n=1;
|
||||
char * regex;
|
||||
assert(NULL == orte_node_regex);
|
||||
bool * active = (bool *)cbdata;
|
||||
|
||||
/* extract the node regex if needed, and update the routing tree */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, ®ex, &n, OPAL_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
orte_node_regex = regex;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_util_nidmap_parse(orte_node_regex))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
/* update the routing tree so any tree spawn operation
|
||||
* properly gets the number of children underneath us */
|
||||
orte_routed.update_routing_plan(NULL);
|
||||
|
||||
*active = false;
|
||||
|
||||
/* now launch any child daemons of ours */
|
||||
orte_plm.remote_spawn(orte_tree_launch_cmd);
|
||||
|
||||
report_orted();
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user