Cleanup launch
Given that we only set OOB contact info from inside of events, or before we begin threaded operations (e.g., in the ess), allow set_contact_info to directly update the oob/base framework globals. Correct the nidmap regex decompression routine. Ensure that rank=1 daemon always sends back its topology as this is the most common use-case. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
6c6687116c
Коммит
399de0738e
@ -126,7 +126,7 @@ ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
|
|||||||
orte_oob_base_send_nb, cd); \
|
orte_oob_base_send_nb, cd); \
|
||||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
|
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
|
||||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
|
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
|
||||||
}while(0);
|
}while(0)
|
||||||
|
|
||||||
/* Our contact info is actually subject to change as transports
|
/* Our contact info is actually subject to change as transports
|
||||||
* can fail at any time. So a request to obtain our URI requires
|
* can fail at any time. So a request to obtain our URI requires
|
||||||
@ -175,12 +175,9 @@ OBJ_CLASS_DECLARATION(mca_oob_uri_req_t);
|
|||||||
mca_oob_uri_req_t *rq; \
|
mca_oob_uri_req_t *rq; \
|
||||||
rq = OBJ_NEW(mca_oob_uri_req_t); \
|
rq = OBJ_NEW(mca_oob_uri_req_t); \
|
||||||
rq->uri = strdup((u)); \
|
rq->uri = strdup((u)); \
|
||||||
opal_event_set(orte_oob_base.ev_base, &(rq)->ev, -1, \
|
orte_oob_base_set_addr(0, 0, (void*)rq); \
|
||||||
OPAL_EV_WRITE, \
|
}while(0)
|
||||||
orte_oob_base_set_addr, (rq)); \
|
|
||||||
opal_event_set_priority(&(rq)->ev, ORTE_MSG_PRI); \
|
|
||||||
opal_event_active(&(rq)->ev, OPAL_EV_WRITE, 1); \
|
|
||||||
}while(0);
|
|
||||||
ORTE_DECLSPEC void orte_oob_base_set_addr(int fd, short args, void *cbdata);
|
ORTE_DECLSPEC void orte_oob_base_set_addr(int fd, short args, void *cbdata);
|
||||||
|
|
||||||
|
|
||||||
|
@ -109,12 +109,6 @@ static int orte_oob_base_close(void)
|
|||||||
|
|
||||||
OBJ_DESTRUCT(&orte_oob_base.peers);
|
OBJ_DESTRUCT(&orte_oob_base.peers);
|
||||||
|
|
||||||
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
|
|
||||||
opal_progress_thread_finalize(NULL);
|
|
||||||
} else {
|
|
||||||
opal_progress_thread_finalize("OOB-BASE");
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_TIMING_EVENT((&tm_oob, "Finish"));
|
OPAL_TIMING_EVENT((&tm_oob, "Finish"));
|
||||||
OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob);
|
OPAL_TIMING_REPORT(orte_oob_base.timing, &tm_oob);
|
||||||
|
|
||||||
@ -133,11 +127,7 @@ static int orte_oob_base_open(mca_base_open_flag_t flags)
|
|||||||
opal_hash_table_init(&orte_oob_base.peers, 128);
|
opal_hash_table_init(&orte_oob_base.peers, 128);
|
||||||
OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);
|
OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);
|
||||||
|
|
||||||
if (ORTE_PROC_IS_APP || ORTE_PROC_IS_TOOL) {
|
orte_oob_base.ev_base = orte_event_base;
|
||||||
orte_oob_base.ev_base = opal_progress_thread_init(NULL);
|
|
||||||
} else {
|
|
||||||
orte_oob_base.ev_base = opal_progress_thread_init("OOB-BASE");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_CR == 1
|
#if OPAL_ENABLE_FT_CR == 1
|
||||||
|
@ -835,7 +835,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
|
|||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:base:daemon_topology for daemon %s",
|
"%s plm:base:daemon_topology recvd for daemon %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(sender)));
|
ORTE_NAME_PRINT(sender)));
|
||||||
|
|
||||||
@ -938,7 +938,7 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
|
|||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:base:orted_report_launch %s for daemon %s",
|
"%s plm:base:orted:report_topo launch %s for daemon %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
orted_failed_launch ? "failed" : "completed",
|
orted_failed_launch ? "failed" : "completed",
|
||||||
ORTE_NAME_PRINT(sender)));
|
ORTE_NAME_PRINT(sender)));
|
||||||
@ -985,7 +985,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
char *rml_uri = NULL, *ptr;
|
char *rml_uri = NULL, *ptr;
|
||||||
int rc, idx;
|
int rc, idx;
|
||||||
orte_proc_t *daemon=NULL;
|
orte_proc_t *daemon=NULL;
|
||||||
orte_node_t *node;
|
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_process_name_t dname;
|
orte_process_name_t dname;
|
||||||
opal_buffer_t *relay;
|
opal_buffer_t *relay;
|
||||||
@ -994,7 +993,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
hwloc_topology_t topo;
|
hwloc_topology_t topo;
|
||||||
int i;
|
int i;
|
||||||
bool found;
|
bool found;
|
||||||
orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
|
orte_daemon_cmd_flag_t cmd;
|
||||||
|
|
||||||
/* get the daemon job, if necessary */
|
/* get the daemon job, if necessary */
|
||||||
if (NULL == jdatorted) {
|
if (NULL == jdatorted) {
|
||||||
@ -1054,8 +1053,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&daemon->name), nodename));
|
ORTE_NAME_PRINT(&daemon->name), nodename));
|
||||||
|
|
||||||
node = daemon->node;
|
|
||||||
|
|
||||||
/* look this node up, if necessary */
|
/* look this node up, if necessary */
|
||||||
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
|
if (!orte_plm_globals.daemon_nodes_assigned_at_launch) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
@ -1067,21 +1064,11 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
free(daemon->node->name);
|
free(daemon->node->name);
|
||||||
daemon->node->name = strdup(nodename);
|
daemon->node->name = strdup(nodename);
|
||||||
/* mark that it was verified */
|
/* mark that it was verified */
|
||||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_LOC_VERIFIED);
|
ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_LOC_VERIFIED);
|
||||||
}
|
|
||||||
|
|
||||||
if (NULL == node) {
|
|
||||||
/* this shouldn't happen - it indicates an error in the
|
|
||||||
* prior node matching logic, so report it and error out
|
|
||||||
*/
|
|
||||||
orte_show_help("help-plm-base.txt", "daemon-no-assigned-node", true,
|
|
||||||
ORTE_NAME_PRINT(&daemon->name), nodename);
|
|
||||||
orted_failed_launch = true;
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* mark the daemon as launched */
|
/* mark the daemon as launched */
|
||||||
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
|
ORTE_FLAG_SET(daemon->node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
|
||||||
|
|
||||||
if (orte_retain_aliases) {
|
if (orte_retain_aliases) {
|
||||||
char *alias, **atmp=NULL;
|
char *alias, **atmp=NULL;
|
||||||
@ -1113,7 +1100,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
alias = opal_argv_join(atmp, ',');
|
alias = opal_argv_join(atmp, ',');
|
||||||
opal_argv_free(atmp);
|
opal_argv_free(atmp);
|
||||||
orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
|
orte_set_attribute(&daemon->node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, alias, OPAL_STRING);
|
||||||
free(alias);
|
free(alias);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1130,7 +1117,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
|
|
||||||
/* rank=1 always sends its topology back */
|
/* rank=1 always sends its topology back */
|
||||||
topo = NULL;
|
topo = NULL;
|
||||||
if (1 == sender->vpid) {
|
if (1 == dname.vpid) {
|
||||||
idx=1;
|
idx=1;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
@ -1151,7 +1138,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
"%s TOPOLOGY ALREADY RECORDED",
|
"%s TOPOLOGY ALREADY RECORDED",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
found = true;
|
found = true;
|
||||||
node->topology = t;
|
daemon->node->topology = t;
|
||||||
if (NULL != topo) {
|
if (NULL != topo) {
|
||||||
hwloc_topology_destroy(topo);
|
hwloc_topology_destroy(topo);
|
||||||
}
|
}
|
||||||
@ -1167,12 +1154,18 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
t = OBJ_NEW(orte_topology_t);
|
t = OBJ_NEW(orte_topology_t);
|
||||||
t->sig = sig;
|
t->sig = sig;
|
||||||
opal_pointer_array_add(orte_node_topologies, t);
|
opal_pointer_array_add(orte_node_topologies, t);
|
||||||
node->topology = t;
|
daemon->node->topology = t;
|
||||||
if (NULL != topo) {
|
if (NULL != topo) {
|
||||||
t->topo = topo;
|
t->topo = topo;
|
||||||
} else {
|
} else {
|
||||||
|
/* nope - save the signature and request the complete topology from that node */
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||||
|
"%s REQUESTING TOPOLOGY FROM %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&dname)));
|
||||||
/* construct the request */
|
/* construct the request */
|
||||||
relay = OBJ_NEW(opal_buffer_t);
|
relay = OBJ_NEW(opal_buffer_t);
|
||||||
|
cmd = ORTE_DAEMON_REPORT_TOPOLOGY_CMD;
|
||||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
|
if (OPAL_SUCCESS != (rc = opal_dss.pack(relay, &cmd, 1, ORTE_DAEMON_CMD))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
OBJ_RELEASE(relay);
|
OBJ_RELEASE(relay);
|
||||||
@ -1181,7 +1174,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
|||||||
}
|
}
|
||||||
/* send it */
|
/* send it */
|
||||||
orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||||
sender, relay,
|
&dname, relay,
|
||||||
ORTE_RML_TAG_DAEMON,
|
ORTE_RML_TAG_DAEMON,
|
||||||
orte_rml_send_callback, NULL);
|
orte_rml_send_callback, NULL);
|
||||||
/* we will count this node as completed
|
/* we will count this node as completed
|
||||||
|
@ -761,8 +761,10 @@ int orte_daemon(int argc, char *argv[])
|
|||||||
|
|
||||||
/* if we are rank=1, then send our topology back - otherwise, mpirun
|
/* if we are rank=1, then send our topology back - otherwise, mpirun
|
||||||
* will request it if necessary */
|
* will request it if necessary */
|
||||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
|
if (1 == ORTE_PROC_MY_NAME->vpid) {
|
||||||
ORTE_ERROR_LOG(ret);
|
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* send to the HNP's callback - will be routed if routes are available */
|
/* send to the HNP's callback - will be routed if routes are available */
|
||||||
|
@ -516,9 +516,9 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
|
|||||||
/* decode a nodemap for a daemon */
|
/* decode a nodemap for a daemon */
|
||||||
int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
||||||
{
|
{
|
||||||
int k, m, n, rc, start, endpt;
|
int m, n, rc;
|
||||||
orte_node_t *node;
|
orte_node_t *node;
|
||||||
size_t num_nodes;
|
size_t k, num_nodes, endpt;
|
||||||
orte_job_t *daemons;
|
orte_job_t *daemons;
|
||||||
orte_proc_t *dptr;
|
orte_proc_t *dptr;
|
||||||
char **nodes, *indices, *dvpids;
|
char **nodes, *indices, *dvpids;
|
||||||
@ -601,16 +601,16 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
|||||||
for (n=0; NULL != tmp[n]; n++) {
|
for (n=0; NULL != tmp[n]; n++) {
|
||||||
/* convert the number - since it might be a range,
|
/* convert the number - since it might be a range,
|
||||||
* save the remainder pointer */
|
* save the remainder pointer */
|
||||||
nodeids[k] = strtoul(tmp[n], &rmndr, 10);
|
nodeids[k++] = strtoul(tmp[n], &rmndr, 10);
|
||||||
if (NULL != rmndr) {
|
if (NULL != rmndr) {
|
||||||
/* it must be a range - find the endpoint */
|
/* it must be a range - find the endpoint */
|
||||||
++rmndr;
|
++rmndr;
|
||||||
|
m = nodeids[k-1] + 1;
|
||||||
endpt = strtoul(rmndr, NULL, 10);
|
endpt = strtoul(rmndr, NULL, 10);
|
||||||
start = nodeids[k] + 1;
|
while (k <= endpt && k < num_nodes) {
|
||||||
for (m=0; m < endpt; m++) {
|
nodeids[k++] = m++;
|
||||||
++k;
|
|
||||||
nodeids[k] = start + m;
|
|
||||||
}
|
}
|
||||||
|
--k; // step back to compensate for later increment
|
||||||
}
|
}
|
||||||
++k;
|
++k;
|
||||||
}
|
}
|
||||||
@ -624,16 +624,16 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer)
|
|||||||
for (n=0; NULL != tmp[n]; n++) {
|
for (n=0; NULL != tmp[n]; n++) {
|
||||||
/* convert the number - since it might be a range,
|
/* convert the number - since it might be a range,
|
||||||
* save the remainder pointer */
|
* save the remainder pointer */
|
||||||
dids[k] = strtoul(tmp[n], &rmndr, 10);
|
dids[k++] = strtoul(tmp[n], &rmndr, 10);
|
||||||
if (NULL != rmndr) {
|
if (NULL != rmndr) {
|
||||||
/* it must be a range - find the endpoint */
|
/* it must be a range - find the endpoint */
|
||||||
++rmndr;
|
++rmndr;
|
||||||
endpt = strtoul(rmndr, NULL, 10);
|
endpt = strtoul(rmndr, NULL, 10);
|
||||||
start = dids[k] + 1;
|
m = dids[k-1] + 1;
|
||||||
for (m=0; m < endpt; m++) {
|
while (k <= endpt && k < num_nodes) {
|
||||||
++k;
|
dids[k++] = m++;
|
||||||
dids[k] = start + m;
|
|
||||||
}
|
}
|
||||||
|
--k; // step back to compensate for later increment
|
||||||
}
|
}
|
||||||
++k;
|
++k;
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user