1
1

Merge pull request #2282 from rhc54/topic/routed

Update ORTE to more fully support the new conduit messaging system
Этот коммит содержится в:
rhc54 2016-10-24 08:06:36 -07:00 коммит произвёл GitHub
родитель 055df6f7c6 649301a3a2
Коммит dae02c7e43
89 изменённых файлов: 1399 добавлений и 1973 удалений

Просмотреть файл

@ -13,7 +13,7 @@ enable_shared=yes
enable_static=no
enable_io_romio=no
enable_ipv6=no
enable_mpi_fortran=yes
enable_mpi_fortran=no
enable_mpi_cxx=no
enable_mpi_cxx_seek=no
enable_memchecker=no

Просмотреть файл

@ -1890,13 +1890,13 @@ static char *source_name(mca_base_var_t *var)
static int var_value_string (mca_base_var_t *var, char **value_string)
{
const mca_base_var_storage_t *value;
const mca_base_var_storage_t *value=NULL;
int ret;
assert (MCA_BASE_VAR_TYPE_MAX > var->mbv_type);
ret = mca_base_var_get_value(var->mbv_index, &value, NULL, NULL);
if (OPAL_SUCCESS !=ret) {
if (OPAL_SUCCESS != ret || NULL == value) {
return ret;
}

Просмотреть файл

@ -687,7 +687,7 @@ opal_pmix_data_range_t pmix3x_convert_range(pmix_data_range_t range) {
case PMIX_RANGE_CUSTOM:
return OPAL_PMIX_RANGE_CUSTOM;
default:
return OPAL_PMIX_SCOPE_UNDEF;
return OPAL_PMIX_RANGE_UNDEF;
}
}

Просмотреть файл

@ -687,7 +687,7 @@ opal_pmix_data_range_t pmix3x_convert_range(pmix_data_range_t range) {
case PMIX_RANGE_CUSTOM:
return OPAL_PMIX_RANGE_CUSTOM;
default:
return OPAL_PMIX_SCOPE_UNDEF;
return OPAL_PMIX_RANGE_UNDEF;
}
}

Просмотреть файл

@ -14,6 +14,7 @@
* Copyright (c) 2010-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2012 University of Houston. All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -626,12 +627,12 @@ static void opal_info_show_mca_group_params(const mca_base_var_group_t *group, m
/* read the selection parameter */
var_id = mca_base_var_find (group->group_project, group->group_framework, NULL, NULL);
if (0 <= var_id) {
const mca_base_var_storage_t *value;
const mca_base_var_storage_t *value=NULL;
char **requested_components;
bool include_mode;
mca_base_var_get_value (var_id, &value, NULL, NULL);
if (NULL != value->stringval && '\0' != value->stringval[0]) {
if (NULL != value && NULL != value->stringval && '\0' != value->stringval[0]) {
mca_base_component_parse_requested (value->stringval, &include_mode, &requested_components);
for (i = 0, requested = !include_mode ; requested_components[i] ; ++i) {

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -615,7 +615,8 @@ static void process_opens(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&daemon),
filename);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -721,7 +722,8 @@ static void process_close(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -843,7 +845,8 @@ static void process_sizes(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -997,7 +1000,8 @@ static void process_seeks(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1109,7 +1113,8 @@ static void process_reads(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1176,7 +1181,8 @@ static void process_posts(int fd, short args, void *cbdata)
goto error;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_DAEMON, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1237,7 +1243,8 @@ static void process_getfm(int fd, short args, void *cbdata)
goto error;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_DAEMON, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -414,7 +414,8 @@ static void process_opens(int fd, short args, void *cbdata)
free(filename);
filename = NULL;
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&node->daemon->name, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&node->daemon->name, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -525,7 +526,8 @@ static void process_close(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -630,7 +632,8 @@ static void process_sizes(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -778,7 +781,8 @@ static void process_seeks(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -888,7 +892,8 @@ static void process_reads(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1385,7 +1390,8 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
return;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1473,7 +1479,8 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
return;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1582,7 +1589,8 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)bytes_read,
ORTE_NAME_PRINT(sender));
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1696,7 +1704,8 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)bytes_read,
ORTE_NAME_PRINT(sender));
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1748,7 +1757,8 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
ORTE_ERROR_LOG(rc);
return;
}
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1917,7 +1927,8 @@ static void recv_dfs_cmd(int status, orte_process_name_t* sender,
"%s getf-cmd: returning %d maps with %d bytes to sender %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nmaps,
(int)answer->bytes_used, ORTE_NAME_PRINT(sender));
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -2172,7 +2183,8 @@ static void remote_open(int fd, short args, void *cbdata)
return;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&req->trk->requestor, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->trk->requestor, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -2213,7 +2225,8 @@ static void remote_size(int fd, short args, void *cbdata)
return;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&req->trk->requestor, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->trk->requestor, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -2277,7 +2290,8 @@ static void remote_seek(int fd, short args, void *cbdata)
return;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&req->trk->requestor, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->trk->requestor, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -2344,7 +2358,8 @@ static void remote_read(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(long)bytes_read,
ORTE_NAME_PRINT(&req->trk->requestor));
if (0 > (rc = orte_rml.send_buffer_nb(&req->trk->requestor, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->trk->requestor, answer,
ORTE_RML_TAG_DFS_DATA,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -529,7 +529,8 @@ static void process_opens(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&daemon),
filename);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -637,7 +638,8 @@ static void process_close(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -737,7 +739,8 @@ static void process_sizes(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -847,7 +850,8 @@ static void process_seeks(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -943,7 +947,8 @@ static void process_reads(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&trk->host_daemon),
trk->local_fd);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(&trk->host_daemon, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&trk->host_daemon, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1010,7 +1015,8 @@ static void process_posts(int fd, short args, void *cbdata)
goto error;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_DAEMON, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -1071,7 +1077,8 @@ static void process_getfm(int fd, short args, void *cbdata)
goto error;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_DAEMON, buffer,
ORTE_RML_TAG_DFS_CMD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -9,7 +9,7 @@
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -31,6 +31,7 @@
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -208,7 +209,8 @@ static void job_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
@ -288,6 +290,7 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_proc_state_t state = caddy->proc_state;
int i;
int32_t i32, *i32ptr;
char *rtmod;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_hnp: for proc %s state %s",
@ -308,6 +311,7 @@ static void proc_errors(int fd, short args, void *cbdata)
goto cleanup;
}
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
/* we MUST handle a communication failure before doing anything else
* as it requires some special care to avoid normal termination issues
@ -338,9 +342,9 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s Comm failure: daemons terminating - recording daemon %s as gone",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
orte_routed.route_lost(rtmod, proc);
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
@ -361,7 +365,7 @@ static void proc_errors(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes()));
(int)orte_routed.num_routes(rtmod)));
}
goto cleanup;
}
@ -410,7 +414,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:hnp all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -627,7 +631,7 @@ static void proc_errors(int fd, short args, void *cbdata)
default_hnp_abort(jdata);
}
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
orte_routed.route_lost(rtmod, proc);
break;
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:

Просмотреть файл

@ -34,6 +34,7 @@
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -191,7 +192,8 @@ static void job_errors(int fd, short args, void *cbdata)
goto cleanup;
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -208,7 +210,7 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_job_t *jdata;
orte_process_name_t *proc = &caddy->name;
orte_proc_state_t state = caddy->proc_state;
char *rtmod;
orte_proc_t *child, *ptr;
opal_buffer_t *alert;
orte_plm_cmd_flag_t cmd;
@ -267,6 +269,9 @@ static void proc_errors(int fd, short args, void *cbdata)
goto cleanup;
}
/* get our management conduit's routed module name */
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
if (ORTE_PROC_STATE_COMM_FAILED == state) {
/* if it is our own connection, ignore it */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
@ -336,7 +341,8 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s errmgr:default_orted reporting lost connection to daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -364,7 +370,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -373,7 +379,7 @@ static void proc_errors(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted not exiting, num_routes() == %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes()));
(int)orte_routed.num_routes(rtmod)));
}
}
/* if not, then we can continue */
@ -433,7 +439,8 @@ static void proc_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&child->name),
jdata->num_local_procs));
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -490,7 +497,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:orted all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -532,7 +539,8 @@ static void proc_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&child->name),
jdata->num_local_procs));
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -587,7 +595,8 @@ static void proc_errors(int fd, short args, void *cbdata)
OBJ_RELEASE(jdata);
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -31,6 +31,7 @@
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
@ -213,9 +214,10 @@ static void job_errors(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
@ -288,6 +290,7 @@ static void proc_errors(int fd, short args, void *cbdata)
orte_proc_state_t state = caddy->proc_state;
int i;
int32_t i32, *i32ptr;
char *rtmod;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm: for proc %s state %s",
@ -309,6 +312,9 @@ static void proc_errors(int fd, short args, void *cbdata)
}
pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
/* get the management conduit's routed module */
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
/* we MUST handle a communication failure before doing anything else
* as it requires some special care to avoid normal termination issues
* for local application procs
@ -342,9 +348,9 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s Comm failure: daemons terminating - recording daemon %s as gone",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
orte_routed.route_lost(rtmod, proc);
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
@ -365,7 +371,7 @@ static void proc_errors(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes()));
(int)orte_routed.num_routes(rtmod)));
}
goto cleanup;
}
@ -412,7 +418,7 @@ static void proc_errors(int fd, short args, void *cbdata)
}
/* if all my routes and children are gone, then terminate
ourselves nicely (i.e., this is a normal termination) */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:default:dvm all routes gone - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -551,9 +557,10 @@ static void proc_errors(int fd, short args, void *cbdata)
opal_dss.pack(answer, &pptr->node, 1, ORTE_NODE);
}
/* return response */
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
@ -639,7 +646,7 @@ static void proc_errors(int fd, short args, void *cbdata)
_terminate_job(jdata->jobid);
}
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
orte_routed.route_lost(rtmod, proc);
break;
case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:

Просмотреть файл

@ -202,7 +202,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
goto error;
}
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;

Просмотреть файл

@ -115,6 +115,7 @@ int orte_ess_base_orted_setup(char **hosts)
char *param;
hwloc_obj_t obj;
unsigned i, j;
opal_list_t transports;
/* my name is set, xfer it to the OPAL layer */
orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
@ -427,14 +428,26 @@ int orte_ess_base_orted_setup(char **hosts)
error = "orte_routed_base_select";
goto error;
}
/* setup the routed info - the selected routed component
* will know what to do.
*/
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;
}
/* get a conduit for our use - we never route IO over fabric */
OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
OPAL_LIST_DESTRUCT(&transports);
OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
orte_coll_conduit = orte_rml.open_conduit(&transports);
OPAL_LIST_DESTRUCT(&transports);
/*
* Group communications
*/
@ -482,7 +495,7 @@ int orte_ess_base_orted_setup(char **hosts)
/* define the routing tree so we know the pattern
* if we are trying to setup common or static ports
*/
orte_routed.update_routing_plan();
orte_routed.update_routing_plan(NULL);
/* extract the node info from the environment and
* build a nidmap from it
*/
@ -497,7 +510,7 @@ int orte_ess_base_orted_setup(char **hosts)
* to mpirun goes through the tree if static ports were enabled - still
* need to do it anyway just to initialize things
*/
orte_routed.update_routing_plan();
orte_routed.update_routing_plan(NULL);
/* Now provide a chance for the PLM
* to perform any module-specific init functions. This
@ -514,10 +527,8 @@ int orte_ess_base_orted_setup(char **hosts)
}
}
/* setup the routed info - the selected routed component
* will know what to do.
*/
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;
@ -627,6 +638,10 @@ int orte_ess_base_orted_finalize(void)
pmix_server_finalize();
(void) mca_base_framework_close(&opal_pmix_base_framework);
/* release the conduits */
orte_rml.close_conduit(orte_mgmt_conduit);
orte_rml.close_conduit(orte_coll_conduit);
/* close frameworks */
(void) mca_base_framework_close(&orte_filem_base_framework);
(void) mca_base_framework_close(&orte_grpcomm_base_framework);

Просмотреть файл

@ -66,6 +66,7 @@ int orte_ess_base_tool_setup(void)
{
int ret;
char *error = NULL;
opal_list_t transports;
/* my name is set, xfer it to the OPAL layer */
orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
@ -97,6 +98,17 @@ int orte_ess_base_tool_setup(void)
goto error;
}
/* Setup the communication infrastructure */
/* Routed system */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_select";
goto error;
}
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_open";
@ -118,17 +130,14 @@ int orte_ess_base_tool_setup(void)
error = "orte_rml_base_select";
goto error;
}
/* Routed system */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_select";
goto error;
}
/* get a conduit for our use - we never route IO over fabric */
OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
OPAL_LIST_DESTRUCT(&transports);
/* since I am a tool, then all I really want to do is communicate.
* So setup communications and be done - finding the HNP
* to which I want to communicate and setting up a route for
@ -148,10 +157,8 @@ int orte_ess_base_tool_setup(void)
goto error;
}
/* setup the routed info - the selected routed component
* will know what to do.
*/
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
/* setup the routed info for all components */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;
@ -230,6 +237,8 @@ int orte_ess_base_tool_finalize(void)
mca_base_framework_close(&orte_sstore_base_framework);
#endif
orte_rml.close_conduit(orte_mgmt_conduit);
/* if I am a tool, then all I will have done is
* a very small subset of orte_init - ensure that
* I only back those elements out

Просмотреть файл

@ -148,6 +148,7 @@ static int rte_init(void)
uint32_t h;
int idx;
orte_topology_t *t;
opal_list_t transports;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@ -311,6 +312,19 @@ static int rte_init(void)
}
/* Setup the communication infrastructure */
/*
* Routed system
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_select";
goto error;
}
/*
* OOB Layer
*/
@ -335,6 +349,35 @@ static int rte_init(void)
goto error;
}
/* get a conduit for our use - we never route IO over fabric */
OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
orte_mgmt_conduit = orte_rml.open_conduit(&transports);
OPAL_LIST_DESTRUCT(&transports);
OBJ_CONSTRUCT(&transports, opal_list_t);
orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
orte_coll_conduit = orte_rml.open_conduit(&transports);
OPAL_LIST_DESTRUCT(&transports);
/*
* Group communications
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_select";
goto error;
}
/* setup the error manager */
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
error = "orte_errmgr_base_select";
goto error;
@ -426,32 +469,7 @@ static int rte_init(void)
jdata->state = ORTE_JOB_STATE_RUNNING;
/* obviously, we have "reported" */
jdata->num_reported = 1;
/*
* Routed system
*/
/* Open the routed framework. The string stored in "error" names the step
* that failed (presumably reported at the error: label, which is not
* visible here), so it identifies the routed framework open. */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_select";
goto error;
}
/*
* Group communications
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_select";
goto error;
}
/* Now provide a chance for the PLM
* to perform any module-specific init functions. This
* needs to occur AFTER the communications are setup
@ -615,10 +633,8 @@ static int rte_init(void)
/* set the event base */
opal_pmix_base_set_evbase(orte_event_base);
/* setup the routed info - the selected routed component
* will know what to do.
*/
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(NULL, ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;
@ -791,6 +807,11 @@ static int rte_finalize(void)
/* output any lingering stdout/err data */
fflush(stdout);
fflush(stderr);
/* release the conduits */
orte_rml.close_conduit(orte_mgmt_conduit);
orte_rml.close_conduit(orte_coll_conduit);
(void) mca_base_framework_close(&orte_iof_base_framework);
(void) mca_base_framework_close(&orte_rtc_base_framework);
(void) mca_base_framework_close(&orte_odls_base_framework);

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -206,7 +207,8 @@ static void filem_base_process_get_proc_node_name_cmd(orte_process_name_t* sende
return;
}
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_FILEM_BASE_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -296,7 +298,8 @@ static void filem_base_process_get_remote_path_cmd(orte_process_name_t* sender,
goto CLEANUP;
}
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer,
ORTE_RML_TAG_FILEM_BASE_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -914,7 +914,8 @@ static void send_complete(char *file, int status)
OBJ_RELEASE(buf);
return;
}
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_FILEM_BASE_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -36,7 +36,7 @@
#include "opal/mca/hwloc/hwloc.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/grpcomm/grpcomm.h"
@ -69,6 +69,7 @@ typedef struct {
opal_list_t actives;
opal_list_t ongoing;
opal_hash_table_t sig_table;
char *transports;
} orte_grpcomm_base_t;
ORTE_DECLSPEC extern orte_grpcomm_base_t orte_grpcomm_base;

Просмотреть файл

@ -92,7 +92,9 @@ static int orte_grpcomm_base_open(mca_base_open_flag_t flags)
return mca_base_framework_components_open(&orte_grpcomm_base_framework, flags);
}
MCA_BASE_FRAMEWORK_DECLARE(orte, grpcomm, NULL, NULL, orte_grpcomm_base_open, orte_grpcomm_base_close,
MCA_BASE_FRAMEWORK_DECLARE(orte, grpcomm, "GRPCOMM", NULL,
orte_grpcomm_base_open,
orte_grpcomm_base_close,
mca_grpcomm_base_static_components, 0);
OBJ_CLASS_INSTANCE(orte_grpcomm_base_active_t,

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -211,6 +212,7 @@ orte_grpcomm_coll_t* orte_grpcomm_base_get_tracker(orte_grpcomm_signature_t *sig
orte_namelist_t *nm;
opal_list_t children;
size_t n;
char *routed;
/* search the existing tracker list to see if this already exists */
OPAL_LIST_FOREACH(coll, &orte_grpcomm_base.ongoing, orte_grpcomm_coll_t) {
@ -257,30 +259,41 @@ orte_grpcomm_coll_t* orte_grpcomm_base_get_tracker(orte_grpcomm_signature_t *sig
ORTE_ERROR_LOG(rc);
return NULL;
}
/* cycle thru the array of daemons and compare them to our
* children in the routing tree, counting the ones that match
* so we know how many daemons we should receive contributions from */
OBJ_CONSTRUCT(&children, opal_list_t);
orte_routed.get_routing_list(&children);
while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&children))) {
/* get the routed module for our conduit */
routed = orte_rml.get_routed(orte_coll_conduit);
if (NULL == routed) {
/* this conduit is not routed, so we expect all daemons
* to directly participate */
coll->nexpected = coll->ndmns;
} else {
/* cycle thru the array of daemons and compare them to our
* children in the routing tree, counting the ones that match
* so we know how many daemons we should receive contributions from */
OBJ_CONSTRUCT(&children, opal_list_t);
orte_routed.get_routing_list(routed, &children);
while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&children))) {
for (n=0; n < coll->ndmns; n++) {
if (nm->name.vpid == coll->dmns[n]) {
coll->nexpected++;
break;
}
}
OBJ_RELEASE(nm);
}
OPAL_LIST_DESTRUCT(&children);
/* see if I am in the array of participants - note that I may
* be in the rollup tree even though I'm not participating
* in the collective itself */
for (n=0; n < coll->ndmns; n++) {
if (nm->name.vpid == coll->dmns[n]) {
if (coll->dmns[n] == ORTE_PROC_MY_NAME->vpid) {
coll->nexpected++;
break;
}
}
OBJ_RELEASE(nm);
}
OPAL_LIST_DESTRUCT(&children);
/* see if I am in the array of participants - note that I may
* be in the rollup tree even though I'm not participating
* in the collective itself */
for (n=0; n < coll->ndmns; n++) {
if (coll->dmns[n] == ORTE_PROC_MY_NAME->vpid) {
coll->nexpected++;
break;
}
}
return coll;
}

Просмотреть файл

@ -26,6 +26,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/state/state.h"
#include "orte/util/name_fns.h"
@ -112,7 +113,8 @@ static int xcast(orte_vpid_t *vpids,
/* send it to the HNP (could be myself) for relay */
OBJ_RETAIN(buf); // we'll let the RML release it
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_XCAST,
if (0 > (rc = orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_XCAST,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
@ -152,7 +154,8 @@ static int allgather(orte_grpcomm_coll_t *coll,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* send the info to ourselves for tracking */
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay,
rc = orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_NAME, relay,
ORTE_RML_TAG_ALLGATHER_DIRECT,
orte_rml_send_callback, NULL);
return rc;
@ -243,7 +246,8 @@ static void allgather_recv(int status, orte_process_name_t* sender,
/* transfer the collected bucket */
opal_dss.copy_payload(reply, &coll->bucket);
/* send the info to our parent */
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_PARENT, reply,
rc = orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_PARENT, reply,
ORTE_RML_TAG_ALLGATHER_DIRECT,
orte_rml_send_callback, NULL);
}
@ -268,6 +272,7 @@ static void xcast_recv(int status, orte_process_name_t* sender,
opal_list_t coll;
orte_grpcomm_signature_t *sig;
orte_rml_tag_t tag;
char *rtmod;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:direct:xcast:recv: with %d bytes",
@ -302,6 +307,9 @@ static void xcast_recv(int status, orte_process_name_t* sender,
/* setup the relay list */
OBJ_CONSTRUCT(&coll, opal_list_t);
/* get our conduit's routed module name */
rtmod = orte_rml.get_routed(orte_coll_conduit);
/* if this is headed for the daemon command processor,
* then we first need to check for add_local_procs
* as that command includes some needed wireup info */
@ -342,7 +350,7 @@ static void xcast_recv(int status, orte_process_name_t* sender,
}
/* update the routing plan */
orte_routed.update_routing_plan();
orte_routed.update_routing_plan(rtmod);
/* see if we have wiring info as well */
cnt=1;
@ -378,7 +386,7 @@ static void xcast_recv(int status, orte_process_name_t* sender,
OBJ_CONSTRUCT(&wireup, opal_buffer_t);
opal_dss.load(&wireup, bo->bytes, bo->size);
/* pass it for processing */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, &wireup))) {
if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(&wireup))) {
ORTE_ERROR_LOG(ret);
OBJ_DESTRUCT(&wireup);
goto relay;
@ -401,7 +409,7 @@ static void xcast_recv(int status, orte_process_name_t* sender,
relay:
/* get the list of next recipients from the routed module */
orte_routed.get_routing_list(&coll);
orte_routed.get_routing_list(rtmod, &coll);
/* if list is empty, no relay is required */
if (opal_list_is_empty(&coll)) {
@ -440,7 +448,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(item);
continue;
}
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(&nm->name, rly, ORTE_RML_TAG_XCAST,
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
&nm->name, rly, ORTE_RML_TAG_XCAST,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(rly);
@ -457,7 +466,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
/* now send the relay buffer to myself for processing */
if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, tag,
if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_NAME, relay, tag,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(relay);

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -159,7 +159,8 @@ static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name_
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer)));
if (0 > (rc = orte_rml.send_buffer_nb(peer, send_buf,
if (0 > (rc = orte_rml.send_buffer_nb(orte_coll_conduit,
peer, send_buf,
ORTE_RML_TAG_ALLGATHER_RCD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -50,6 +50,7 @@
#include "orte/mca/iof/iof.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml_types.h"
BEGIN_C_DECLS

Просмотреть файл

@ -37,6 +37,7 @@
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
@ -52,6 +53,122 @@
orte_iof_base_module_t orte_iof = {0};
/*
* Global variables
*/
orte_iof_base_t orte_iof_base = {0};
/*
* Register the MCA variables of the iof base framework.
*
* Sets orte_iof_base.output_limit and orte_iof_base.input_files to their
* defaults and registers both through mca_base_var_register(). The
* "flags" argument is not used in this function.
*
* Always returns ORTE_SUCCESS: the results of the two registration calls
* are explicitly discarded.
*/
static int orte_iof_base_register(mca_base_register_flag_t flags)
{
/* check for maximum number of pending output messages */
/* the default is INT_MAX, which the help text describes as "unlimited" */
orte_iof_base.output_limit = (size_t) INT_MAX;
(void) mca_base_var_register("orte", "iof", "base", "output_limit",
"Maximum backlog of output messages [default: unlimited]",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_iof_base.output_limit);
/* check for files to be sent to stdin of procs */
/* the default is NULL, i.e. no input files */
orte_iof_base.input_files = NULL;
(void) mca_base_var_register("orte", "iof","base", "input_files",
"Comma-separated list of input files to be read and sent to stdin of procs (default: NULL)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_iof_base.input_files);
return ORTE_SUCCESS;
}
/*
* Close the iof framework.
*
* Calls the finalize hook of the orte_iof module if one is set, then
* closes the framework's components. Returns whatever
* mca_base_framework_components_close() returns.
*/
static int orte_iof_base_close(void)
{
/* Close the selected component */
/* finalize is optional - only call it when the pointer has been set */
if (NULL != orte_iof.finalize) {
orte_iof.finalize();
}
return mca_base_framework_components_close(&orte_iof_base_framework, NULL);
}
/**
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
*
* Before the components are opened this function:
* - creates the directory part of orte_output_filename when one was given
* - defines the stdout (and, without XML output, stderr) write sinks,
* unless this process is a daemon
*
* Returns ORTE_ERR_OUT_OF_RESOURCE if opal_dirname() returns NULL, the
* error code of opal_os_dirpath_create() if the directory cannot be
* created, and otherwise the result of
* mca_base_framework_components_open().
*/
static int orte_iof_base_open(mca_base_open_flag_t flags)
{
int rc, xmlfd;
/* did the user request we print output to files? */
if (NULL != orte_output_filename) {
/* we will setup the files themselves as needed in the iof
* module. For now, let's see if the filename contains a
* path, or just a name
*/
char *path;
path = opal_dirname(orte_output_filename);
if (NULL == path) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* NOTE(review): this compares the directory part with the full name;
* confirm what opal_dirname() returns for a bare file name, since
* that decides whether this branch is taken in that case
*/
if (0 != strcmp(path, orte_output_filename)) {
/* there is a path in this name - ensure that the directory
* exists, and create it if not
*/
if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) {
/* path is owned by this function - release it before bailing out */
free(path);
return rc;
}
}
free(path);
}
/* daemons do not need to do this as they do not write out stdout/err */
if (!ORTE_PROC_IS_DAEMON) {
if (orte_xml_output) {
if (NULL != orte_xml_fp) {
/* user wants all xml-formatted output sent to file */
xmlfd = fileno(orte_xml_fp);
} else {
/* no xml file was given - use file descriptor 1 */
xmlfd = 1;
}
/* setup the stdout event */
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
xmlfd, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
/* don't create a stderr event - all output will go to
* the stdout channel
*/
} else {
/* setup the stdout event */
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
1, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
/* setup the stderr event */
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stderr, ORTE_PROC_MY_NAME,
2, ORTE_IOF_STDERR, orte_iof_base_write_handler);
}
/* do NOT set these file descriptors to non-blocking. If we do so,
* we set the file descriptor to non-blocking for everyone that has
* that file descriptor, which includes everyone else in our shell
* pipeline chain. (See
* http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
* This causes things like "mpirun -np 1 big_app | cat" to lose
* output, because cat's stdout is then ALSO non-blocking and cat
* isn't built to deal with that case (same with almost all other
* unix text utils).
*/
}
/* Open up all available components */
return mca_base_framework_components_open(&orte_iof_base_framework, flags);
}
/* Declare the orte iof framework, passing the register, open and close
* functions of this file together with the list of static components */
MCA_BASE_FRAMEWORK_DECLARE(orte, iof, "ORTE I/O Forwarding",
orte_iof_base_register, orte_iof_base_open, orte_iof_base_close,
mca_iof_base_static_components, 0);
/* class instances */
static void orte_iof_job_construct(orte_iof_job_t *ptr)
{
@ -195,119 +312,3 @@ OBJ_CLASS_INSTANCE(orte_iof_write_event_t,
OBJ_CLASS_INSTANCE(orte_iof_write_output_t,
opal_list_item_t,
NULL, NULL);
/*
* Global variables
*/
orte_iof_base_t orte_iof_base = {0};
/*
* Register the MCA variables of the iof base framework.
*
* Sets orte_iof_base.output_limit and orte_iof_base.input_files to their
* defaults and registers both through mca_base_var_register(). The
* "flags" argument is not used in this function.
*
* Always returns ORTE_SUCCESS: the results of the two registration calls
* are explicitly discarded.
*/
static int orte_iof_base_register(mca_base_register_flag_t flags)
{
/* check for maximum number of pending output messages */
/* the default is INT_MAX, which the help text describes as "unlimited" */
orte_iof_base.output_limit = (size_t) INT_MAX;
(void) mca_base_var_register("orte", "iof", "base", "output_limit",
"Maximum backlog of output messages [default: unlimited]",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_iof_base.output_limit);
/* check for files to be sent to stdin of procs */
/* the default is NULL, i.e. no input files */
orte_iof_base.input_files = NULL;
(void) mca_base_var_register("orte", "iof","base", "input_files",
"Comma-separated list of input files to be read and sent to stdin of procs (default: NULL)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&orte_iof_base.input_files);
return ORTE_SUCCESS;
}
/*
* Close the iof framework.
*
* Calls the finalize hook of the orte_iof module if one is set, then
* closes the framework's components. Returns whatever
* mca_base_framework_components_close() returns.
*/
static int orte_iof_base_close(void)
{
/* Close the selected component */
/* finalize is optional - only call it when the pointer has been set */
if (NULL != orte_iof.finalize) {
orte_iof.finalize();
}
return mca_base_framework_components_close(&orte_iof_base_framework, NULL);
}
/**
* Function for finding and opening either all MCA components, or the one
* that was specifically requested via a MCA parameter.
*
* Before the components are opened this function:
* - creates the directory part of orte_output_filename when one was given
* - defines the stdout (and, without XML output, stderr) write sinks,
* unless this process is a daemon
*
* Returns ORTE_ERR_OUT_OF_RESOURCE if opal_dirname() returns NULL, the
* error code of opal_os_dirpath_create() if the directory cannot be
* created, and otherwise the result of
* mca_base_framework_components_open().
*/
static int orte_iof_base_open(mca_base_open_flag_t flags)
{
int rc, xmlfd;
/* did the user request we print output to files? */
if (NULL != orte_output_filename) {
/* we will setup the files themselves as needed in the iof
* module. For now, let's see if the filename contains a
* path, or just a name
*/
char *path;
path = opal_dirname(orte_output_filename);
if (NULL == path) {
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* NOTE(review): this compares the directory part with the full name;
* confirm what opal_dirname() returns for a bare file name, since
* that decides whether this branch is taken in that case
*/
if (0 != strcmp(path, orte_output_filename)) {
/* there is a path in this name - ensure that the directory
* exists, and create it if not
*/
if (ORTE_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) {
/* path is owned by this function - release it before bailing out */
free(path);
return rc;
}
}
free(path);
}
/* daemons do not need to do this as they do not write out stdout/err */
if (!ORTE_PROC_IS_DAEMON) {
if (orte_xml_output) {
if (NULL != orte_xml_fp) {
/* user wants all xml-formatted output sent to file */
xmlfd = fileno(orte_xml_fp);
} else {
/* no xml file was given - use file descriptor 1 */
xmlfd = 1;
}
/* setup the stdout event */
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
xmlfd, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
/* don't create a stderr event - all output will go to
* the stdout channel
*/
} else {
/* setup the stdout event */
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stdout, ORTE_PROC_MY_NAME,
1, ORTE_IOF_STDOUT, orte_iof_base_write_handler);
/* setup the stderr event */
ORTE_IOF_SINK_DEFINE(&orte_iof_base.iof_write_stderr, ORTE_PROC_MY_NAME,
2, ORTE_IOF_STDERR, orte_iof_base_write_handler);
}
/* do NOT set these file descriptors to non-blocking. If we do so,
* we set the file descriptor to non-blocking for everyone that has
* that file descriptor, which includes everyone else in our shell
* pipeline chain. (See
* http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
* This causes things like "mpirun -np 1 big_app | cat" to lose
* output, because cat's stdout is then ALSO non-blocking and cat
* isn't built to deal with that case (same with almost all other
* unix text utils).
*/
}
/* Open up all available components */
return mca_base_framework_components_open(&orte_iof_base_framework, flags);
}
/* Declare the orte iof framework, passing the register, open and close
* functions of this file together with the list of static components */
MCA_BASE_FRAMEWORK_DECLARE(orte, iof, "ORTE I/O Forwarding",
orte_iof_base_register, orte_iof_base_open, orte_iof_base_close,
mca_iof_base_static_components, 0);

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC
* All rights reserved
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -109,8 +109,9 @@ int orte_iof_hnp_send_data_to_endpoint(orte_process_name_t *host,
/* send the buffer to the host - this is either a daemon or
* a tool that requested IOF
*/
if (0 > (rc = orte_rml.send_buffer_nb(host, buf, ORTE_RML_TAG_IOF_PROXY,
orte_rml_send_callback, NULL))) {
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
host, buf, ORTE_RML_TAG_IOF_PROXY,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -152,9 +152,10 @@ BEGIN_C_DECLS
opal_dss.pack(buf, (b), 1, ORTE_NAME); \
\
/* send the buffer to the HNP */ \
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, \
ORTE_RML_TAG_IOF_HNP, \
orte_rml_send_callback, NULL); \
orte_rml.send_buffer_nb(orte_mgmt_conduit, \
ORTE_PROC_MY_HNP, buf, \
ORTE_RML_TAG_IOF_HNP, \
orte_rml_send_callback, NULL); \
} while(0);
/* Initialize the selected module */

Просмотреть файл

@ -144,8 +144,9 @@ void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
"%s iof:orted:read handler sending %d bytes to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL);
orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL);
/* re-add the event */
opal_event_add(rev->ev, 0);

Просмотреть файл

@ -72,8 +72,9 @@ void orte_iof_orted_send_xonxoff(orte_iof_tag_t tag)
(ORTE_IOF_XON == tag) ? "xon" : "xoff"));
/* send the buffer to the HNP */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL))) {
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL))) {
ORTE_ERROR_LOG(rc);
}
}

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -166,8 +166,9 @@ static int tool_pull(const orte_process_name_t* src_name,
/* send the buffer to the correct HNP */
ORTE_HNP_NAME_FROM_JOB(&hnp, src_name->jobid);
orte_rml.send_buffer_nb(&hnp, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL);
orte_rml.send_buffer_nb(orte_mgmt_conduit,
&hnp, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL);
return ORTE_SUCCESS;
}
@ -215,8 +216,9 @@ static int tool_close(const orte_process_name_t* src_name,
/* send the buffer to the correct HNP */
ORTE_HNP_NAME_FROM_JOB(&hnp, src_name->jobid);
orte_rml.send_buffer_nb(&hnp, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL);
orte_rml.send_buffer_nb(orte_mgmt_conduit,
&hnp, buf, ORTE_RML_TAG_IOF_HNP,
send_cb, NULL);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -64,7 +64,7 @@
#include "orte/mca/ess/base/base.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/mca/state/state.h"
@ -159,7 +159,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
opal_dss.pack(data, &flag, 1, OPAL_INT8);
/* get wireup info for daemons per the selected routing module */
wireup = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, wireup))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
return rc;

Просмотреть файл

@ -2,7 +2,7 @@
/*
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -94,7 +94,7 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
component = (mca_oob_base_component_t*)cli->cli_component;
if (NULL != component->is_reachable) {
if (component->is_reachable(&msg->dst)) {
if (component->is_reachable(msg->routed, &msg->dst)) {
/* there is a way to reach this peer - record it
* so we don't waste this time again
*/
@ -154,7 +154,7 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
component = (mca_oob_base_component_t*)cli->cli_component;
/* is this peer reachable via this component? */
if (!component->is_reachable(&msg->dst)) {
if (!component->is_reachable(msg->routed, &msg->dst)) {
continue;
}
/* it is addressable, so attempt to send via that transport */

Просмотреть файл

@ -55,7 +55,7 @@ typedef int (*mca_oob_base_component_send_fn_t)(orte_rml_send_t *msg);
typedef char* (*mca_oob_base_component_get_addr_fn_t)(void);
typedef int (*mca_oob_base_component_set_addr_fn_t)(orte_process_name_t *peer,
char **uris);
typedef bool (*mca_oob_base_component_is_reachable_fn_t)(orte_process_name_t *peer);
typedef bool (*mca_oob_base_component_is_reachable_fn_t)(char *routed, orte_process_name_t *peer);
typedef void (*mca_oob_ping_callback_fn_t)(int status, void *cbdata);
typedef orte_rml_pathway_t* (*mca_oob_base_component_query_transports_fn_t)(void);

Просмотреть файл

@ -401,7 +401,7 @@ static void process_send(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&op->msg->dst), op->msg->tag, op->msg->seq_num);
/* do we have a route to this peer (could be direct)? */
hop = orte_routed.get_route(&op->msg->dst);
hop = orte_routed.get_route(op->msg->routed, &op->msg->dst);
/* do we know this hop? */
if (NULL == (peer = mca_oob_tcp_peer_lookup(&hop))) {
/* push this back to the component so it can try

Просмотреть файл

@ -95,7 +95,7 @@ static int component_send(orte_rml_send_t *msg);
static char* component_get_addr(void);
static int component_set_addr(orte_process_name_t *peer,
char **uris);
static bool component_is_reachable(orte_process_name_t *peer);
static bool component_is_reachable(char *rtmod, orte_process_name_t *peer);
static orte_rml_pathway_t* component_query_transports(void);
#if OPAL_ENABLE_FT_CR == 1
static int component_ft_event(int state);
@ -936,12 +936,12 @@ static int component_set_addr(orte_process_name_t *peer,
return ORTE_ERR_TAKE_NEXT_OPTION;
}
static bool component_is_reachable(orte_process_name_t *peer)
static bool component_is_reachable(char *rtmod, orte_process_name_t *peer)
{
orte_process_name_t hop;
/* if we have a route to this peer, then we can reach it */
hop = orte_routed.get_route(peer);
hop = orte_routed.get_route(rtmod, peer);
if (ORTE_JOBID_INVALID == hop.jobid ||
ORTE_VPID_INVALID == hop.vpid) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
@ -1026,7 +1026,7 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
if (!orte_finalizing) {
/* activate the proc state */
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
@ -1064,7 +1064,7 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
*/
if (!orte_finalizing && !orte_abnormal_term_ordered) {
/* if this was a lifeline, then alert */
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
if (ORTE_SUCCESS != orte_routed.route_lost(mop->snd->hdr.routed, &mop->hop)) {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
@ -1140,6 +1140,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
snd->count = mop->snd->hdr.nbytes;
snd->cbfunc.iov = NULL;
snd->cbdata = NULL;
snd->routed = strdup(mop->snd->hdr.routed);
/* activate the OOB send state */
ORTE_OOB_SEND(snd);
/* protect the data */
@ -1170,7 +1171,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(&pop->peer));
/* if this was a lifeline, then alert */
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
} else {
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
@ -1341,11 +1342,15 @@ OBJ_CLASS_INSTANCE(mca_oob_tcp_addr_t,
/* Initialize a mca_oob_tcp_peer_op_t (presumably registered as the class
* constructor - the registration is not visible here): every string
* pointer starts out NULL, which is what pop_des tests before freeing. */
static void pop_cons(mca_oob_tcp_peer_op_t *pop)
{
pop->rtmod = NULL;
pop->net = NULL;
pop->port = NULL;
}
static void pop_des(mca_oob_tcp_peer_op_t *pop)
{
if (NULL != pop->rtmod) {
free(pop->rtmod);
}
if (NULL != pop->net) {
free(pop->net);
}

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved.
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -309,7 +309,7 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
* an event in the component event base, and so it will fire async
* from us if we are in our own progress thread
*/
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, mca_oob_tcp_component_failed_to_connect);
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, NULL, mca_oob_tcp_component_failed_to_connect);
/* FIXME: post any messages in the send queue back to the OOB
* level for reassignment
*/
@ -858,7 +858,7 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
/* set the peer into the component and OOB-level peer tables to indicate
* that we know this peer and we will be handling him
*/
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, mca_oob_tcp_component_set_module);
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, NULL, mca_oob_tcp_component_set_module);
/* connected */
tcp_peer_connected(peer);
@ -889,7 +889,7 @@ static void tcp_peer_connected(mca_oob_tcp_peer_t* peer)
}
/* update the route */
orte_routed.update_route(&peer->name, &peer->name);
orte_routed.update_route(NULL, &peer->name, &peer->name);
/* initiate send of first message on queue */
if (NULL == peer->send_msg) {
@ -949,7 +949,7 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t *peer)
/* inform the component-level that we have lost a connection so
* it can decide what to do about it.
*/
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, mca_oob_tcp_component_lost_connection);
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, NULL, mca_oob_tcp_component_lost_connection);
if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
/* nothing more to do */
@ -1158,7 +1158,7 @@ bool mca_oob_tcp_peer_accept(mca_oob_tcp_peer_t* peer)
/* set the peer into the component and OOB-level peer tables to indicate
* that we know this peer and we will be handling him
*/
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, mca_oob_tcp_component_set_module);
ORTE_ACTIVATE_TCP_CMP_OP(&peer->name, NULL, mca_oob_tcp_component_set_module);
tcp_peer_connected(peer);
if (!peer->recv_ev_active) {

Просмотреть файл

@ -39,6 +39,8 @@ typedef enum {
MCA_OOB_TCP_USER
} mca_oob_tcp_msg_type_t;
#define ORTE_MAX_RTD_SIZE 31
/* header for tcp msgs */
typedef struct {
/* the originator of the message - if we are routing,
@ -60,6 +62,8 @@ typedef struct {
uint32_t seq_num;
/* number of bytes in message */
uint32_t nbytes;
/* routed module to be used */
char routed[ORTE_MAX_RTD_SIZE+1];
} mca_oob_tcp_hdr_t;
/**
* Convert the message header to host byte order

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -69,6 +69,7 @@ typedef struct {
uint16_t af_family;
char *net;
char *port;
char *rtmod;
} mca_oob_tcp_peer_op_t;
OBJ_CLASS_DECLARATION(mca_oob_tcp_peer_op_t);
@ -91,12 +92,17 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_peer_op_t);
opal_event_active(&pop->ev, OPAL_EV_WRITE, 1); \
} while(0);
#define ORTE_ACTIVATE_TCP_CMP_OP(p, cbfunc) \
#define ORTE_ACTIVATE_TCP_CMP_OP(p, r, cbfunc) \
do { \
mca_oob_tcp_peer_op_t *pop; \
char *proxy; \
pop = OBJ_NEW(mca_oob_tcp_peer_op_t); \
pop->peer.jobid = (p)->jobid; \
pop->peer.vpid = (p)->vpid; \
proxy = (r); \
if (NULL != proxy) { \
pop->rtmod = strdup(proxy); \
} \
opal_event_set(mca_oob_tcp_module.ev_base, &pop->ev, -1, \
OPAL_EV_WRITE, (cbfunc), pop); \
opal_event_set_priority(&pop->ev, ORTE_MSG_PRI); \

Просмотреть файл

@ -574,6 +574,7 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
snd->data = peer->recv_msg->data;
snd->seq_num = peer->recv_msg->hdr.seq_num;
snd->count = peer->recv_msg->hdr.nbytes;
snd->routed = strdup(peer->recv_msg->hdr.routed);
snd->cbfunc.iov = NULL;
snd->cbdata = NULL;
/* activate the OOB send state */
@ -613,6 +614,7 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
static void snd_cons(mca_oob_tcp_send_t *ptr)
{
memset(&ptr->hdr, 0, sizeof(mca_oob_tcp_hdr_t));
ptr->msg = NULL;
ptr->data = NULL;
ptr->hdr_sent = false;
@ -638,6 +640,7 @@ OBJ_CLASS_INSTANCE(mca_oob_tcp_send_t,
static void rcv_cons(mca_oob_tcp_recv_t *ptr)
{
memset(&ptr->hdr, 0, sizeof(mca_oob_tcp_hdr_t));
ptr->hdr_recvd = false;
ptr->rdptr = NULL;
ptr->rdbytes = 0;

Просмотреть файл

@ -118,6 +118,10 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
msg->hdr.type = MCA_OOB_TCP_USER; \
msg->hdr.tag = (m)->tag; \
msg->hdr.seq_num = (m)->seq_num; \
if (NULL != (m)->routed) { \
(void)strncpy(msg->hdr.routed, (m)->routed, \
ORTE_MAX_RTD_SIZE); \
} \
/* point to the actual message */ \
msg->msg = (m); \
/* set the total number of bytes to be sent */ \
@ -162,6 +166,10 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
msg->hdr.type = MCA_OOB_TCP_USER; \
msg->hdr.tag = (m)->tag; \
msg->hdr.seq_num = (m)->seq_num; \
if (NULL != (m)->routed) { \
(void)strncpy(msg->hdr.routed, (m)->routed, \
ORTE_MAX_RTD_SIZE); \
} \
/* point to the actual message */ \
msg->msg = (m); \
/* set the total number of bytes to be sent */ \
@ -204,6 +212,8 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_recv_t);
msg->hdr.dst = (m)->hdr.dst; \
msg->hdr.type = MCA_OOB_TCP_USER; \
msg->hdr.tag = (m)->hdr.tag; \
(void)strncpy(msg->hdr.routed, (m)->hdr.routed, \
ORTE_MAX_RTD_SIZE); \
/* point to the actual message */ \
msg->data = (m)->data; \
/* set the total number of bytes to be sent */ \

Просмотреть файл

@ -7,7 +7,7 @@
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -41,7 +41,7 @@ static int mca_oob_ud_component_send_nb(orte_rml_send_t *msg);
static void mca_oob_ud_component_shutdown(void);
static char* mca_oob_ud_component_get_addr(void);
static int mca_oob_ud_component_set_addr(orte_process_name_t *peer, char **uris);
static bool mca_oob_ud_component_is_reachable(orte_process_name_t *peer);
static bool mca_oob_ud_component_is_reachable(char *routed, orte_process_name_t *peer);
#if OPAL_ENABLE_FT_CR == 1
static int mca_oob_ud_component_ft_event(int state);
#endif // OPAL_ENABLE_FT_CR
@ -591,12 +591,12 @@ static int mca_oob_ud_port_alloc_buffers (mca_oob_ud_port_t *port) {
return rc;
}
static bool mca_oob_ud_component_is_reachable(orte_process_name_t *peer_name)
static bool mca_oob_ud_component_is_reachable(char *routed, orte_process_name_t *peer_name)
{
orte_process_name_t hop;
/* if we have a route to this peer, then we can reach it */
hop = orte_routed.get_route(peer_name);
hop = orte_routed.get_route(routed, peer_name);
if (ORTE_JOBID_INVALID == hop.jobid ||
ORTE_VPID_INVALID == hop.vpid) {
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);

Просмотреть файл

@ -4,7 +4,7 @@
* reserved.
* 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -129,7 +129,7 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
}
/* if we have a route to this peer, then we can reach it */
hop = orte_routed.get_route(&op->msg->dst);
hop = orte_routed.get_route(NULL, &op->msg->dst);
if (ORTE_JOBID_INVALID == hop.jobid ||
ORTE_VPID_INVALID == hop.vpid) {
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);

Просмотреть файл

@ -47,7 +47,7 @@
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rmaps/base/base.h"
@ -447,8 +447,8 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* ensure our routing plan is up-to-date */
orte_routed.update_routing_plan();
/* ensure all routing plans are up-to-date */
orte_routed.update_routing_plan(NULL);
/* If this job is being started by me, then there is nothing
* further we need to do as any user directives (e.g., to tie
@ -741,7 +741,8 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
@ -826,7 +827,8 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(&jdata->originator, answer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
@ -874,7 +876,7 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
idx = 1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
char *nodename;
char *nodename = NULL;
/* unpack its contact info */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) {
@ -1194,7 +1196,8 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
/* if a tree-launch is underway, send the cmd back */
relay = OBJ_NEW(opal_buffer_t);
opal_dss.copy_payload(relay, orte_tree_launch_cmd);
orte_rml.send_buffer_nb(sender, relay,
orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, relay,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL);
}
@ -2114,8 +2117,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* ensure our routing plan is up-to-date */
orte_routed.update_routing_plan();
/* ensure all routing plans are up-to-date */
orte_routed.update_routing_plan(NULL);
}
/* mark that the daemon job changed */

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -269,7 +269,8 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
}
/* send the response back to the sender */
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_LAUNCH_RESP,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);

Просмотреть файл

@ -14,7 +14,7 @@
* reserved.
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011 IBM Corporation. All rights reserved.
* Copyright (c) 2014-2015 Intel Corporation. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -86,6 +86,7 @@
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/base/rml_contact.h"
@ -261,6 +262,7 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
{
orte_job_t *jdata;
orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
char *rtmod;
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
/* ignore any such report - it will occur if we left the
@ -270,8 +272,8 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
return;
}
if (! WIFEXITED(daemon->exit_code) ||
! WEXITSTATUS(daemon->exit_code) == 0) { /* if abnormal exit */
if (!WIFEXITED(daemon->exit_code) ||
WEXITSTATUS(daemon->exit_code) != 0) { /* if abnormal exit */
/* if we are not the HNP, send a message to the HNP alerting it
* to the failure
*/
@ -284,7 +286,8 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
buf = OBJ_NEW(opal_buffer_t);
opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
opal_dss.pack(buf, &daemon->exit_code, 1, OPAL_INT);
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
orte_rml_send_callback, NULL);
/* note that this daemon failed */
@ -305,7 +308,8 @@ static void rsh_wait_daemon(orte_proc_t *daemon, void* cbdata)
/* remove it from the routing table to ensure num_routes
* returns the correct value
*/
orte_routed.route_lost(&daemon->name);
rtmod = orte_rml.get_routed(orte_coll_conduit);
orte_routed.route_lost(rtmod, &daemon->name);
/* report that the daemon has failed so we can exit */
ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
}
@ -782,6 +786,7 @@ static int remote_spawn(opal_buffer_t *launch)
orte_job_t *daemons;
opal_list_t coll;
orte_namelist_t *child;
char *rtmod;
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:rsh: remote spawn called",
@ -810,11 +815,12 @@ static int remote_spawn(opal_buffer_t *launch)
}
/* ensure the routing plan is updated */
orte_routed.update_routing_plan();
rtmod = orte_rml.get_routed(orte_coll_conduit);
orte_routed.update_routing_plan(rtmod);
/* get the updated routing list */
OBJ_CONSTRUCT(&coll, opal_list_t);
orte_routed.get_routing_list(&coll);
orte_routed.get_routing_list(rtmod, &coll);
/* if I have no children, just return */
if (0 == opal_list_get_size(&coll)) {
@ -904,7 +910,8 @@ cleanup:
buf = OBJ_NEW(opal_buffer_t);
opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
opal_dss.pack(buf, &rc, 1, OPAL_INT);
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
orte_rml_send_callback, NULL);
}
@ -992,6 +999,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
char *username;
int port, *portptr;
orte_namelist_t *child;
char *rtmod;
/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
@ -1158,7 +1166,8 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* get the updated routing list */
OBJ_CONSTRUCT(&coll, opal_list_t);
orte_routed.get_routing_list(&coll);
rtmod = orte_rml.get_routed(orte_coll_conduit);
orte_routed.get_routing_list(rtmod, &coll);
}
/* setup the launch */

Просмотреть файл

@ -48,6 +48,7 @@
#include "opal/class/opal_pointer_array.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
@ -101,7 +102,6 @@ typedef struct {
opal_pointer_array_t conduits; /* array to hold the open conduits */
opal_list_t posted_recvs;
opal_list_t unmatched_msgs;
orte_rml_conduit_t def_conduit_id;
#if OPAL_ENABLE_TIMING
bool timing;
#endif
@ -135,6 +135,8 @@ typedef struct {
* transfers
*/
char *data;
/* routed module to be used */
char *routed;
} orte_rml_send_t;
OBJ_CLASS_DECLARATION(orte_rml_send_t);
@ -248,31 +250,21 @@ ORTE_DECLSPEC void orte_rml_base_complete_recv_msg (orte_rml_recv_t **recv_msg);
char* orte_rml_API_get_contact_info(void);
void orte_rml_API_set_contact_info(const char *contact_info);
int orte_rml_API_ping(const char* contact_info,
int orte_rml_API_ping(orte_rml_conduit_t conduit_id,
const char* contact_info,
const struct timeval* tv);
int orte_rml_API_ping_conduit(orte_rml_conduit_t conduit_id,
const char* contact_info,
const struct timeval* tv);
int orte_rml_API_send_nb(orte_process_name_t* peer, struct iovec* msg,
int orte_rml_API_send_nb(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer, struct iovec* msg,
int count, orte_rml_tag_t tag,
orte_rml_callback_fn_t cbfunc, void* cbdata);
int orte_rml_API_send_nb_conduit(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer, struct iovec* msg,
int count, orte_rml_tag_t tag,
orte_rml_callback_fn_t cbfunc, void* cbdata);
int orte_rml_API_send_buffer_nb(orte_process_name_t* peer,
int orte_rml_API_send_buffer_nb(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata);
int orte_rml_API_send_buffer_nb_conduit(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata);
void orte_rml_API_recv_nb(orte_process_name_t* peer,
orte_rml_tag_t tag,
@ -295,6 +287,8 @@ orte_rml_conduit_t orte_rml_API_open_conduit(opal_list_t *attributes);
void orte_rml_API_close_conduit(orte_rml_conduit_t id);
char* orte_rml_API_get_routed(orte_rml_conduit_t id);
END_C_DECLS
#endif /* MCA_RML_BASE_H */

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -73,14 +74,10 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
orte_std_cntr_t cnt;
orte_vpid_t num_procs;
char *rml_uri;
orte_process_name_t name;
bool got_name;
int rc;
/* unpack the data for each entry */
num_procs = 0;
name.jobid = ORTE_JOBID_INVALID;
got_name = false;
cnt = 1;
while (ORTE_SUCCESS == (rc = opal_dss.unpack(data, &rml_uri, &cnt, OPAL_STRING))) {
@ -92,26 +89,6 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
if (NULL != rml_uri) {
/* set the contact info into the hash table */
orte_rml.set_contact_info(rml_uri);
if (!got_name) {
/* we only get an update from a single jobid - the command
* that creates these doesn't cross jobid boundaries - so
* record it here
*/
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) {
ORTE_ERROR_LOG(rc);
free(rml_uri);
return rc;
}
got_name = true;
/* if this is for a different job family, update the route to this proc */
if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
if (ORTE_SUCCESS != (rc = orte_routed.update_route(&name, &name))) {
ORTE_ERROR_LOG(rc);
free(rml_uri);
return rc;
}
}
}
free(rml_uri);
}
@ -123,14 +100,12 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
return rc;
}
/* if we are a daemon and this was info about our jobid, this update would
* include updated contact info
/* if we are a daemon, this update would include updated contact info
* for all daemons in the system - indicating that the number of daemons
* changed since we were initially launched. Thus, update the num_procs
* in our process_info struct so we can correctly route any messages
*/
if (ORTE_PROC_MY_NAME->jobid == name.jobid &&
ORTE_PROC_IS_DAEMON &&
if (ORTE_PROC_IS_DAEMON &&
orte_process_info.num_procs < num_procs) {
orte_process_info.num_procs = num_procs;
@ -139,9 +114,9 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
}
/* if we changed it, then we better update the routing
* plan so daemon collectives work correctly.
* plans so daemon collectives work correctly.
*/
orte_routed.update_routing_plan();
orte_routed.update_routing_plan(NULL);
}
return ORTE_SUCCESS;
@ -171,8 +146,8 @@ orte_rml_base_parse_uris(const char* uri,
}
if (NULL != uris) {
/* parse the remainder of the string into an array of uris */
*uris = opal_argv_split(ptr, ';');
/* parse the remainder of the string into an array of uris */
*uris = opal_argv_split(ptr, ';');
}
free(cinfo);
return ORTE_SUCCESS;

Просмотреть файл

@ -42,18 +42,16 @@ orte_rml_base_API_t orte_rml = {
.get_contact_info = orte_rml_API_get_contact_info,
.set_contact_info = orte_rml_API_set_contact_info,
.ping = orte_rml_API_ping,
.ping_conduit = orte_rml_API_ping_conduit,
.send_nb = orte_rml_API_send_nb,
.send_buffer_nb = orte_rml_API_send_buffer_nb,
.send_nb_conduit = orte_rml_API_send_nb_conduit,
.send_buffer_nb_conduit = orte_rml_API_send_buffer_nb_conduit,
.recv_nb = orte_rml_API_recv_nb,
.recv_buffer_nb = orte_rml_API_recv_buffer_nb,
.recv_cancel = orte_rml_API_recv_cancel,
.purge = orte_rml_API_purge,
.query_transports = orte_rml_API_query_transports,
.open_conduit = orte_rml_API_open_conduit,
.close_conduit = orte_rml_API_close_conduit
.close_conduit = orte_rml_API_close_conduit,
.get_routed = orte_rml_API_get_routed
};
orte_rml_base_t orte_rml_base = {{{0}}};
@ -164,7 +162,6 @@ int orte_rml_base_select(void)
orte_rml_component_t *component=NULL;
orte_rml_base_active_t *newmodule, *mod;
bool inserted;
opal_list_t conduit_attr;
if (selected) {
return ORTE_SUCCESS;
@ -208,29 +205,9 @@ int orte_rml_base_select(void)
}
}
/* Open the default oob conduit */
opal_output_verbose(10, orte_rml_base_framework.framework_output,
"%s Opening the default conduit - oob component",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
OBJ_CONSTRUCT(&conduit_attr, opal_list_t);
orte_set_attribute(&conduit_attr, ORTE_RML_INCLUDE_COMP_ATTRIB, ORTE_ATTR_LOCAL,"oob",OPAL_STRING);
orte_rml_base.def_conduit_id = orte_rml_API_open_conduit(&conduit_attr);
OPAL_LIST_DESTRUCT(&conduit_attr);
if (0 <= orte_rml_base.def_conduit_id) {
opal_output_verbose(10, orte_rml_base_framework.framework_output,
"%s Default conduit (oob) opened with conduit id = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rml_base.def_conduit_id);
} else {
opal_output_verbose(1, orte_rml_base_framework.framework_output,
"%s Default conduit (oob) could not be opened",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
return ORTE_SUCCESS;
}
void orte_rml_send_callback(int status, orte_process_name_t *peer,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
@ -266,10 +243,17 @@ static void send_cons(orte_rml_send_t *ptr)
ptr->buffer = NULL;
ptr->data = NULL;
ptr->seq_num = 0xFFFFFFFF;
ptr->routed = NULL;
}
/* Destructor for orte_rml_send_t: releases the string held in the
 * "routed" field of the send object. */
static void send_des(orte_rml_send_t *ptr)
{
    /* free() accepts NULL, so an unset field needs no special case */
    free(ptr->routed);
}
OBJ_CLASS_INSTANCE(orte_rml_send_t,
opal_list_item_t,
send_cons, NULL);
send_cons, send_des);
static void send_req_cons(orte_rml_send_request_t *ptr)

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -117,7 +118,8 @@ orte_rml_base_recv(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
buf = OBJ_NEW(opal_buffer_t);
if (0 > (rc = orte_rml.send_buffer_nb(sender, buf, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, buf, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);

Просмотреть файл

@ -162,9 +162,9 @@ void orte_rml_API_set_contact_info(const char *contact_info)
}
/** Ping process for connectivity check */
int orte_rml_API_ping_conduit(orte_rml_conduit_t conduit_id,
const char* contact_info,
const struct timeval* tv)
int orte_rml_API_ping(orte_rml_conduit_t conduit_id,
const char* contact_info,
const struct timeval* tv)
{
int rc = ORTE_ERR_UNREACH;
orte_rml_base_module_t *mod;
@ -185,27 +185,20 @@ int orte_rml_API_ping_conduit(orte_rml_conduit_t conduit_id,
}
/** Ping process for connectivity check */
int orte_rml_API_ping(const char* contact_info,
const struct timeval* tv)
{
return orte_rml_API_ping_conduit(orte_rml_base.def_conduit_id, contact_info, tv);
}
/** Send non-blocking iovec message through a specific conduit*/
int orte_rml_API_send_nb_conduit(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct iovec* msg,
int count,
orte_rml_tag_t tag,
orte_rml_callback_fn_t cbfunc,
void* cbdata)
int orte_rml_API_send_nb(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct iovec* msg,
int count,
orte_rml_tag_t tag,
orte_rml_callback_fn_t cbfunc,
void* cbdata)
{
int rc = ORTE_ERR_UNREACH;
orte_rml_base_module_t *mod;
opal_output_verbose(10,orte_rml_base_framework.framework_output,
"%s rml:base:send_nb_conduit() to peer %s through conduit %d",
"%s rml:base:send_nb() to peer %s through conduit %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer),conduit_id);
/* get the module */
@ -220,20 +213,21 @@ int orte_rml_API_send_nb_conduit(orte_rml_conduit_t conduit_id,
}
/** Send non-blocking buffer message */
int orte_rml_API_send_buffer_nb_conduit(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata)
int orte_rml_API_send_buffer_nb(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata)
{
int rc = ORTE_ERR_UNREACH;
orte_rml_base_module_t *mod;
opal_output_verbose(10,orte_rml_base_framework.framework_output,
"%s rml:base:send_buffer_nb_conduit() to peer %s through conduit %d",
"%s rml:base:send_buffer_nb() to peer %s through conduit %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer),conduit_id);
/* get the module */
if (NULL == (mod = (orte_rml_base_module_t*)opal_pointer_array_get_item(&orte_rml_base.conduits, conduit_id))) {
return rc;
@ -245,28 +239,6 @@ int orte_rml_API_send_buffer_nb_conduit(orte_rml_conduit_t conduit_id,
return rc;
}
/** Send non-blocking iovec message through a specific conduit*/
int orte_rml_API_send_nb(orte_process_name_t* peer,
struct iovec* msg,
int count,
orte_rml_tag_t tag,
orte_rml_callback_fn_t cbfunc,
void* cbdata)
{
return orte_rml_API_send_nb_conduit(orte_rml_base.def_conduit_id, peer, msg, count, tag, cbfunc, cbdata);
}
/** Send non-blocking buffer message */
int orte_rml_API_send_buffer_nb(orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata)
{
return orte_rml_API_send_buffer_nb_conduit(orte_rml_base.def_conduit_id, peer, buffer, tag, cbfunc, cbdata);
}
/** post a receive for an IOV message - this is done
* strictly in the base, and so it does not go to a module */
void orte_rml_API_recv_nb(orte_process_name_t* peer,
@ -397,3 +369,15 @@ int orte_rml_API_query_transports(opal_list_t *providers)
return ORTE_SUCCESS;
}
/* Return the "routed" string recorded on the module stored under the
 * given conduit id, or NULL when no module is found for that id.
 * The pointer returned is the module's own field, not a copy. */
char* orte_rml_API_get_routed(orte_rml_conduit_t id)
{
    orte_rml_base_module_t *conduit;

    /* look up the module behind this conduit id */
    conduit = (orte_rml_base_module_t*)opal_pointer_array_get_item(&orte_rml_base.conduits, id);
    if (NULL == conduit) {
        /* nothing is registered under this id */
        return NULL;
    }

    return conduit->routed;
}

Просмотреть файл

@ -41,6 +41,7 @@ typedef struct {
opal_list_t queued_routing_messages;
opal_event_t *timer_event;
struct timeval timeout;
char *routed; // name of routed module to be used
} orte_rml_oob_module_t;
ORTE_MODULE_DECLSPEC extern orte_rml_component_t mca_rml_oob_component;

Просмотреть файл

@ -52,6 +52,7 @@
#include "orte/mca/oob/oob.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/routed/routed.h"
#include "rml_oob.h"
static int rml_oob_open(void);
@ -136,6 +137,7 @@ static orte_rml_base_module_t* make_module(void)
/* initialize its internal storage */
OBJ_CONSTRUCT(&mod->queued_routing_messages, opal_list_t);
mod->timer_event = NULL;
mod->routed = NULL;
/* return the result */
return (orte_rml_base_module_t*)mod;
@ -143,46 +145,113 @@ static orte_rml_base_module_t* make_module(void)
static orte_rml_base_module_t* open_conduit(opal_list_t *attributes)
{
char *comp_attrib = NULL;
char *comp_attrib;
char **comps;
int i;
orte_attribute_t *attr;
orte_rml_base_module_t *md;
opal_output_verbose(20,orte_rml_base_framework.framework_output,
"%s - Entering rml_oob_open_conduit()",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
/* someone may require this specific component, so look for "oob" */
comp_attrib = NULL;
if (orte_get_attribute(attributes, ORTE_RML_INCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) &&
NULL != comp_attrib) {
/* they specified specific components - could be multiple */
comps = opal_argv_split(comp_attrib, ',');
for (i=0; NULL != comps[i]; i++) {
if (0 == strcmp(comps[i], "oob")) {
if (0 == strcasecmp(comps[i], "oob")) {
/* we are a candidate */
opal_argv_free(comps);
return make_module();
md = make_module();
free(comp_attrib);
comp_attrib = NULL;
orte_get_attribute(attributes, ORTE_RML_ROUTED_ATTRIB, (void**)&comp_attrib, OPAL_STRING);
/* the routed system understands a NULL request, so no need to check
* return status/value here */
md->routed = orte_routed.assign_module(comp_attrib);
if (NULL != comp_attrib) {
free(comp_attrib);
}
return md;
}
}
/* we are not a candidate */
opal_argv_free(comps);
free(comp_attrib);
return NULL;
} else if (orte_get_attribute(attributes, ORTE_RML_EXCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) &&
NULL != comp_attrib) {
}
comp_attrib = NULL;
if (orte_get_attribute(attributes, ORTE_RML_EXCLUDE_COMP_ATTRIB, (void**)&comp_attrib, OPAL_STRING) &&
NULL != comp_attrib) {
/* see if we are on the list */
comps = opal_argv_split(comp_attrib, ',');
for (i=0; NULL != comps[i]; i++) {
if (0 == strcmp(comps[i], "oob")) {
if (0 == strcasecmp(comps[i], "oob")) {
/* we cannot be a candidate */
opal_argv_free(comps);
free(comp_attrib);
return NULL;
}
}
}
/* Alternatively, check the attributes to see if we qualify - we only handle
* "routed", "Ethernet", and "TCP" */
OPAL_LIST_FOREACH(attr, attributes, orte_attribute_t) {
* "Ethernet" and "TCP" */
comp_attrib = NULL;
if (orte_get_attribute(attributes, ORTE_RML_TRANSPORT_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
NULL != comp_attrib) {
comps = opal_argv_split(comp_attrib, ',');
for (i=0; NULL != comps[i]; i++) {
if (0 == strcasecmp(comps[i], "Ethernet")) {
/* we are a candidate */
opal_argv_free(comps);
md = make_module();
free(comp_attrib);
comp_attrib = NULL;
orte_get_attribute(attributes, ORTE_RML_ROUTED_ATTRIB, (void**)&comp_attrib, OPAL_STRING);
/* the routed system understands a NULL request, so no need to check
* return status/value here */
md->routed = orte_routed.assign_module(comp_attrib);
if (NULL != comp_attrib) {
free(comp_attrib);
}
return md;
}
}
/* we are not a candidate */
opal_argv_free(comps);
free(comp_attrib);
return NULL;
}
comp_attrib = NULL;
if (orte_get_attribute(attributes, ORTE_RML_PROTOCOL_TYPE, (void**)&comp_attrib, OPAL_STRING) &&
NULL != comp_attrib) {
comps = opal_argv_split(comp_attrib, ',');
for (i=0; NULL != comps[i]; i++) {
if (0 == strcasecmp(comps[i], "TCP")) {
/* we are a candidate */
opal_argv_free(comps);
md = make_module();
free(comp_attrib);
comp_attrib = NULL;
orte_get_attribute(attributes, ORTE_RML_ROUTED_ATTRIB, (void**)&comp_attrib, OPAL_STRING);
/* the routed system understands a NULL request, so no need to check
* return status/value here */
md->routed = orte_routed.assign_module(comp_attrib);
if (NULL != comp_attrib) {
free(comp_attrib);
}
return md;
}
}
/* we are not a candidate */
opal_argv_free(comps);
free(comp_attrib);
return NULL;
}
@ -207,6 +276,12 @@ static void close_conduit(orte_rml_base_module_t *md)
/* cleanup the list of messages */
OBJ_DESTRUCT(&mod->queued_routing_messages);
/* clear the storage */
if (NULL != mod->routed) {
free(mod->routed);
mod->routed = NULL;
}
/* the rml_base_stub takes care of clearing the base receive
* and freeing the module */
return;

Просмотреть файл

@ -197,6 +197,8 @@ static void send_msg(int fd, short args, void *cbdata)
snd->cbfunc.buffer = req->send.cbfunc.buffer;
}
snd->cbdata = req->send.cbdata;
snd->routed = strdup(req->send.routed);
/* activate the OOB send state */
ORTE_OOB_SEND(snd);
@ -239,6 +241,7 @@ int orte_rml_oob_send_nb(struct orte_rml_base_module_t *mod,
req->send.tag = tag;
req->send.cbfunc.iov = cbfunc;
req->send.cbdata = cbdata;
req->send.routed = strdup(mod->routed);
/* setup the event for the send callback */
opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req);
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);
@ -281,6 +284,7 @@ int orte_rml_oob_send_buffer_nb(struct orte_rml_base_module_t *mod,
req->send.tag = tag;
req->send.cbfunc.buffer = cbfunc;
req->send.cbdata = cbdata;
req->send.routed = strdup(mod->routed);
/* setup the event for the send callback */
opal_event_set(orte_event_base, &req->ev, -1, OPAL_EV_WRITE, send_msg, req);
opal_event_set_priority(&req->ev, ORTE_MSG_PRI);

Просмотреть файл

@ -47,6 +47,7 @@
#include "orte/mca/mca.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml_types.h"
@ -253,6 +254,8 @@ typedef void (*orte_rml_module_purge_fn_t)(orte_process_name_t *peer);
typedef struct orte_rml_base_module_t {
/* pointer to the parent component for this module */
struct orte_rml_component_t *component;
/* the routed module to be used */
char *routed;
/** Ping process for connectivity check */
orte_rml_module_ping_fn_t ping;
@ -289,6 +292,9 @@ typedef void (*orte_rml_API_close_conduit_fn_t)(orte_rml_conduit_t id);
*/
typedef int (*orte_rml_API_query_transports_fn_t)(opal_list_t *transports);
/* query the routed module for a given conduit
 *
 * Takes the id of a conduit and returns a char* describing the routed
 * module associated with it (presumably the module's name - confirm
 * against the implementation). Ownership of the returned string is not
 * specified here; NOTE(review): verify whether callers may free it.
 */
typedef char* (*orte_rml_API_query_routed_fn_t)(orte_rml_conduit_t id);
/**
* Get a "contact info" string for the local process
*
@ -344,27 +350,8 @@ typedef void (*orte_rml_API_set_contact_info_fn_t)(const char *contact_info);
* from the local process
* @retval ORTE_ERROR An unspecified error occurred during the update
*/
typedef int (*orte_rml_API_ping_conduit_fn_t)(orte_rml_conduit_t conduit_id,
const char* contact_info,
const struct timeval* tv);
/**
* "Ping" another process to determine availability using the default conduit_id
*
* Ping another process to determine if it is available. This
* function only verifies that the process is alive and will allow a
* connection to the local process. It does *not* qualify as
* establishing communication with the remote process, as required by
* the note for set_contact_info().
*
* @param[in] contact_info The contact info string for the remote process
* @param[in] tv Timeout after which the ping should be failed
*
* @retval ORTE_SUCCESS The process is available and will allow connections
* from the local process
* @retval ORTE_ERROR An unspecified error occurred during the update
*/
typedef int (*orte_rml_API_ping_fn_t)(const char* contact_info,
typedef int (*orte_rml_API_ping_fn_t)(orte_rml_conduit_t conduit_id,
const char* contact_info,
const struct timeval* tv);
@ -391,7 +378,8 @@ typedef int (*orte_rml_API_ping_fn_t)(const char* contact_info,
* receiving process is not available
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_rml_API_send_nb_fn_t)(orte_process_name_t* peer,
typedef int (*orte_rml_API_send_nb_fn_t)(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct iovec* msg,
int count,
orte_rml_tag_t tag,
@ -421,73 +409,13 @@ typedef int (*orte_rml_API_send_nb_fn_t)(orte_process_name_t* peer,
* receiving process is not available
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_rml_API_send_buffer_nb_fn_t)(orte_process_name_t* peer,
typedef int (*orte_rml_API_send_buffer_nb_fn_t)(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata);
/**
* Send an iovec non-blocking message
*
* Send an array of iovecs to the specified peer. The call
* will return immediately, although the iovecs may not be modified
* until the completion callback is triggered. The iovecs *may* be
* passed to another call to send_nb before the completion callback is
* triggered. The callback being triggered does not give any
* indication of remote completion.
*
* @param[in] peer Name of receiving process
* @param[in] msg Pointer to an array of iovecs to be sent
* @param[in] count Number of iovecs in array
* @param[in] tag User defined tag for matching send/recv
* @param[in] cbfunc Callback function on message completion
* @param[in] cbdata User data to provide during completion callback
*
* @retval ORTE_SUCCESS The message was successfully started
* @retval ORTE_ERR_BAD_PARAM One of the parameters was invalid
* @retval ORTE_ERR_ADDRESSEE_UNKNOWN Contact information for the
* receiving process is not available
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_rml_API_send_nb_conduit_fn_t)(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct iovec* msg,
int count,
orte_rml_tag_t tag,
orte_rml_callback_fn_t cbfunc,
void* cbdata);
/**
* Send a buffer non-blocking message
*
* Send a buffer to the specified peer. The call
* will return immediately, although the buffer may not be modified
* until the completion callback is triggered. The buffer *may* be
* passed to another call to send_nb before the completion callback is
* triggered. The callback being triggered does not give any
* indication of remote completion.
*
* @param[in] peer Name of receiving process
* @param[in] buffer Pointer to buffer to be sent
* @param[in] tag User defined tag for matching send/recv
* @param[in] cbfunc Callback function on message completion
* @param[in] cbdata User data to provide during completion callback
*
* @retval ORTE_SUCCESS The message was successfully started
* @retval ORTE_ERR_BAD_PARAM One of the parameters was invalid
* @retval ORTE_ERR_ADDRESSEE_UNKNOWN Contact information for the
* receiving process is not available
* @retval ORTE_ERROR An unspecified error occurred
*/
typedef int (*orte_rml_API_send_buffer_nb_conduit_fn_t)(orte_rml_conduit_t conduit_id,
orte_process_name_t* peer,
struct opal_buffer_t* buffer,
orte_rml_tag_t tag,
orte_rml_buffer_callback_fn_t cbfunc,
void* cbdata);
/**
* Purge the RML/OOB of contact info and pending messages
* to/from a specified process. Used when a process aborts
@ -557,20 +485,11 @@ typedef struct {
/** Ping process for connectivity check */
orte_rml_API_ping_fn_t ping;
/** Ping process for connectivity check */
orte_rml_API_ping_conduit_fn_t ping_conduit;
/** Send non-blocking iovec message */
orte_rml_API_send_nb_fn_t send_nb;
orte_rml_API_send_nb_fn_t send_nb;
/** Send non-blocking buffer message */
orte_rml_API_send_buffer_nb_fn_t send_buffer_nb;
/** Send non-blocking iovec message */
orte_rml_API_send_nb_conduit_fn_t send_nb_conduit;
/** Send non-blocking buffer message */
orte_rml_API_send_buffer_nb_conduit_fn_t send_buffer_nb_conduit;
orte_rml_API_send_buffer_nb_fn_t send_buffer_nb;
/** Receive non-blocking iovec message */
orte_rml_API_recv_nb_fn_t recv_nb;
@ -587,6 +506,8 @@ typedef struct {
/** Query information of transport in system */
orte_rml_API_query_transports_fn_t query_transports;
/* get the routed module for a given conduit */
orte_rml_API_query_routed_fn_t get_routed;
} orte_rml_base_API_t;
/** Interface for RML communication */

Просмотреть файл

@ -1,7 +1,7 @@
/*
* Copyright (c) 2007-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -31,10 +31,41 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_routed_base_framework;
/* select a component */
ORTE_DECLSPEC int orte_routed_base_select(void);
/* One entry on the list of active routed modules
 * (orte_routed_base.actives). Each entry pairs a routed component
 * with the module it provided. */
typedef struct {
/* list linkage - allows the entry to sit on an opal_list_t */
opal_list_item_t super;
/* priority of this module */
int pri;
/* component that provided the module */
orte_routed_component_t *component;
/* the module's function table */
orte_routed_module_t *module;
} orte_routed_base_active_t;
OBJ_CLASS_DECLARATION(orte_routed_base_active_t);
ORTE_DECLSPEC extern bool orte_routed_base_wait_sync;
ORTE_DECLSPEC extern opal_pointer_array_t orte_routed_jobfams;
/* Framework-level state for the routed base */
typedef struct {
/* list of orte_routed_base_active_t entries, one per active module */
opal_list_t actives;
} orte_routed_base_t;
/* the single instance of the routed base state */
ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base;
/* base API wrapper functions */
ORTE_DECLSPEC char* orte_routed_base_assign_module(char *modules);
ORTE_DECLSPEC int orte_routed_base_delete_route(char *module, orte_process_name_t *proc);
ORTE_DECLSPEC int orte_routed_base_update_route(char *module, orte_process_name_t *target,
orte_process_name_t *route);
ORTE_DECLSPEC orte_process_name_t orte_routed_base_get_route(char *module,
orte_process_name_t *target);
ORTE_DECLSPEC int orte_routed_base_init_routes(char *module,
orte_jobid_t job, opal_buffer_t *ndat);
ORTE_DECLSPEC int orte_routed_base_route_lost(char *module,
const orte_process_name_t *route);
ORTE_DECLSPEC bool orte_routed_base_route_is_defined(char *module,
const orte_process_name_t *target);
ORTE_DECLSPEC void orte_routed_base_update_routing_plan(char *module);
ORTE_DECLSPEC void orte_routed_base_get_routing_list(char *module, opal_list_t *coll);
ORTE_DECLSPEC int orte_routed_base_set_lifeline(char *module, orte_process_name_t *proc);
ORTE_DECLSPEC size_t orte_routed_base_num_routes(char *module);
ORTE_DECLSPEC int orte_routed_base_ft_event(char *module, int state);
/* specialized support functions */
ORTE_DECLSPEC void orte_routed_base_xcast_routing(opal_list_t *coll,
opal_list_t *my_children);

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -25,6 +25,7 @@
#include "orte/types.h"
#include "opal/dss/dss.h"
#include "opal/util/argv.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
@ -36,6 +37,237 @@
#include "orte/mca/routed/base/base.h"
/**
 * Pick the routed module to use from a caller-supplied preference list.
 *
 * @param modules  Comma-delimited, prioritized list of desired routed
 *                 component names, or NULL to accept the module at the
 *                 top of the list of active modules.
 *
 * @return The component name of the chosen module. The string is the
 *         component's own name field, not a copy. NULL is returned when
 *         none of the requested names matches an active module, when
 *         there are no active modules at all, or when the request
 *         could not be split into any names.
 */
char* orte_routed_base_assign_module(char *modules)
{
    orte_routed_base_active_t *active;
    char **desired;
    int i;

    /* the incoming param contains a comma-delimited, prioritized
     * list of desired routing modules. If it is NULL, then we
     * simply return the module at the top of our list */
    if (NULL == modules) {
        /* iterate rather than blindly taking the first item: on an
         * empty list the "first" item is the list sentinel, which is
         * not a valid active entry and must not be dereferenced */
        OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
            return active->component->base_version.mca_component_name;
        }
        return NULL;
    }

    /* otherwise, cycle thru the provided list of desired modules
     * and pick the highest priority one that matches */
    desired = opal_argv_split(modules, ',');
    if (NULL == desired) {
        /* nothing usable in the request (e.g. it produced no names),
         * so nothing can match */
        return NULL;
    }
    for (i=0; NULL != desired[i]; i++) {
        OPAL_LIST_FOREACH(active, &orte_routed_base.actives, orte_routed_base_active_t) {
            if (0 == strcasecmp(desired[i], active->component->base_version.mca_component_name)) {
                /* the returned name lives in the component, so the
                 * split array can be released before returning */
                opal_argv_free(desired);
                return active->component->base_version.mca_component_name;
            }
        }
    }
    opal_argv_free(desired);

    /* get here if none match */
    return NULL;
}
/*
 * Ask the routed module(s) to drop their route to a process.
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * proc   - process whose route is to be deleted
 *
 * Returns ORTE_SUCCESS, or the first non-success code reported by a
 * module; the remaining modules are not visited in that case.
 */
int orte_routed_base_delete_route(char *module, orte_process_name_t *proc)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* module doesn't provide this entry point */
        if (NULL == entry->module->delete_route) {
            continue;
        }
        status = entry->module->delete_route(proc);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }
    return ORTE_SUCCESS;
}
/*
 * Pass a route update to the routed module(s).
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * target - process the route leads to
 * route  - process through which the target is to be reached
 *
 * Returns ORTE_SUCCESS, or the first non-success code reported by a
 * module; the remaining modules are not visited in that case.
 */
int orte_routed_base_update_route(char *module, orte_process_name_t *target,
                                  orte_process_name_t *route)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* module doesn't provide this entry point */
        if (NULL == entry->module->update_route) {
            continue;
        }
        status = entry->module->update_route(target, route);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }
    return ORTE_SUCCESS;
}
/*
 * Look up the next hop towards a target using the named routed module.
 *
 * module - component name of the module to consult; NULL means
 *          direct routing, in which case the target itself is returned
 * target - process to be reached
 *
 * Returns the name produced by the first active module whose
 * component name equals "module". If no module matches, or the
 * matching module has no get_route entry point, the invalid name
 * is returned.
 */
orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t *target)
{
    orte_routed_base_active_t *entry;

    if (NULL == module) {
        /* no module given => go direct */
        return *target;
    }

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        if (0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* the first matching module settles the answer */
        if (NULL == entry->module->get_route) {
            break;
        }
        return entry->module->get_route(target);
    }

    return *ORTE_NAME_INVALID;
}
/*
 * Have the routed module(s) initialize their routes for a job.
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * job    - job whose routes are being initialized
 * ndat   - buffer handed unchanged to each module's init_routes
 *
 * Returns ORTE_SUCCESS, or the first non-success code reported by a
 * module; the remaining modules are not visited in that case.
 */
int orte_routed_base_init_routes(char *module,
                                 orte_jobid_t job, opal_buffer_t *ndat)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* module doesn't provide this entry point */
        if (NULL == entry->module->init_routes) {
            continue;
        }
        status = entry->module->init_routes(job, ndat);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }
    return ORTE_SUCCESS;
}
/*
 * Tell the routed module(s) that a route has been lost.
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * route  - the process to which the route was lost
 *
 * Returns ORTE_SUCCESS, or the first non-success code reported by a
 * module; the remaining modules are not visited in that case.
 */
int orte_routed_base_route_lost(char *module, const orte_process_name_t *route)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* module doesn't provide this entry point */
        if (NULL == entry->module->route_lost) {
            continue;
        }
        status = entry->module->route_lost(route);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }
    return ORTE_SUCCESS;
}
/*
 * Report whether the named routed module has a route to a target.
 *
 * module - component name of the module to consult; NULL means
 *          direct routing, for which the answer is always true
 * target - process being asked about
 *
 * Returns the answer of the first active module whose component name
 * equals "module". Returns false when no such module exists or when
 * it lacks a route_is_defined entry point.
 */
bool orte_routed_base_route_is_defined(char *module, const orte_process_name_t *target)
{
    orte_routed_base_active_t *entry;

    if (NULL == module) {
        /* no module given => direct, which is always defined */
        return true;
    }

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        if (0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* found the module - it either answers or the route is
         * considered undefined */
        if (NULL == entry->module->route_is_defined) {
            return false;
        }
        return entry->module->route_is_defined(target);
    }

    /* the specified module isn't active */
    return false;
}
/*
 * Have the routed module(s) update their routing plan.
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 *
 * Modules without an update_routing_plan entry point are skipped.
 */
void orte_routed_base_update_routing_plan(char *module)
{
    orte_routed_base_active_t *entry;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        if (NULL == entry->module->update_routing_plan) {
            continue;
        }
        entry->module->update_routing_plan();
    }
}
/*
 * Collect the routing list from the routed module(s).
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * coll   - list handed to each addressed module's get_routing_list
 *
 * Modules without a get_routing_list entry point are skipped.
 */
void orte_routed_base_get_routing_list(char *module, opal_list_t *coll)
{
    orte_routed_base_active_t *entry;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        if (NULL == entry->module->get_routing_list) {
            continue;
        }
        entry->module->get_routing_list(coll);
    }
}
/*
 * Set the lifeline process in the routed module(s).
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * proc   - process to be used as the lifeline
 *
 * Returns ORTE_SUCCESS, or the first non-success code reported by a
 * module; the remaining modules are not visited in that case.
 */
int orte_routed_base_set_lifeline(char *module, orte_process_name_t *proc)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* module doesn't provide this entry point */
        if (NULL == entry->module->set_lifeline) {
            continue;
        }
        status = entry->module->set_lifeline(proc);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }
    return ORTE_SUCCESS;
}
/*
 * Count the routes known to the routed module(s).
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 *
 * Returns the sum of num_routes() over all addressed modules that
 * provide that entry point (0 if there are none).
 */
size_t orte_routed_base_num_routes(char *module)
{
    orte_routed_base_active_t *entry;
    size_t total = 0;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        if (NULL == entry->module->num_routes) {
            continue;
        }
        total += entry->module->num_routes();
    }
    return total;
}
/*
 * Deliver a fault-tolerance event to the routed module(s).
 *
 * module - component name of the module to address, or NULL to
 *          address every active module
 * state  - event state value handed to each module's ft_event
 *
 * Returns ORTE_SUCCESS, or the first non-success code reported by a
 * module; the remaining modules are not visited in that case.
 */
int orte_routed_base_ft_event(char *module, int state)
{
    orte_routed_base_active_t *entry;
    int status;

    OPAL_LIST_FOREACH(entry, &orte_routed_base.actives, orte_routed_base_active_t) {
        /* a specific module was requested and this isn't it */
        if (NULL != module &&
            0 != strcmp(module, entry->component->base_version.mca_component_name)) {
            continue;
        }
        /* module doesn't provide this entry point */
        if (NULL == entry->module->ft_event) {
            continue;
        }
        status = entry->module->ft_event(state);
        if (ORTE_SUCCESS != status) {
            return status;
        }
    }
    return ORTE_SUCCESS;
}
void orte_routed_base_xcast_routing(opal_list_t *coll, opal_list_t *my_children)
{
orte_routed_tree_t *child;

Просмотреть файл

@ -10,6 +10,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,12 +23,10 @@
#include "orte/mca/mca.h"
#include "opal/class/opal_bitmap.h"
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_component_repository.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/orte_globals.h"
@ -40,6 +39,114 @@
* component's public mca_base_component_t struct. */
#include "orte/mca/routed/base/static-components.h"
/* framework-level state (list of active modules), zero-initialized */
orte_routed_base_t orte_routed_base = {{{0}}};

/* public routed API: every entry point is served by the matching
 * base wrapper function */
orte_routed_API_t orte_routed = {
    /* module selection */
    .assign_module       = orte_routed_base_assign_module,

    /* route table maintenance */
    .init_routes         = orte_routed_base_init_routes,
    .update_route        = orte_routed_base_update_route,
    .delete_route        = orte_routed_base_delete_route,
    .route_lost          = orte_routed_base_route_lost,

    /* route queries */
    .get_route           = orte_routed_base_get_route,
    .route_is_defined    = orte_routed_base_route_is_defined,
    .num_routes          = orte_routed_base_num_routes,

    /* routing plan */
    .update_routing_plan = orte_routed_base_update_routing_plan,
    .get_routing_list    = orte_routed_base_get_routing_list,

    /* lifeline and fault tolerance */
    .set_lifeline        = orte_routed_base_set_lifeline,
    .ft_event            = orte_routed_base_ft_event
};
/*
 * Framework open hook: construct the (initially empty) list of
 * active modules, then open all available routed components.
 *
 * Returns the result of mca_base_framework_components_open().
 */
static int orte_routed_base_open(mca_base_open_flag_t flags)
{
    int rc;

    /* the actives list must exist before any component is opened */
    OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t);

    rc = mca_base_framework_components_open(&orte_routed_base_framework, flags);
    return rc;
}
/*
 * Framework close hook: finalize and release every active module,
 * tear down the actives list, then close the routed components.
 *
 * Returns the result of mca_base_framework_components_close().
 */
static int orte_routed_base_close(void)
{
    orte_routed_base_active_t *active;

    while (NULL != (active = (orte_routed_base_active_t *)opal_list_remove_first(&orte_routed_base.actives))) {
        /* entry points in a module's function table may be left
         * unset, so check before calling rather than jumping
         * through a NULL pointer during shutdown */
        if (NULL != active->module && NULL != active->module->finalize) {
            active->module->finalize();
        }
        OBJ_RELEASE(active);
    }
    OPAL_LIST_DESTRUCT(&orte_routed_base.actives);

    return mca_base_framework_components_close(&orte_routed_base_framework, NULL);
}
/* Declare the "routed" framework of the orte project, supplying its
 * description string, the open/close hooks defined above and the list
 * of statically built components.
 * NOTE(review): the NULL argument is presumably the (unused) register
 * hook and the trailing 0 the framework flags - confirm against the
 * macro definition. */
MCA_BASE_FRAMEWORK_DECLARE(orte, routed, "ORTE Message Routing Subsystem", NULL,
orte_routed_base_open, orte_routed_base_close,
mca_routed_base_static_components, 0);
static bool selected = false;
/*
 * Query every opened routed component for a module, initialize the
 * modules that are offered, and keep them on orte_routed_base.actives
 * ordered from highest to lowest priority.
 *
 * The work is done only on the first call; later calls return
 * immediately. Components whose query fails, that offer no module,
 * or whose module fails to initialize are skipped.
 *
 * Always returns ORTE_SUCCESS.
 */
int orte_routed_base_select(void)
{
    mca_base_component_list_item_t *cli=NULL;
    orte_routed_component_t *component=NULL;
    orte_routed_base_active_t *newmodule, *mod;
    mca_base_module_t *module;
    bool inserted;
    int pri;

    if (selected) {
        /* already done */
        return ORTE_SUCCESS;
    }
    selected = true;

    OPAL_LIST_FOREACH(cli, &orte_routed_base_framework.framework_components, mca_base_component_list_item_t ) {
        component = (orte_routed_component_t*) cli->cli_component;

        opal_output_verbose(10, orte_routed_base_framework.framework_output,
                            "orte_routed_base_select: Initializing %s component %s",
                            component->base_version.mca_type_name,
                            component->base_version.mca_component_name);

        /* start from a known value so a query that succeeds without
         * providing a module can be detected */
        module = NULL;
        if (ORTE_SUCCESS != component->base_version.mca_query_component(&module, &pri)) {
            continue;
        }
        if (NULL == module) {
            /* component declined to provide a module */
            continue;
        }

        /* add to the list of available components */
        newmodule = OBJ_NEW(orte_routed_base_active_t);
        newmodule->pri = pri;
        newmodule->component = component;
        newmodule->module = (orte_routed_module_t*)module;

        /* a module may leave initialize unset; only a module whose
         * initialize runs and fails is rejected */
        if (NULL != newmodule->module->initialize &&
            ORTE_SUCCESS != newmodule->module->initialize()) {
            OBJ_RELEASE(newmodule);
            continue;
        }

        /* maintain priority order */
        inserted = false;
        OPAL_LIST_FOREACH(mod, &orte_routed_base.actives, orte_routed_base_active_t) {
            if (newmodule->pri > mod->pri) {
                /* goes in front of the first entry it outranks */
                opal_list_insert_pos(&orte_routed_base.actives,
                                     (opal_list_item_t*)mod, &newmodule->super);
                inserted = true;
                break;
            }
        }
        if (!inserted) {
            /* must be lowest priority - add to end */
            opal_list_append(&orte_routed_base.actives, &newmodule->super);
        }
    }

    if (4 < opal_output_get_verbosity(orte_routed_base_framework.framework_output)) {
        opal_output(0, "%s: Final routed priorities", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* show the prioritized list */
        OPAL_LIST_FOREACH(mod, &orte_routed_base.actives, orte_routed_base_active_t) {
            opal_output(0, "\tComponent: %s Priority: %d", mod->component->base_version.mca_component_name, mod->pri);
        }
    }

    return ORTE_SUCCESS;
}
static void construct(orte_routed_tree_t *rt)
{
rt->vpid = ORTE_VPID_INVALID;
@ -49,154 +156,10 @@ static void destruct(orte_routed_tree_t *rt)
{
OBJ_DESTRUCT(&rt->relatives);
}
OBJ_CLASS_INSTANCE(orte_routed_tree_t, opal_list_item_t,
OBJ_CLASS_INSTANCE(orte_routed_tree_t,
opal_list_item_t,
construct, destruct);
/* Constructor for orte_routed_jobfam_t: no HNP uri yet and an
 * invalid route */
static void jfamconst(orte_routed_jobfam_t *ptr)
{
    ptr->hnp_uri = NULL;
    ptr->route.vpid = ORTE_VPID_INVALID;
    ptr->route.jobid = ORTE_JOBID_INVALID;
}
/* Destructor for orte_routed_jobfam_t: give back the HNP uri string,
 * if one was stored */
static void jfamdest(orte_routed_jobfam_t *ptr)
{
    /* free() accepts NULL, so no separate check is required */
    free(ptr->hnp_uri);
}
OBJ_CLASS_INSTANCE(orte_routed_jobfam_t, opal_object_t,
jfamconst, jfamdest);
orte_routed_module_t orte_routed = {0};
bool orte_routed_base_wait_sync = false;
opal_pointer_array_t orte_routed_jobfams = {{0}};
/*
 * Framework open hook: reset the wait_sync flag, set up the array of
 * known job families seeded with an entry for our own HNP, then open
 * all available routed components.
 *
 * Returns the result of mca_base_framework_components_open().
 */
static int orte_routed_base_open(mca_base_open_flag_t flags)
{
    orte_routed_jobfam_t *mine;

    orte_routed_base_wait_sync = false;

    /* storage for the uris of remote HNPs */
    OBJ_CONSTRUCT(&orte_routed_jobfams, opal_pointer_array_t);
    opal_pointer_array_init(&orte_routed_jobfams, 8, INT_MAX, 8);

    /* first entry: our own job family, routed via our HNP */
    mine = OBJ_NEW(orte_routed_jobfam_t);
    mine->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
    mine->route.vpid = ORTE_PROC_MY_HNP->vpid;
    mine->route.jobid = ORTE_PROC_MY_HNP->jobid;
    if (NULL != orte_process_info.my_hnp_uri) {
        mine->hnp_uri = strdup(orte_process_info.my_hnp_uri);
    }
    opal_pointer_array_add(&orte_routed_jobfams, mine);

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_routed_base_framework, flags);
}
/*
 * Framework close hook: finalize the selected module (if it has a
 * finalize entry point), release every stored job family entry,
 * destruct the job family array, then close the routed components.
 *
 * Returns the result of mca_base_framework_components_close().
 */
static int orte_routed_base_close(void)
{
    orte_routed_jobfam_t *entry;
    int idx;

    /* finalize the selected component */
    if (NULL != orte_routed.finalize) {
        orte_routed.finalize();
    }

    for (idx = 0; idx < orte_routed_jobfams.size; idx++) {
        entry = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, idx);
        if (NULL == entry) {
            /* unused slot */
            continue;
        }
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&orte_routed_jobfams);

    return mca_base_framework_components_close(&orte_routed_base_framework, NULL);
}
MCA_BASE_FRAMEWORK_DECLARE(orte, routed, "ORTE Message Routing Subsystem", NULL,
orte_routed_base_open, orte_routed_base_close,
mca_routed_base_static_components, 0);
/*
 * Select the best routed component, copy its module into the global
 * orte_routed, and initialize it.
 *
 * Returns OPAL_SUCCESS on success, ORTE_ERR_NOT_FOUND if no component
 * could be selected, or the error code returned by the module's
 * initialize function.
 */
int orte_routed_base_select(void)
{
    orte_routed_module_t *best_module = NULL;
    orte_routed_component_t *best_component = NULL;
    int rc;

    /* pick the winner - this only fails if no component was selected */
    if (OPAL_SUCCESS != mca_base_select("routed", orte_routed_base_framework.framework_output,
                                        &orte_routed_base_framework.framework_components,
                                        (mca_base_module_t **) &best_module,
                                        (mca_base_component_t **) &best_component, NULL)) {
        return ORTE_ERR_NOT_FOUND;
    }

    /* the global module struct is a copy of the winner's */
    orte_routed = *best_module;

    opal_output_verbose(10, orte_routed_base_framework.framework_output,
                        "orte_routed_base_select: initializing selected component %s",
                        best_component->base_version.mca_component_name);

    rc = orte_routed.initialize();
    if (ORTE_SUCCESS != rc) {
        return rc;
    }
    return OPAL_SUCCESS;
}
/*
 * Record the HNP contact uris found in a buffer.
 *
 * Strings are unpacked from buf until an unpack fails. Each string is
 * parsed into a process name; strings that fail to parse are logged
 * and skipped. If the job family of the name is already present in
 * orte_routed_jobfams, its stored uri is replaced. Otherwise a new
 * entry is created and added to the array.
 *
 * buf - buffer holding zero or more packed uri strings
 */
void orte_routed_base_update_hnps(opal_buffer_t *buf)
{
    int n, i, rc;
    char *uri;
    orte_process_name_t name;
    orte_routed_jobfam_t *jfam;
    uint16_t jobfamily;
    bool found;

    n = 1;
    while (ORTE_SUCCESS == opal_dss.unpack(buf, &uri, &n, OPAL_STRING)) {
        /* extract the name */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(uri, &name, NULL))) {
            ORTE_ERROR_LOG(rc);
            free(uri);
            n = 1;
            continue;
        }
        jobfamily = ORTE_JOB_FAMILY(name.jobid);

        /* see if we already have this connection - the search uses its
         * own index so the unpack count in n is left alone */
        found = false;
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jobfamily == jfam->job_family) {
                /* update uri */
                if (NULL != jfam->hnp_uri) {
                    free(jfam->hnp_uri);
                }
                jfam->hnp_uri = strdup(uri);
                OPAL_OUTPUT_VERBOSE((10, orte_routed_base_framework.framework_output,
                                     "%s adding remote HNP %s\n\t%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&name), uri));
                found = true;
                break;
            }
        }

        if (!found) {
            /* nope - create it */
            jfam = OBJ_NEW(orte_routed_jobfam_t);
            jfam->job_family = jobfamily;
            jfam->route.jobid = name.jobid;
            jfam->route.vpid = name.vpid;
            jfam->hnp_uri = strdup(uri);
            /* store the new entry; without this it would be leaked
             * and never found by later lookups */
            opal_pointer_array_add(&orte_routed_jobfams, jfam);
        }

        free(uri);
        n = 1;
    }
}
OBJ_CLASS_INSTANCE(orte_routed_base_active_t,
opal_list_item_t,
NULL, NULL);

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -51,7 +51,6 @@ static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);
@ -60,21 +59,20 @@ static int binomial_ft_event(int state);
#endif
orte_routed_module_t orte_routed_binomial_module = {
init,
finalize,
delete_route,
update_route,
get_route,
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_plan,
get_routing_list,
get_wireup_info,
num_routes,
.initialize = init,
.finalize = finalize,
.delete_route = delete_route,
.update_route = update_route,
.get_route = get_route,
.init_routes = init_routes,
.route_lost = route_lost,
.route_is_defined = route_is_defined,
.set_lifeline = set_lifeline,
.update_routing_plan = update_routing_plan,
.get_routing_list = get_routing_list,
.num_routes = num_routes,
#if OPAL_ENABLE_FT_CR == 1
binomial_ft_event
.ft_event = binomial_ft_event
#else
NULL
#endif
@ -117,10 +115,6 @@ static int finalize(void)
static int delete_route(orte_process_name_t *proc)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
return ORTE_ERR_BAD_PARAM;
@ -140,40 +134,6 @@ static int delete_route(orte_process_name_t *proc)
ORTE_NAME_PRINT(proc)));
/* if this is from a different job family, then I need to
* look it up appropriately
*/
if (ORTE_JOB_FAMILY(proc->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, then I will automatically route
* anything to this job family via my HNP - so I have nothing
* in my routing table and thus have nothing to do
* here, just return
*/
if (ORTE_PROC_IS_DAEMON) {
return ORTE_SUCCESS;
}
/* see if this job family is present */
jfamily = ORTE_JOB_FAMILY(proc->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: deleting route to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(proc->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
return ORTE_SUCCESS;
}
}
/* not present - nothing to do */
return ORTE_SUCCESS;
}
/* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing
* to do here. The routes will be redefined when we update
* the routing tree
@ -185,10 +145,6 @@ static int delete_route(orte_process_name_t *proc)
static int update_route(orte_process_name_t *target,
orte_process_name_t *route)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
return ORTE_ERR_BAD_PARAM;
@ -218,58 +174,6 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* if this is from a different job family, then I need to
* track how to send messages to it
*/
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, then I will automatically route
* anything to this job family via my HNP - so nothing to do
* here, just return
*/
if (ORTE_PROC_IS_DAEMON) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_binomial_update: diff job family routing job %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(target->jobid),
ORTE_NAME_PRINT(route)));
/* see if this target is already present */
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: updating route to %s via %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid),
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
return ORTE_SUCCESS;
}
}
/* not there, so add the route FOR THE JOB FAMILY*/
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: adding route to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
jfam = OBJ_NEW(orte_routed_jobfam_t);
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
return ORTE_SUCCESS;
}
@ -279,9 +183,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
orte_process_name_t *ret, daemon;
opal_list_item_t *item;
orte_routed_tree_t *child;
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (!orte_routing_is_enabled) {
ret = target;
@ -325,38 +226,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/****** HNP AND DAEMONS ONLY ******/
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
if (ORTE_PROC_IS_DAEMON) {
ret = ORTE_PROC_MY_HNP;
goto found;
}
/* if I am the HNP or a tool, then I stored a route to
* this job family, so look it up
*/
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: route to %s found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
ret = &jfam->route;
goto found;
}
}
/* not found - so we have no route */
ret = ORTE_NAME_INVALID;
goto found;
}
/* THIS CAME FROM OUR OWN JOB FAMILY... */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (!hnp_direct || orte_static_ports) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
@ -426,16 +295,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
return *ret;
}
/*
 * Receive callback for the ack message: clears the bool that cbdata
 * points to. The other arguments are not used.
 */
static void recv_ack(int status, orte_process_name_t* sender,
                     opal_buffer_t *buffer,
                     orte_rml_tag_t tag, void *cbdata)
{
    /* flag as complete */
    *(bool*)cbdata = false;
}
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
/* the binomial module routes all proc communications through
@ -448,7 +307,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
* for each proc
*/
int rc;
bool ack_waiting;
/* if I am a tool, then I stand alone - there is nothing to do */
if (ORTE_PROC_IS_TOOL) {
@ -548,74 +406,7 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
}
{ /* MUST BE A PROC */
/* if ndat != NULL, then this is being invoked by the proc to
* init a route to a specified process that is outside of our
* job family. We want that route to go through our HNP, routed via
* our local daemon - however, we cannot know for
* certain that the HNP already knows how to talk to the specified
* procs. For example, in OMPI's publish/subscribe procedures, the
* DPM framework looks for an mca param containing the global ompi-server's
* uri. This info will come here so the proc can setup a route to
* the server - we need to pass the routing info to our HNP
*/
if (NULL != ndat) {
int rc;
opal_buffer_t *xfer;
orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD;
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_binomial: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
* start by sending the contact info to the HNP for update
*/
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_binomial_init_routes: diff job family %s - sending update to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job),
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
/* prep the buffer for transmission to the HNP */
xfer = OBJ_NEW(opal_buffer_t);
opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD);
opal_dss.copy_payload(xfer, ndat);
/* save any new connections for use in subsequent connect_accept calls */
orte_routed_base_update_hnps(ndat);
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
ORTE_RML_TAG_RML_INFO_UPDATE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(xfer);
return rc;
}
/* wait right here until the HNP acks the update to ensure that
* any subsequent messaging can succeed
*/
ack_waiting = true;
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_UPDATE_ROUTE_ACK,
ORTE_RML_NON_PERSISTENT,
recv_ack, &ack_waiting);
ORTE_WAIT_FOR_COMPLETION(ack_waiting);
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_binomial_init_routes: ack recvd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* our get_route function automatically routes all messages for
* other job families via the HNP, so nothing more to do here
*/
}
return ORTE_SUCCESS;
}
/* if ndat=NULL, then we are being called during orte_init. In this
/* if ndat=NULL, then we are being called during orte_init. In this
* case, we need to setup a few critical pieces of info
*/
@ -682,35 +473,12 @@ static int route_lost(const orte_process_name_t *route)
{
opal_list_item_t *item;
orte_routed_tree_t *child;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
int i;
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(route)));
/* if the route is to a different job family and we are the HNP, look it up */
if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(route->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(route->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
break;
}
}
}
/* if we lose the connection to the lifeline and we are NOT already,
* in finalize, tell the OOB to abort.
* NOTE: we cannot call abort from here as the OOB needs to first
@ -754,34 +522,6 @@ static int route_lost(const orte_process_name_t *route)
static bool route_is_defined(const orte_process_name_t *target)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
/* if the route is to a different job family and we are the HNP, look it up */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
if (ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: route to %s is defined",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
return true;
}
}
return false;
}
/* if we are not the HNP, then the answer is always true as
* we send it via the HNP
*/
return true;
}
/* find out what daemon hosts this proc */
if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
return false;
@ -940,42 +680,6 @@ static void get_routing_list(opal_list_t *coll)
orte_routed_base_xcast_routing(coll, &my_children);
}
/*
 * Fill 'buf' with the wireup (contact) information appropriate to the
 * role of the calling process.
 *
 *   HNP:          nothing is packed when static ports are in use;
 *                 otherwise the contact info for our own jobid is
 *                 collected via orte_rml_base_get_contact_info() and
 *                 that call's status is returned (and logged on error).
 *   application:  the hnp_uri string of every non-NULL entry in
 *                 orte_routed_jobfams is packed.
 *   anything else: nothing is packed.
 *
 * Returns ORTE_SUCCESS in every case except an HNP-side failure of
 * orte_rml_base_get_contact_info().
 */
static int get_wireup_info(opal_buffer_t *buf)
{
    if (ORTE_PROC_IS_HNP) {
        int status;

        /* with static ports there is no comm info to share */
        if (orte_static_ports) {
            return ORTE_SUCCESS;
        }

        status = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
        }
        return status;
    }

    /* an application reaches this point during connect_accept: hand
     * back the stored URIs of any other HNPs we know about
     */
    if (ORTE_PROC_IS_APP) {
        int idx;

        for (idx = 0; idx < orte_routed_jobfams.size; idx++) {
            orte_routed_jobfam_t *entry =
                (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, idx);

            if (NULL == entry) {
                continue;   /* empty slot */
            }
            /* NOTE: the pack status is intentionally left unchecked,
             * exactly as before */
            opal_dss.pack(buf, &(entry->hnp_uri), 1, OPAL_STRING);
        }
    }

    return ORTE_SUCCESS;
}
static size_t num_routes(void)
{
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,

Просмотреть файл

@ -4,6 +4,7 @@
* reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -29,7 +30,7 @@ orte_routed_component_t mca_routed_binomial_component = {
information about the component itself */
.base_version = {
ORTE_ROUTED_BASE_VERSION_2_0_0,
ORTE_ROUTED_BASE_VERSION_3_0_0,
.mca_component_name = "binomial",
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -50,7 +50,6 @@ static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);
@ -59,21 +58,20 @@ static int debruijn_ft_event(int state);
#endif
orte_routed_module_t orte_routed_debruijn_module = {
init,
finalize,
delete_route,
update_route,
get_route,
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_plan,
get_routing_list,
get_wireup_info,
num_routes,
.initialize = init,
.finalize = finalize,
.delete_route = delete_route,
.update_route = update_route,
.get_route = get_route,
.init_routes = init_routes,
.route_lost = route_lost,
.route_is_defined = route_is_defined,
.set_lifeline = set_lifeline,
.update_routing_plan = update_routing_plan,
.get_routing_list = get_routing_list,
.num_routes = num_routes,
#if OPAL_ENABLE_FT_CR == 1
debruijn_ft_event
.ft_event = debruijn_ft_event
#else
NULL
#endif
@ -116,10 +114,6 @@ static int finalize(void)
static int delete_route(orte_process_name_t *proc)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
return ORTE_ERR_BAD_PARAM;
@ -138,41 +132,6 @@ static int delete_route(orte_process_name_t *proc)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* if this is from a different job family, then I need to
* look it up appropriately
*/
if (ORTE_JOB_FAMILY(proc->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, then I will automatically route
* anything to this job family via my HNP - so I have nothing
* in my routing table and thus have nothing to do
* here, just return
*/
if (ORTE_PROC_IS_DAEMON) {
return ORTE_SUCCESS;
}
/* see if this job family is present */
jfamily = ORTE_JOB_FAMILY(proc->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_debruijn: deleting route to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(proc->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
return ORTE_SUCCESS;
}
}
/* not present - nothing to do */
return ORTE_SUCCESS;
}
/* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing
* to do here. The routes will be redefined when we update
* the routing tree
@ -184,10 +143,6 @@ static int delete_route(orte_process_name_t *proc)
static int update_route(orte_process_name_t *target,
orte_process_name_t *route)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
return ORTE_ERR_BAD_PARAM;
@ -217,56 +172,6 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* if this is from a different job family, then I need to
* track how to send messages to it
*/
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, then I will automatically route
* anything to this job family via my HNP - so nothing to do
* here, just return
*/
if (ORTE_PROC_IS_DAEMON) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_debruijn_update: diff job family routing job %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(target->jobid),
ORTE_NAME_PRINT(route)));
/* see if this target is already present */
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_debruijn: updating route to %s via %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid),
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
return ORTE_SUCCESS;
}
}
/* not there, so add the route FOR THE JOB FAMILY*/
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_debruijn: adding route to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
jfam = OBJ_NEW(orte_routed_jobfam_t);
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
return ORTE_SUCCESS;
}
@ -295,10 +200,7 @@ static inline unsigned int debruijn_next_hop (int target)
static orte_process_name_t get_route(orte_process_name_t *target)
{
orte_routed_jobfam_t *jfam;
orte_process_name_t ret;
uint16_t jfamily;
int i;
/* initialize */
@ -337,35 +239,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
/****** HNP AND DAEMONS ONLY ******/
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
if (ORTE_PROC_IS_DAEMON) {
ret = *ORTE_PROC_MY_HNP;
break;
}
/* if I am the HNP or a tool, then I stored a route to
* this job family, so look it up
*/
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i = 0 ; i < orte_routed_jobfams.size ; ++i) {
if (NULL ==
(jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
ret = jfam->route;
break;
}
}
/* not found - so we have no route */
break;
}
/* THIS CAME FROM OUR OWN JOB FAMILY... */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
if (!hnp_direct || orte_static_ports) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
@ -410,16 +283,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
return ret;
}
/*
 * RML receive callback for the route-update acknowledgement.
 *
 * 'cbdata' must point at the bool flag the poster of the receive is
 * waiting on; the flag is cleared to signal completion.  The status,
 * sender, buffer and tag arguments are not examined.
 */
static void recv_ack(int status, orte_process_name_t* sender,
                     opal_buffer_t *buffer,
                     orte_rml_tag_t tag, void *cbdata)
{
    /* flag as complete */
    *(bool*)cbdata = false;
}
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
/* the debruijn module routes all proc communications through
@ -531,73 +394,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
}
{ /* MUST BE A PROC */
/* if ndat != NULL, then this is being invoked by the proc to
* init a route to a specified process that is outside of our
* job family. We want that route to go through our HNP, routed via
* our local daemon - however, we cannot know for
* certain that the HNP already knows how to talk to the specified
* procs. For example, in OMPI's publish/subscribe procedures, the
* DPM framework looks for an mca param containing the global ompi-server's
* uri. This info will come here so the proc can setup a route to
* the server - we need to pass the routing info to our HNP
*/
if (NULL != ndat) {
int rc;
opal_buffer_t *xfer;
orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD;
bool ack_waiting;
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_debruijn: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
* start by sending the contact info to the HNP for update
*/
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_debruijn_init_routes: diff job family - sending update to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
/* prep the buffer for transmission to the HNP */
xfer = OBJ_NEW(opal_buffer_t);
opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD);
opal_dss.copy_payload(xfer, ndat);
/* save any new connections for use in subsequent connect_accept calls */
orte_routed_base_update_hnps(ndat);
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
ORTE_RML_TAG_RML_INFO_UPDATE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(xfer);
return rc;
}
/* wait right here until the HNP acks the update to ensure that
* any subsequent messaging can succeed
*/
ack_waiting = true;
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_UPDATE_ROUTE_ACK,
ORTE_RML_NON_PERSISTENT,
recv_ack, &ack_waiting);
ORTE_WAIT_FOR_COMPLETION(ack_waiting);
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_debruijn_init_routes: ack recvd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* our get_route function automatically routes all messages for
* other job families via the HNP, so nothing more to do here
*/
}
return ORTE_SUCCESS;
}
/* if ndat=NULL, then we are being called during orte_init. In this
* case, we need to setup a few critical pieces of info
*/
@ -656,35 +452,12 @@ static int route_lost(const orte_process_name_t *route)
{
opal_list_item_t *item;
orte_routed_tree_t *child;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
int i;
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(route)));
/* if the route is to a different job family and we are the HNP, look it up */
if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(route->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_debruijn: route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(route->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
break;
}
}
}
/* if we lose the connection to the lifeline and we are NOT already,
* in finalize, tell the OOB to abort.
* NOTE: we cannot call abort from here as the OOB needs to first
@ -723,34 +496,6 @@ static int route_lost(const orte_process_name_t *route)
static bool route_is_defined(const orte_process_name_t *target)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
/* if the route is to a different job family and we are the HNP, look it up */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
if (ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_debruijn: route to %s is defined",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
return true;
}
}
return false;
}
/* if we are not the HNP, then the answer is always true as
* we send it via the HNP
*/
return true;
}
/* find out what daemon hosts this proc */
if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
return false;
@ -854,42 +599,6 @@ static void get_routing_list(opal_list_t *coll)
orte_routed_base_xcast_routing(coll, &my_children);
}
/*
 * Fill 'buf' with the wireup (contact) information appropriate to the
 * role of the calling process.
 *
 *   HNP:          nothing is packed when static ports are in use;
 *                 otherwise the contact info for our own jobid is
 *                 collected via orte_rml_base_get_contact_info() and
 *                 that call's status is returned (and logged on error).
 *   application:  the hnp_uri string of every non-NULL entry in
 *                 orte_routed_jobfams is packed.
 *   anything else: nothing is packed.
 *
 * Returns ORTE_SUCCESS in every case except an HNP-side failure of
 * orte_rml_base_get_contact_info().
 */
static int get_wireup_info(opal_buffer_t *buf)
{
    if (ORTE_PROC_IS_HNP) {
        int status;

        /* with static ports there is no comm info to share */
        if (orte_static_ports) {
            return ORTE_SUCCESS;
        }

        status = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
        }
        return status;
    }

    /* an application reaches this point during connect_accept: hand
     * back the stored URIs of any other HNPs we know about
     */
    if (ORTE_PROC_IS_APP) {
        int idx;

        for (idx = 0; idx < orte_routed_jobfams.size; idx++) {
            orte_routed_jobfam_t *entry =
                (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, idx);

            if (NULL == entry) {
                continue;   /* empty slot */
            }
            /* NOTE: the pack status is intentionally left unchecked,
             * exactly as before */
            opal_dss.pack(buf, &(entry->hnp_uri), 1, OPAL_STRING);
        }
    }

    return ORTE_SUCCESS;
}
static size_t num_routes(void)
{
return opal_list_get_size(&my_children);

Просмотреть файл

@ -4,6 +4,7 @@
* All rights reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -29,7 +30,7 @@ orte_routed_component_t mca_routed_debruijn_component = {
information about the component itself */
.base_version = {
ORTE_ROUTED_BASE_VERSION_2_0_0,
ORTE_ROUTED_BASE_VERSION_3_0_0,
.mca_component_name = "debruijn",
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,

Просмотреть файл

@ -4,7 +4,7 @@
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -42,7 +42,6 @@ static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);
@ -51,21 +50,20 @@ static int direct_ft_event(int state);
#endif
orte_routed_module_t orte_routed_direct_module = {
init,
finalize,
delete_route,
update_route,
get_route,
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_plan,
get_routing_list,
get_wireup_info,
num_routes,
.initialize = init,
.finalize = finalize,
.delete_route = delete_route,
.update_route = update_route,
.get_route = get_route,
.init_routes = init_routes,
.route_lost = route_lost,
.route_is_defined = route_is_defined,
.set_lifeline = set_lifeline,
.update_routing_plan = update_routing_plan,
.get_routing_list = get_routing_list,
.num_routes = num_routes,
#if OPAL_ENABLE_FT_CR == 1
direct_ft_event
.ft_event = direct_ft_event
#else
NULL
#endif
@ -117,9 +115,6 @@ static int update_route(orte_process_name_t *target,
static orte_process_name_t get_route(orte_process_name_t *target)
{
orte_process_name_t *ret, daemon;
orte_routed_jobfam_t *jfam;
int i;
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
@ -160,37 +155,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/****** HNP AND DAEMONS ONLY ******/
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
if (ORTE_PROC_IS_DAEMON) {
ret = ORTE_PROC_MY_HNP;
goto found;
}
/* if I am the HNP, then I stored a route to
* this job family, so look it up
*/
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_direct: route to %s found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
ret = &jfam->route;
goto found;
}
}
/* not found - so we have no route */
ret = ORTE_NAME_INVALID;
goto found;
}
/* THIS CAME FROM OUR OWN JOB FAMILY... */
if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routing direct to the HNP",
@ -226,16 +190,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/*
 * RML receive callback for the route-update acknowledgement.
 *
 * 'cbdata' must point at the bool flag the poster of the receive is
 * waiting on; the flag is cleared to signal completion.  The status,
 * sender, buffer and tag arguments are not examined.
 */
static void recv_ack(int status, orte_process_name_t* sender,
                     opal_buffer_t *buffer,
                     orte_rml_tag_t tag, void *cbdata)
{
    /* flag as complete */
    *(bool*)cbdata = false;
}
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
int rc;
@ -323,112 +277,31 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
}
/*** MUST BE A PROC ***/
if (NULL == ndat) {
/* if we were direct launched, there is nothing we need to do. If we
* were launched by mpirun, then we need to set the HNP and daemon info */
if (NULL != orte_process_info.my_hnp_uri) {
/* extract the hnp name and store it */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
ORTE_PROC_MY_HNP, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* we don't set the HNP's contact info as we don't need it - we
* only contact our local daemon, which might be the HNP (in which
* case it will have also been passed as our daemon uri) */
}
if (NULL != orte_process_info.my_daemon_uri) {
/* extract the daemon's name so we can update the routing table */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
ORTE_PROC_MY_DAEMON, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
orte_rml.set_contact_info(orte_process_info.my_daemon_uri);
/* my daemon is my lifeline */
lifeline = ORTE_PROC_MY_DAEMON;
}
return ORTE_SUCCESS;
}
/* if ndat != NULL, then this is being invoked by the proc to
* init a route to a specified process that is outside of our
* job family. We want that route to go through our HNP, routed via
* our local daemon - however, we cannot know for
* certain that the HNP already knows how to talk to the specified
* procs. For example, in OMPI's publish/subscribe procedures, the
* DPM framework looks for an mca param containing the global ompi-server's
* uri. This info will come here so the proc can setup a route to
* the server - we need to pass the routing info to our HNP.
*
* Obviously, if we were direct launched, we won't have an HNP, in
* which case we just update our own contact info and go direct
*/
if (NULL == orte_process_info.my_hnp_uri) {
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_direct: init routes w/non-NULL data and direct launched",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
/* if we were direct launched, there is nothing we need to do. If we
* were launched by mpirun, then we need to set the HNP and daemon info */
if (NULL != orte_process_info.my_hnp_uri) {
/* extract the hnp name and store it */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
ORTE_PROC_MY_HNP, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
opal_buffer_t *xfer;
orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD;
bool ack_waiting;
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_direct: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
* start by sending the contact info to the HNP for update
*/
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_direct_init_routes: diff job family - sending update to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
/* prep the buffer for transmission to the HNP */
xfer = OBJ_NEW(opal_buffer_t);
opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD);
opal_dss.copy_payload(xfer, ndat);
/* save any new connections for use in subsequent connect_accept calls */
orte_routed_base_update_hnps(ndat);
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
ORTE_RML_TAG_RML_INFO_UPDATE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(xfer);
return rc;
}
/* wait right here until the HNP acks the update to ensure that
* any subsequent messaging can succeed
*/
ack_waiting = true;
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_UPDATE_ROUTE_ACK,
ORTE_RML_NON_PERSISTENT,
recv_ack, &ack_waiting);
ORTE_WAIT_FOR_COMPLETION(ack_waiting);
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_direct_init_routes: ack recvd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* our get_route function automatically routes all messages for
* other job families via the HNP, so nothing more to do here
*/
}
/* we don't set the HNP's contact info as we don't need it - we
* only contact our local daemon, which might be the HNP (in which
* case it will have also been passed as our daemon uri) */
}
if (NULL != orte_process_info.my_daemon_uri) {
/* extract the daemon's name so we can update the routing table */
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
ORTE_PROC_MY_DAEMON, NULL))) {
ORTE_ERROR_LOG(rc);
return rc;
}
orte_rml.set_contact_info(orte_process_info.my_daemon_uri);
/* my daemon is my lifeline */
lifeline = ORTE_PROC_MY_DAEMON;
}
return ORTE_SUCCESS;
}
@ -436,35 +309,12 @@ static int route_lost(const orte_process_name_t *route)
{
opal_list_item_t *item;
orte_routed_tree_t *child;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
int i;
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(route)));
/* if the route is to a different job family and we are the HNP, look it up */
if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(route->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_direct: route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(route->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
break;
}
}
}
/* if we lose the connection to the lifeline and we are NOT already,
* in finalize, tell the OOB to abort.
* NOTE: we cannot call abort from here as the OOB needs to first
@ -573,19 +423,6 @@ static void get_routing_list(opal_list_t *coll)
orte_routed_base_xcast_routing(coll, &my_children);
}
/*
 * Fill 'buf' with wireup (contact) information.
 *
 * Only the HNP contributes anything: it collects the contact info for
 * its own jobid via orte_rml_base_get_contact_info() and returns that
 * call's status, logging it on failure.  Every other process type
 * leaves 'buf' untouched and returns ORTE_SUCCESS.
 */
static int get_wireup_info(opal_buffer_t *buf)
{
    int status;

    /* nothing to contribute unless we are the HNP */
    if (!(ORTE_PROC_IS_HNP)) {
        return ORTE_SUCCESS;
    }

    status = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
    return status;
}
static size_t num_routes(void)
{
if (!ORTE_PROC_IS_HNP) {

Просмотреть файл

@ -4,7 +4,7 @@
* All rights reserved.
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -33,7 +33,7 @@ orte_routed_component_t mca_routed_direct_component = {
information about the component itself */
.base_version = {
ORTE_ROUTED_BASE_VERSION_2_0_0,
ORTE_ROUTED_BASE_VERSION_3_0_0,
.mca_component_name = "direct",
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,

Просмотреть файл

@ -6,7 +6,7 @@
* reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -52,7 +52,6 @@ static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);
@ -61,21 +60,20 @@ static int radix_ft_event(int state);
#endif
orte_routed_module_t orte_routed_radix_module = {
init,
finalize,
delete_route,
update_route,
get_route,
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_plan,
get_routing_list,
get_wireup_info,
num_routes,
.initialize = init,
.finalize = finalize,
.delete_route = delete_route,
.update_route = update_route,
.get_route = get_route,
.init_routes = init_routes,
.route_lost = route_lost,
.route_is_defined = route_is_defined,
.set_lifeline = set_lifeline,
.update_routing_plan = update_routing_plan,
.get_routing_list = get_routing_list,
.num_routes = num_routes,
#if OPAL_ENABLE_FT_CR == 1
radix_ft_event
.ft_event = radix_ft_event
#else
NULL
#endif
@ -118,10 +116,6 @@ static int finalize(void)
static int delete_route(orte_process_name_t *proc)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (proc->jobid == ORTE_JOBID_INVALID ||
proc->vpid == ORTE_VPID_INVALID) {
return ORTE_ERR_BAD_PARAM;
@ -141,40 +135,6 @@ static int delete_route(orte_process_name_t *proc)
ORTE_NAME_PRINT(proc)));
/* if this is from a different job family, then I need to
* look it up appropriately
*/
if (ORTE_JOB_FAMILY(proc->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, then I will automatically route
* anything to this job family via my HNP - so I have nothing
* in my routing table and thus have nothing to do
* here, just return
*/
if (ORTE_PROC_IS_DAEMON) {
return ORTE_SUCCESS;
}
/* see if this job family is present */
jfamily = ORTE_JOB_FAMILY(proc->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: deleting route to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(proc->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
return ORTE_SUCCESS;
}
}
/* not present - nothing to do */
return ORTE_SUCCESS;
}
/* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing
* to do here. The routes will be redefined when we update
* the routing tree
@ -186,10 +146,6 @@ static int delete_route(orte_process_name_t *proc)
static int update_route(orte_process_name_t *target,
orte_process_name_t *route)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (target->jobid == ORTE_JOBID_INVALID ||
target->vpid == ORTE_VPID_INVALID) {
return ORTE_ERR_BAD_PARAM;
@ -219,56 +175,6 @@ static int update_route(orte_process_name_t *target,
return ORTE_SUCCESS;
}
/* if this is from a different job family, then I need to
* track how to send messages to it
*/
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, then I will automatically route
* anything to this job family via my HNP - so nothing to do
* here, just return
*/
if (ORTE_PROC_IS_DAEMON) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_radix_update: diff job family routing job %s --> %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(target->jobid),
ORTE_NAME_PRINT(route)));
/* see if this target is already present */
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_radix: updating route to %s via %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid),
ORTE_NAME_PRINT(route)));
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
return ORTE_SUCCESS;
}
}
/* not there, so add the route FOR THE JOB FAMILY*/
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_radix: adding route to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
jfam = OBJ_NEW(orte_routed_jobfam_t);
jfam->job_family = jfamily;
jfam->route.jobid = route->jobid;
jfam->route.vpid = route->vpid;
opal_pointer_array_add(&orte_routed_jobfams, jfam);
return ORTE_SUCCESS;
}
return ORTE_SUCCESS;
}
@ -278,9 +184,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
orte_process_name_t *ret, daemon;
opal_list_item_t *item;
orte_routed_tree_t *child;
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
if (!orte_routing_is_enabled) {
ret = target;
@ -324,39 +227,6 @@ static orte_process_name_t get_route(orte_process_name_t *target)
}
/****** HNP AND DAEMONS ONLY ******/
/* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
/* if I am a daemon, route this via the HNP */
if (ORTE_PROC_IS_DAEMON) {
ret = ORTE_PROC_MY_HNP;
goto found;
}
/* if I am the HNP or a tool, then I stored a route to
* this job family, so look it up
*/
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_binomial: route to %s found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
ret = &jfam->route;
goto found;
}
}
/* not found - so we have no route */
ret = ORTE_NAME_INVALID;
goto found;
}
/* THIS CAME FROM OUR OWN JOB FAMILY... */
/* if this is going to the HNP, then send it direct if we don't know
* how to get there - otherwise, send it via the tree
*/
@ -431,16 +301,6 @@ found:
return *ret;
}
/*
 * Receive callback registered for the HNP's route-update ack.
 *
 * cbdata is the address of the caller's boolean "still waiting" flag;
 * the remaining arguments are not examined.
 */
static void recv_ack(int status, orte_process_name_t* sender,
                     opal_buffer_t *buffer,
                     orte_rml_tag_t tag, void *cbdata)
{
    /* clearing the flag marks the wait as complete */
    *(bool*)cbdata = false;
}
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
/* the radix module routes all proc communications through
@ -558,73 +418,6 @@ static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
return ORTE_SUCCESS;
}
/* if ndat != NULL, then this is being invoked by the proc to
* init a route to a specified process that is outside of our
* job family. We want that route to go through our HNP, routed via
* our local daemon - however, we cannot know for
* certain that the HNP already knows how to talk to the specified
* procs. For example, in OMPI's publish/subscribe procedures, the
* DPM framework looks for an mca param containing the global ompi-server's
* uri. This info will come here so the proc can setup a route to
* the server - we need to pass the routing info to our HNP
*/
if (NULL != ndat) {
int rc;
opal_buffer_t *xfer;
orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD;
bool ack_waiting;
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_radix: init routes w/non-NULL data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
/* if this is for a different job family, then we route via our HNP
* to minimize connection counts to entities such as ompi-server, so
* start by sending the contact info to the HNP for update
*/
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_radix_init_routes: diff job family - sending update to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));
/* prep the buffer for transmission to the HNP */
xfer = OBJ_NEW(opal_buffer_t);
opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD);
opal_dss.copy_payload(xfer, ndat);
/* save any new connections for use in subsequent connect_accept calls */
orte_routed_base_update_hnps(ndat);
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
ORTE_RML_TAG_RML_INFO_UPDATE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(xfer);
return rc;
}
/* wait right here until the HNP acks the update to ensure that
* any subsequent messaging can succeed
*/
ack_waiting = true;
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
ORTE_RML_TAG_UPDATE_ROUTE_ACK,
ORTE_RML_NON_PERSISTENT,
recv_ack, &ack_waiting);
ORTE_WAIT_FOR_COMPLETION(ack_waiting);
OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
"%s routed_radix_init_routes: ack recvd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* our get_route function automatically routes all messages for
* other job families via the HNP, so nothing more to do here
*/
}
return ORTE_SUCCESS;
}
/* if ndat=NULL, then we are being called during orte_init. In this
* case, we need to setup a few critical pieces of info
*/
@ -683,35 +476,12 @@ static int route_lost(const orte_process_name_t *route)
{
opal_list_item_t *item;
orte_routed_tree_t *child;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
int i;
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(route)));
/* if the route is to a different job family and we are the HNP, look it up */
if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(route->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_radix: route to %s lost",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(route->jobid)));
opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
OBJ_RELEASE(jfam);
break;
}
}
}
/* if we lose the connection to the lifeline and we are NOT already,
* in finalize, tell the OOB to abort.
* NOTE: we cannot call abort from here as the OOB needs to first
@ -750,34 +520,6 @@ static int route_lost(const orte_process_name_t *route)
static bool route_is_defined(const orte_process_name_t *target)
{
int i;
orte_routed_jobfam_t *jfam;
uint16_t jfamily;
/* if the route is to a different job family and we are the HNP, look it up */
if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
if (ORTE_PROC_IS_HNP) {
jfamily = ORTE_JOB_FAMILY(target->jobid);
for (i=0; i < orte_routed_jobfams.size; i++) {
if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
continue;
}
if (jfam->job_family == jfamily) {
OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
"%s routed_radix: route to %s is defined",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOB_FAMILY_PRINT(target->jobid)));
return true;
}
}
return false;
}
/* if we are not the HNP, then the answer is always true as
* we send it via the HNP
*/
return true;
}
/* find out what daemon hosts this proc */
if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
return false;
@ -920,42 +662,6 @@ static void get_routing_list(opal_list_t *coll)
orte_routed_base_xcast_routing(coll, &my_children);
}
/*
 * Add this module's wireup data to buf.
 *
 * HNP: nothing is added when static ports are in use; otherwise the
 * contact info for our own job is added via
 * orte_rml_base_get_contact_info().
 * Application proc: the hnp_uri of every entry stored in
 * orte_routed_jobfams is packed as an OPAL_STRING.
 * Any other proc type: nothing is added.
 *
 * @param buf  buffer that receives the wireup data
 * @return ORTE_SUCCESS, or the error code of the contact-info or pack
 *         call that failed (the error is also logged)
 */
static int get_wireup_info(opal_buffer_t *buf)
{
    int rc;
    int i;
    orte_routed_jobfam_t *jfam;

    if (ORTE_PROC_IS_HNP) {
        /* if we are not using static ports, then we need to share the
         * comm info - otherwise, just return
         */
        if (orte_static_ports) {
            return ORTE_SUCCESS;
        }

        if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, buf))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* if I am an application, this is occurring during connect_accept.
     * We need to return the stored information of other HNPs we
     * know about, if any
     */
    if (ORTE_PROC_IS_APP) {
        for (i=0; i < orte_routed_jobfams.size; i++) {
            jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i);
            if (NULL == jfam) {
                /* empty slot in the pointer array */
                continue;
            }
            /* report a pack failure instead of handing the caller a
             * partially filled buffer together with ORTE_SUCCESS
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(jfam->hnp_uri), 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
        return ORTE_SUCCESS;
    }

    return ORTE_SUCCESS;
}
static size_t num_routes(void)
{
return opal_list_get_size(&my_children);

Просмотреть файл

@ -5,7 +5,7 @@
* Copyright (c) 2004-2008 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -33,7 +33,7 @@ orte_routed_radix_component_t mca_routed_radix_component = {
information about the component itself */
.base_version = {
ORTE_ROUTED_BASE_VERSION_2_0_0,
ORTE_ROUTED_BASE_VERSION_3_0_0,
.mca_component_name = "radix",
MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION,

Просмотреть файл

@ -7,7 +7,7 @@
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -54,25 +54,6 @@ struct opal_buffer_t;
struct orte_rml_module_t;
/* ******************************************************************** */
/**
* routed component interface
*
* Component interface for the routed framework. A public instance of
* this structure, called mca_routed_[component name]_component, must
* exist in any routed component.
*/
/* v2.0.0 routed component descriptor: just the two base MCA blocks */
struct orte_routed_component_2_0_0_t {
/* Base component description */
mca_base_component_t base_version;
/* Base component data block */
mca_base_component_data_t base_data;
};
/** Convenience typedef */
typedef struct orte_routed_component_2_0_0_t orte_routed_component_t;
/* ******************************************************************** */
/**
* Initialize the routed module
@ -180,14 +161,6 @@ typedef int (*orte_routed_module_route_lost_fn_t)(const orte_process_name_t *rou
*/
typedef bool (*orte_routed_module_route_is_defined_fn_t)(const orte_process_name_t *target);
/**
* Get wireup data for daemons
*
* Add whatever routing data
* this module requires to allow inter-process messaging.
*/
typedef int (*orte_routed_module_get_wireup_info_fn_t)(opal_buffer_t *buf);
/*
* Update the module's routing plan
*
@ -242,7 +215,7 @@ typedef int (*orte_routed_module_ft_event_fn_t)(int state);
* instance of this module, orte_routed, provides an interface into the
* active routed interface.
*/
struct orte_routed_module_t {
typedef struct {
/** Startup/shutdown the communication system and clean up resources */
orte_routed_module_init_fn_t initialize;
orte_routed_module_finalize_fn_t finalize;
@ -257,24 +230,84 @@ struct orte_routed_module_t {
/* fns for daemons */
orte_routed_module_update_routing_plan_fn_t update_routing_plan;
orte_routed_module_get_routing_list_fn_t get_routing_list;
orte_routed_module_get_wireup_info_fn_t get_wireup_info;
orte_routed_module_num_routes_fn_t num_routes;
/* FT Notification */
orte_routed_module_ft_event_fn_t ft_event;
};
/** Convenience typedef */
typedef struct orte_routed_module_t orte_routed_module_t;
} orte_routed_module_t;
/** Interface for routed communication */
ORTE_DECLSPEC extern orte_routed_module_t orte_routed;
/* define an equivalent set of API functions - these will be implemented
* as "stubs" in the framework base.
*
* Apart from assign_module, every function type below takes a leading
* "char *module" argument ahead of its remaining arguments.
* NOTE(review): presumably "module" names the routed module the call is
* directed to - confirm against the framework base stubs. */
/* takes a "modules" string and returns a string
* NOTE(review): meaning and ownership of the returned string are not
* visible here - confirm */
typedef char* (*orte_routed_API_assign_module_fn_t)(char *modules);
/* delete_route: (module, proc) -> int status */
typedef int (*orte_routed_API_delete_route_fn_t)(char *module,
orte_process_name_t *proc);
/* update_route: (module, target, route) -> int status */
typedef int (*orte_routed_API_update_route_fn_t)(char *module,
orte_process_name_t *target,
orte_process_name_t *route);
/* get_route: (module, target) -> process name, returned by value */
typedef orte_process_name_t (*orte_routed_API_get_route_fn_t)(char *module,
orte_process_name_t *target);
/* init_routes: (module, job, ndat) -> int status */
typedef int (*orte_routed_API_init_routes_fn_t)(char *module,
orte_jobid_t job, opal_buffer_t *ndat);
/* route_lost: (module, route) -> int status; route is not modified */
typedef int (*orte_routed_API_route_lost_fn_t)(char *module,
const orte_process_name_t *route);
/* route_is_defined: (module, target) -> bool; target is not modified */
typedef bool (*orte_routed_API_route_is_defined_fn_t)(char *module,
const orte_process_name_t *target);
/* update_routing_plan: (module), no return value */
typedef void (*orte_routed_API_update_routing_plan_fn_t)(char *module);
/* get_routing_list: (module, coll), no return value; coll is the
* caller-supplied list */
typedef void (*orte_routed_API_get_routing_list_fn_t)(char *module, opal_list_t *coll);
/* set_lifeline: (module, proc) -> int status */
typedef int (*orte_routed_API_set_lifeline_fn_t)(char *module, orte_process_name_t *proc);
/* num_routes: (module) -> count as size_t */
typedef size_t (*orte_routed_API_num_routes_fn_t)(char *module);
/* ft_event: (module, state) -> int status */
typedef int (*orte_routed_API_ft_event_fn_t)(char *module, int state);
/* Table of routed API entry points, one function pointer per
* orte_routed_API_*_fn_t type.
* NOTE(review): presumably filled in by the framework base with its
* stub functions - confirm where the instance is defined. */
typedef struct {
/* API functions */
orte_routed_API_assign_module_fn_t assign_module;
orte_routed_API_delete_route_fn_t delete_route;
orte_routed_API_update_route_fn_t update_route;
orte_routed_API_get_route_fn_t get_route;
orte_routed_API_init_routes_fn_t init_routes;
orte_routed_API_route_lost_fn_t route_lost;
orte_routed_API_route_is_defined_fn_t route_is_defined;
orte_routed_API_set_lifeline_fn_t set_lifeline;
/* fns for daemons */
orte_routed_API_update_routing_plan_fn_t update_routing_plan;
orte_routed_API_get_routing_list_fn_t get_routing_list;
orte_routed_API_num_routes_fn_t num_routes;
/* FT Notification */
orte_routed_API_ft_event_fn_t ft_event;
} orte_routed_API_t;
/* provide an interface to the routed framework stub functions */
ORTE_DECLSPEC extern orte_routed_API_t orte_routed;
/* ******************************************************************** */
/**
* routed component interface
*
* Component interface for the routed framework. A public instance of
* this structure, called mca_routed_[component name]_component, must
* exist in any routed component.
*/
/* v3.0.0 routed component descriptor: the two base MCA blocks plus an
* integer priority */
struct orte_routed_component_3_0_0_t {
/* Base component description */
mca_base_component_t base_version;
/* Base component data block */
mca_base_component_data_t base_data;
/* priority
* NOTE(review): presumably ranks this component against other routed
* components - confirm how the framework base uses it */
int priority;
};
/** Convenience typedef */
typedef struct orte_routed_component_3_0_0_t orte_routed_component_t;
/* ******************************************************************** */
/** Macro for use in components that are of type routed.
*
* Each macro expands to ORTE_MCA_BASE_VERSION_2_1_0 with the framework
* name "routed" and the version triple spelled out in the macro's name. */
#define ORTE_ROUTED_BASE_VERSION_2_0_0 \
ORTE_MCA_BASE_VERSION_2_1_0("routed", 2, 0, 0)
#define ORTE_ROUTED_BASE_VERSION_3_0_0 \
ORTE_MCA_BASE_VERSION_2_1_0("routed", 3, 0, 0)
/* ******************************************************************** */

Просмотреть файл

@ -9,7 +9,7 @@
* All rights reserved.
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -767,7 +767,8 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
}
}
if (0 > (ret = orte_rml.send_buffer_nb(peer, loc_buffer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
peer, loc_buffer,
ORTE_RML_TAG_CKPT,
orte_rml_send_callback, NULL))) {
opal_output(orte_snapc_base_framework.framework_output,

Просмотреть файл

@ -21,9 +21,10 @@
#include "orte/runtime/orte_wait.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
@ -526,6 +527,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
orte_job_t *jdata;
orte_proc_t *pdata;
int i;
char *rtmod;
opal_output_verbose(5, orte_state_base_framework.framework_output,
"%s state:base:track_procs called for proc %s state %s",
@ -533,6 +535,9 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
ORTE_NAME_PRINT(proc),
orte_proc_state_to_str(state));
/* get our "lifeline" routed module */
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
/* get the job object for this proc */
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
@ -608,7 +613,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
* remain (might be some from another job)
*/
if (orte_orteds_term_ordered &&
0 == orte_routed.num_routes()) {
0 == orte_routed.num_routes(rtmod)) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
@ -663,12 +668,17 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
int32_t i32, *i32ptr;
uint32_t u32;
void *nptr;
char *rtmod;
opal_output_verbose(2, orte_state_base_framework.framework_output,
"%s state:base:check_job_complete on job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));
/* get our "lifeline" routed module */
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
/* just check to see if the daemons are complete */
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
@ -739,7 +749,7 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
*/
CHECK_DAEMONS:
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes(rtmod)) {
/* orteds are done! */
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
"%s orteds complete - exiting",

Просмотреть файл

@ -21,11 +21,13 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/filem/filem.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/plm/base/base.h"
#include "orte/mca/ras/base/base.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/nidmap.h"
#include "orte/util/session_dir.h"
@ -279,7 +281,7 @@ static void vm_ready(int fd, short args, void *cbdata)
opal_dss.pack(buf, &flag, 1, OPAL_INT8);
/* get wireup info for daemons per the selected routing module */
wireup = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (rc = orte_routed.get_wireup_info(wireup))) {
if (ORTE_SUCCESS != (rc = orte_rml_base_get_contact_info(ORTE_PROC_MY_NAME->jobid, wireup))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(wireup);
OBJ_RELEASE(buf);
@ -343,6 +345,7 @@ static void check_complete(int fd, short args, void *cbdata)
orte_node_t *node;
orte_job_map_t *map;
orte_std_cntr_t index;
char *rtmod;
opal_output_verbose(2, orte_state_base_framework.framework_output,
"%s state:dvm:check_job_complete on job %s",
@ -354,7 +357,8 @@ static void check_complete(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
"%s state:dvm:check_job_complete - received NULL job, checking daemons",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (0 == orte_routed.num_routes()) {
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
if (0 == orte_routed.num_routes(rtmod)) {
/* orteds are done! */
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
"%s orteds complete - exiting",

Просмотреть файл

@ -21,7 +21,7 @@
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/iof/iof.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/session_dir.h"
@ -231,7 +231,8 @@ static void track_jobs(int fd, short argc, void *cbdata)
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -253,6 +254,7 @@ static void track_procs(int fd, short argc, void *cbdata)
opal_buffer_t *alert;
int rc, i;
orte_plm_cmd_flag_t cmd;
char *rtmod;
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
"%s state:orted:track_procs called for proc %s state %s",
@ -308,7 +310,8 @@ static void track_procs(int fd, short argc, void *cbdata)
}
}
/* send it */
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -365,8 +368,9 @@ static void track_procs(int fd, short argc, void *cbdata)
* gone, then terminate ourselves IF no local procs
* remain (might be some from another job)
*/
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
if (orte_orteds_term_ordered &&
0 == orte_routed.num_routes()) {
0 == orte_routed.num_routes(rtmod)) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
@ -404,7 +408,8 @@ static void track_procs(int fd, short argc, void *cbdata)
"%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, alert,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -62,7 +62,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/iof/iof_types.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls.h"
@ -120,6 +120,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
char string[256], *string_ptr = string;
float pss;
opal_pstats_t pstat;
char *rtmod;
/* unpack the command */
n = 1;
@ -359,7 +360,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
/* flag that orteds were ordered to terminate */
orte_orteds_term_ordered = true;
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == (ret = orte_routed.num_routes())) {
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
if (0 == (ret = orte_routed.num_routes(rtmod))) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
@ -397,7 +399,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
orte_orteds_term_ordered = true;
if (ORTE_PROC_IS_HNP) {
/* if all my routes and local children are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
if (0 == orte_routed.num_routes(rtmod)) {
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
@ -497,7 +500,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
break;
}
/* send the buffer to our IOF */
orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, iofbuf, ORTE_RML_TAG_IOF_HNP,
orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_NAME, iofbuf, ORTE_RML_TAG_IOF_HNP,
orte_rml_send_callback, NULL);
}
for (i=1; i < orte_node_pool->size; i++) {
@ -567,7 +571,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
goto CLEANUP;
}
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -592,7 +597,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -661,7 +667,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
rc = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&jobdat, nptr, &nptr);
}
}
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -687,7 +694,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -756,7 +764,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
}
/* send the info */
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -782,7 +791,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
OBJ_RELEASE(answer);
goto CLEANUP;
}
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -900,7 +910,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
}
/* send the info */
if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -958,7 +969,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
goto SEND_TOP_ANSWER;
}
/* the callback function will release relay_msg buffer */
if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg,
if (0 > orte_rml.send_buffer_nb(orte_mgmt_conduit,
&proc2, relay_msg,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
@ -1009,7 +1021,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
goto SEND_TOP_ANSWER;
}
/* the callback function will release relay_msg buffer */
if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg,
if (0 > orte_rml.send_buffer_nb(orte_mgmt_conduit,
&proc2, relay_msg,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL)) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
@ -1073,7 +1086,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
ret = ORTE_ERR_COMM_FAILURE;
break;
}
if (0 > (ret = orte_rml.send_buffer_nb(return_addr, answer, ORTE_RML_TAG_TOOL,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
return_addr, answer, ORTE_RML_TAG_TOOL,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
@ -1151,7 +1165,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
free(gstack_exec);
}
/* always send our response */
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, answer,
ORTE_RML_TAG_STACK_TRACE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
@ -1189,7 +1204,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
opal_dss.pack(answer, &pss, 1, OPAL_FLOAT);
/* send it back */
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, answer,
ORTE_RML_TAG_MEMPROFILE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -696,14 +696,14 @@ int orte_daemon(int argc, char *argv[])
/* tell the routed module that we have a path
* back to the HNP
*/
if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
ORTE_ERROR_LOG(ret);
goto DONE;
}
/* set the lifeline to point to our parent so that we
* can handle the situation if that lifeline goes away
*/
if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
ORTE_ERROR_LOG(ret);
goto DONE;
}
@ -808,7 +808,8 @@ int orte_daemon(int argc, char *argv[])
}
/* send to the HNP's callback - will be routed if routes are available */
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
if (0 > (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
ORTE_PROC_MY_HNP, buffer,
ORTE_RML_TAG_ORTED_CALLBACK,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);

Просмотреть файл

@ -553,14 +553,14 @@ int orte_submit_init(int argc, char *argv[],
exit(1);
}
/* set the route to be direct */
if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) {
if (ORTE_SUCCESS != orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri);
orte_finalize();
exit(1);
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(ORTE_PROC_MY_HNP);
orte_routed.set_lifeline(NULL, ORTE_PROC_MY_HNP);
/* setup to listen for HNP response to my commands */
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFY_COMPLETE,
@ -634,7 +634,8 @@ int orte_submit_cancel(int index) {
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON,
rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
@ -657,7 +658,8 @@ int orte_submit_halt(void)
ORTE_ERROR_LOG(rc);
return rc;
}
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req,
rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, req,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL);
if (ORTE_SUCCESS != rc) {
@ -1041,7 +1043,9 @@ int orte_submit_job(char *argv[], int *index,
ORTE_ERROR_LOG(rc);
return rc;
}
orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL);
orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL);
/* Inform the caller of the tracker index if they passed a index pointer */
if (NULL != index) {
@ -2844,7 +2848,7 @@ static void run_debugger(char *basename, opal_cmd_line_t *cmd_line,
{
int i, id, ret;
char **new_argv = NULL;
const char **tmp;
const char **tmp = NULL;
char *value, **lines, *env_name;
/* Get the orte_base_debug MCA parameter and search for a debugger
@ -3174,7 +3178,8 @@ void orte_profile_wakeup(int sd, short args, void *cbdata)
for (i=0; i < nreports; i++) {
OBJ_RETAIN(buffer);
name.vpid = i;
if (0 > (rc = orte_rml.send_buffer_nb(&name, buffer,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&name, buffer,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -353,9 +353,9 @@ int pmix_server_init(void)
struct timeval timeout;
timeout.tv_sec = orte_pmix_server_globals.timeout;
timeout.tv_usec = 0;
if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) {
if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) {
/* try it one more time */
if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) {
if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) {
/* okay give up */
orte_show_help("help-orterun.txt", "orterun:server-not-found", true,
orte_basename, server,
@ -416,7 +416,8 @@ static void send_error(int status, opal_process_name_t *idreq,
return;
}
/* send the response */
orte_rml.send_buffer_nb(remote, reply,
orte_rml.send_buffer_nb(orte_mgmt_conduit,
remote, reply,
ORTE_RML_TAG_DIRECT_MODEX_RESP,
orte_rml_send_callback, NULL);
return;
@ -454,7 +455,8 @@ static void _mdxresp(int sd, short args, void *cbdata)
opal_dss.copy_payload(reply, &req->msg);
/* send the response */
orte_rml.send_buffer_nb(&req->proxy, reply,
orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->proxy, reply,
ORTE_RML_TAG_DIRECT_MODEX_RESP,
orte_rml_send_callback, NULL);

Просмотреть файл

@ -133,7 +133,8 @@ static void spawn(int sd, short args, void *cbdata)
}
/* send it to the HNP for processing - might be myself! */
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_PLM,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -230,7 +230,8 @@ static void dmodex_req(int sd, short args, void *cbdata)
}
/* send it to the host daemon */
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(&dmn->name, buf, ORTE_RML_TAG_DIRECT_MODEX,
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&dmn->name, buf, ORTE_RML_TAG_DIRECT_MODEX,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
opal_hotel_checkout(&orte_pmix_server_globals.reqs, req->room_num);

Просмотреть файл

@ -576,7 +576,8 @@ void pmix_server_log_fn(opal_process_name_t *requestor,
buf = OBJ_NEW(opal_buffer_t);
opal_dss.load(buf, val->data.bo.bytes, val->data.bo.size);
val->data.bo.bytes = NULL;
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_SHOW_HELP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -13,7 +13,7 @@
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
@ -67,7 +67,8 @@ static void execute(int sd, short args, void *cbdata)
opal_dss.copy_payload(xfer, &req->msg);
/* send the request to the target */
rc = orte_rml.send_buffer_nb(&req->target, xfer,
rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->target, xfer,
ORTE_RML_TAG_DATA_SERVER,
orte_rml_send_callback, NULL);
if (ORTE_SUCCESS == rc) {

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2016 Los Alamos National Security, LLC.
* All rights reserved
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -314,7 +314,8 @@ void orte_data_server(int status, orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&req->requestor)));
if (0 > (rc = orte_rml.send_buffer_nb(&req->requestor, reply, ORTE_RML_TAG_DATA_CLIENT,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&req->requestor, reply, ORTE_RML_TAG_DATA_CLIENT,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(reply);
@ -637,7 +638,8 @@ void orte_data_server(int status, orte_process_name_t* sender,
}
SEND_ANSWER:
if (0 > (rc = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_DATA_CLIENT,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
sender, answer, ORTE_RML_TAG_DATA_CLIENT,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(answer);

Просмотреть файл

@ -71,6 +71,10 @@ char *orte_basename = NULL;
bool orte_coprocessors_detected = false;
opal_hash_table_t *orte_coprocessors = NULL;
char *orte_topo_signature = NULL;
/* Comma-separated lists of transports to use for ORTE management
 * messages and for ORTE collectives, respectively. They start out NULL;
 * orte_register_params() assigns the defaults and registers each one
 * as an MCA string variable. */
char *orte_mgmt_transport = NULL;
char *orte_coll_transport = NULL;
/* Conduit identifiers. orte_mgmt_conduit is handed to the orte_rml entry
 * points (send_buffer_nb, ping, get_routed) as their first argument.
 * NOTE(review): -1 presumably marks "conduit not opened yet" and
 * orte_coll_conduit is presumably the collective counterpart - confirm
 * against the code that opens the conduits. */
int orte_mgmt_conduit = -1;
int orte_coll_conduit = -1;
/* ORTE OOB port flags */
bool orte_static_ports = false;

Просмотреть файл

@ -71,6 +71,10 @@ ORTE_DECLSPEC extern bool orte_event_base_active; /* instantiated in orte/runtim
ORTE_DECLSPEC extern bool orte_proc_is_bound; /* instantiated in orte/runtime/orte_init.c */
ORTE_DECLSPEC extern int orte_progress_thread_debug; /* instantiated in orte/runtime/orte_init.c */
/* Transport lists (comma-separated strings) for ORTE management messages
 * and ORTE collectives, followed by the matching conduit ids; the
 * management conduit id is the first argument passed to orte_rml calls
 * such as send_buffer_nb. */
ORTE_DECLSPEC extern char *orte_mgmt_transport;
ORTE_DECLSPEC extern char *orte_coll_transport;
ORTE_DECLSPEC extern int orte_mgmt_conduit;
ORTE_DECLSPEC extern int orte_coll_conduit;
/**
* Global indicating where this process was bound to at launch (will

Просмотреть файл

@ -765,5 +765,18 @@ int orte_register_params(void)
/* register a synonym for old name */
mca_base_var_register_synonym (id, "ompi", "ompi", "hostname", "cutoff", MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
/* get the conduit params */
orte_coll_transport = "fabric,ethernet";
(void) mca_base_var_register("orte", "orte", "coll", "transports",
"Comma-separated list of transports to use for ORTE collectives",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &orte_coll_transport);
orte_mgmt_transport = "oob,ethernet";
(void) mca_base_var_register("orte", "orte", "mgmt", "transports",
"Comma-separated list of transports to use for ORTE management messages",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &orte_mgmt_transport);
return ORTE_SUCCESS;
}

Просмотреть файл

@ -152,7 +152,7 @@ main(int argc, char *argv[]){
msg = (uint8_t*)malloc(msgsize);
opal_dss.pack(buf, msg, msgsize, OPAL_BYTE);
free(msg);
orte_rml.send_buffer_nb_conduit(conduit_id,&peer, buf, MY_TAG, orte_rml_send_callback, NULL);
orte_rml.send_buffer_nb(conduit_id,&peer, buf, MY_TAG, orte_rml_send_callback, NULL);
/* wait for it to come around */
OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t);
@ -180,7 +180,7 @@ main(int argc, char *argv[]){
opal_dss.copy_payload(buf, &blob.data);
OBJ_DESTRUCT(&blob);
msg_active = true;
orte_rml.send_buffer_nb_conduit(conduit_id,&peer, buf, MY_TAG, send_callback, NULL);
orte_rml.send_buffer_nb(conduit_id,&peer, buf, MY_TAG, send_callback, NULL);
ORTE_WAIT_FOR_COMPLETION(msg_active);
}
}

Просмотреть файл

@ -520,7 +520,10 @@ static void notify_requestor(int sd, short args, void *cbdata)
opal_dss.pack(reply, &pptr->node, 1, ORTE_NODE);
}
orte_rml.send_buffer_nb(&jdata->originator, reply, ORTE_RML_TAG_NOTIFY_COMPLETE, send_callback, jdata);
orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, reply,
ORTE_RML_TAG_NOTIFY_COMPLETE,
send_callback, jdata);
/* we cannot cleanup the job object as we might
* hit an error during transmission, so clean it

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -49,6 +49,7 @@
#include "opal/mca/event/event.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/routed/routed.h"
@ -184,9 +185,10 @@ static void send_cmd(int fd, short dummy, void *arg)
num_recvd = 0;
buf = OBJ_NEW(opal_buffer_t);
opal_dss.copy_payload(buf, &cmdbuf);
if (0 > (ret = orte_rml.send_buffer_nb(&(target_hnp->name), buf,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&(target_hnp->name), buf,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buf);
orte_quit(0,0,NULL);
@ -207,6 +209,7 @@ main(int argc, char *argv[])
int i;
orte_vpid_t vstart, vend;
int vint;
char *rtmod;
/***************
* Initialize
@ -273,6 +276,9 @@ main(int argc, char *argv[])
return 1;
}
/* get our routed module */
rtmod = orte_rml.get_routed(orte_mgmt_conduit);
/* setup the list for recvd stats */
OBJ_CONSTRUCT(&recvd_stats, opal_list_t);
@ -423,7 +429,7 @@ main(int argc, char *argv[])
exit(1);
}
/* set the route to be direct */
if (ORTE_SUCCESS != orte_routed.update_route(&target_hnp->name, &target_hnp->name)) {
if (ORTE_SUCCESS != orte_routed.update_route(rtmod, &target_hnp->name, &target_hnp->name)) {
orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, target_hnp->rml_uri);
orte_finalize();
exit(1);
@ -435,7 +441,7 @@ main(int argc, char *argv[])
}
/* set the target hnp as our lifeline so we will terminate if it exits */
orte_routed.set_lifeline(&target_hnp->name);
orte_routed.set_lifeline(rtmod, &target_hnp->name);
/* if an output file was specified, open it */
if (NULL != logfile) {

Просмотреть файл

@ -332,7 +332,8 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key)
return "RML-DESIRED-PROVIDERS";
case ORTE_RML_PROTOCOL_ATTRIB:
return "RML-DESIRED-PROTOCOLS";
case ORTE_RML_ROUTED_ATTRIB:
return "RML-DESIRED-ROUTED-MODULES";
default:
return "UNKNOWN-KEY";
}

Просмотреть файл

@ -186,16 +186,16 @@ typedef uint16_t orte_proc_flags_t;
/*** RML ATTRIBUTE keys ***/
#define ORTE_RML_START_KEY ORTE_PROC_MAX_KEY
#define ORTE_RML_TRANSPORT_TYPE (ORTE_RML_START_KEY + 1) // string - null terminated string containing transport type
#define ORTE_RML_PROTOCOL_TYPE (ORTE_RML_START_KEY + 2) // string - protocol type (e.g., as returned by fi_info)
#define ORTE_RML_CONDUIT_ID (ORTE_RML_START_KEY + 3) // orte_rml_conduit_t - conduit_id for this transport
#define ORTE_RML_INCLUDE_COMP_ATTRIB (ORTE_RML_START_KEY + 4) // string - comma delimited list of RML component names to be considered
#define ORTE_RML_EXCLUDE_COMP_ATTRIB (ORTE_RML_START_KEY + 5) // string - comma delimited list of RML component names to be excluded
#define ORTE_RML_TRANSPORT_ATTRIB (ORTE_RML_START_KEY + 6) // string - comma delimited list of transport types to be considered (e.g., "fabric,ethernet")
#define ORTE_RML_QUALIFIER_ATTRIB (ORTE_RML_START_KEY + 7) // string - comma delimited list of qualifiers (e.g., routed=direct,bandwidth=xxx)
#define ORTE_RML_PROVIDER_ATTRIB (ORTE_RML_START_KEY + 8) // string - comma delimited list of provider names to be considered
#define ORTE_RML_PROTOCOL_ATTRIB (ORTE_RML_START_KEY + 9) // string - comma delimited list of protocols to be considered (e.g., tcp,udp)
#define ORTE_RML_TRANSPORT_TYPE (ORTE_RML_START_KEY + 1) // string - null terminated string containing transport type
#define ORTE_RML_PROTOCOL_TYPE (ORTE_RML_START_KEY + 2) // string - protocol type (e.g., as returned by fi_info)
#define ORTE_RML_CONDUIT_ID (ORTE_RML_START_KEY + 3) // orte_rml_conduit_t - conduit_id for this transport
#define ORTE_RML_INCLUDE_COMP_ATTRIB (ORTE_RML_START_KEY + 4) // string - comma delimited list of RML component names to be considered
#define ORTE_RML_EXCLUDE_COMP_ATTRIB (ORTE_RML_START_KEY + 5) // string - comma delimited list of RML component names to be excluded
#define ORTE_RML_TRANSPORT_ATTRIB (ORTE_RML_START_KEY + 6) // string - comma delimited list of transport types to be considered (e.g., "fabric,ethernet")
#define ORTE_RML_QUALIFIER_ATTRIB (ORTE_RML_START_KEY + 7) // string - comma delimited list of qualifiers (e.g., routed=direct,bandwidth=xxx)
#define ORTE_RML_PROVIDER_ATTRIB (ORTE_RML_START_KEY + 8) // string - comma delimited list of provider names to be considered
#define ORTE_RML_PROTOCOL_ATTRIB (ORTE_RML_START_KEY + 9) // string - comma delimited list of protocols to be considered (e.g., tcp,udp)
#define ORTE_RML_ROUTED_ATTRIB (ORTE_RML_START_KEY + 10) // string - comma delimited list of routed modules to be considered
#define ORTE_ATTR_KEY_MAX 1000

Просмотреть файл

@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -54,8 +54,8 @@ static void quicktime_cb(int fd, short event, void *cbdata)
{
/* release the timer */
if (NULL != quicktime) {
opal_event_free(quicktime);
quicktime = NULL;
opal_event_free(quicktime);
quicktime = NULL;
}
/* cancel the recv */
@ -73,7 +73,7 @@ static void send_cbfunc(int status, orte_process_name_t* sender,
/* cancel the timer */
if (NULL != quicktime) {
opal_event_free(quicktime);
quicktime = NULL;
quicktime = NULL;
}
/* declare the work done */
timer_fired = true;
@ -90,7 +90,7 @@ static void recv_info(int status, orte_process_name_t* sender,
/* cancel the timer */
if (NULL != quicktime) {
opal_event_free (quicktime);
quicktime = NULL;
quicktime = NULL;
}
/* xfer the answer */
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&answer, buffer))) {
@ -120,7 +120,7 @@ int orte_util_comm_connect_tool(char *uri)
}
/* set the route to be direct */
if (ORTE_SUCCESS != (rc = orte_routed.update_route(&tool, &tool))) {
if (ORTE_SUCCESS != (rc = orte_routed.update_route(NULL, &tool, &tool))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -185,7 +185,8 @@ int orte_util_comm_report_event(orte_comm_event_t ev)
opal_event_evtimer_add(quicktime, &tv);
/* do the send */
if (0 > (rc = orte_rml.send_buffer_nb(&tool, buf, ORTE_RML_TAG_TOOL, send_cbfunc, NULL))) {
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&tool, buf, ORTE_RML_TAG_TOOL, send_cbfunc, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
return rc;
@ -275,7 +276,9 @@ int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t j
opal_event_evtimer_add(quicktime, &tv);
/* do the send */
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, send_cbfunc, NULL))) {
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
(orte_process_name_t*)hnp, cmd,
ORTE_RML_TAG_DAEMON, send_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
@ -382,7 +385,9 @@ int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
opal_event_evtimer_add(quicktime, &tv);
/* do the send */
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, send_cbfunc, NULL))) {
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
(orte_process_name_t*)hnp, cmd,
ORTE_RML_TAG_DAEMON, send_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
return ret;
@ -498,7 +503,8 @@ int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t
opal_event_evtimer_add(quicktime, &tv);
/* do the send */
if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON,
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
(orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON,
send_cbfunc, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cmd);
@ -647,7 +653,8 @@ int orte_util_comm_spawn_job(const orte_process_name_t *hnp, orte_job_t *jdata)
ORTE_NAME_PRINT(hnp)));
/* tell the target HNP to launch the job */
if (0 > (rc = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, buf,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
(orte_process_name_t*)hnp, buf,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -728,7 +735,8 @@ int orte_util_comm_terminate_job(const orte_process_name_t *hnp, orte_jobid_t jo
ORTE_NAME_PRINT(hnp)));
/* tell the target HNP to terminate the job */
if (0 > (rc = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, buf,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
(orte_process_name_t*)hnp, buf,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -785,7 +793,8 @@ int orte_util_comm_halt_vm(const orte_process_name_t *hnp)
}
/* send the order */
if (0 > (rc = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, buf,
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
(orte_process_name_t*)hnp, buf,
ORTE_RML_TAG_DAEMON,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);

Просмотреть файл

@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -143,7 +144,7 @@ int orte_read_hnp_contact_file(char *filename, orte_hnp_contact_t *hnp, bool con
}
/* set the route to be direct */
if (ORTE_SUCCESS != (rc = orte_routed.update_route(&hnp->name, &hnp->name))) {
if (ORTE_SUCCESS != (rc = orte_routed.update_route(NULL, &hnp->name, &hnp->name))) {
ORTE_ERROR_LOG(rc);
free(hnp_uri);
return rc;
@ -161,9 +162,9 @@ static char *orte_getline(FILE *fp)
ret = fgets(input, ORTE_HNP_CONTACT_FILE_MAX_LINE_LENGTH, fp);
if (NULL != ret) {
input[strlen(input)-1] = '\0'; /* remove newline */
buff = strdup(input);
return buff;
input[strlen(input)-1] = '\0'; /* remove newline */
buff = strdup(input);
return buff;
}
return NULL;

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -691,7 +692,8 @@ int orte_show_help_norender(const char *filename, const char *topic,
/* if we are a daemon, then send it via RML to the HNP */
if (ORTE_PROC_IS_DAEMON) {
/* send it to the HNP */
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_SHOW_HELP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -784,7 +786,8 @@ int orte_show_help_suppress(const char *filename, const char *topic)
/* pack the flag that we DO NOT have a string */
opal_dss.pack(buf, &have_output, 1, OPAL_INT8);
/* send it to the HNP */
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
ORTE_PROC_MY_HNP, buf,
ORTE_RML_TAG_SHOW_HELP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);