Use the RTE framework instead of calling ORTE directly.
Brian (rightfully) hit me on the head with the "don't use ORTE, use the RTE framework" clue bat; the usnic BTL now plays nicely with the RTE framework. This commit was SVN r28907.
Этот коммит содержится в:
родитель
ca9da8a554
Коммит
4b6006402d
@ -111,9 +111,9 @@ typedef struct ompi_btl_usnic_component_t {
|
|||||||
char *if_exclude;
|
char *if_exclude;
|
||||||
uint32_t *vendor_part_ids;
|
uint32_t *vendor_part_ids;
|
||||||
|
|
||||||
/* Cached hashed version of my ORTE proc name (to stuff in
|
/* Cached hashed version of my RTE proc name (to stuff in
|
||||||
protocol headers) */
|
protocol headers) */
|
||||||
uint64_t my_hashed_orte_name;
|
uint64_t my_hashed_rte_name;
|
||||||
|
|
||||||
/** array of available BTLs */
|
/** array of available BTLs */
|
||||||
struct ompi_btl_usnic_module_t* usnic_modules;
|
struct ompi_btl_usnic_module_t* usnic_modules;
|
||||||
|
@ -49,11 +49,9 @@
|
|||||||
#include "opal/util/if.h"
|
#include "opal/util/if.h"
|
||||||
#include "opal/mca/base/mca_base_var.h"
|
#include "opal/mca/base/mca_base_var.h"
|
||||||
#include "opal/mca/memchecker/base/base.h"
|
#include "opal/mca/memchecker/base/base.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "ompi/mca/rte/rte.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
|
|
||||||
#include "ompi/constants.h"
|
#include "ompi/constants.h"
|
||||||
#include "ompi/mca/btl/btl.h"
|
#include "ompi/mca/btl/btl.h"
|
||||||
#include "ompi/mca/btl/base/base.h"
|
#include "ompi/mca/btl/base/base.h"
|
||||||
@ -275,9 +273,9 @@ static int check_reg_mem_basics(void)
|
|||||||
asprintf(&str_limit, "Unknown");
|
asprintf(&str_limit, "Unknown");
|
||||||
}
|
}
|
||||||
|
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "check_reg_mem_basics fail",
|
opal_show_help("help-mpi-btl-usnic.txt", "check_reg_mem_basics fail",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
str_limit);
|
str_limit);
|
||||||
|
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
@ -352,8 +350,8 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
/* initialization */
|
/* initialization */
|
||||||
mca_btl_usnic_component.my_hashed_orte_name =
|
mca_btl_usnic_component.my_hashed_rte_name =
|
||||||
orte_util_hash_name(&(ompi_proc_local()->proc_name));
|
ompi_rte_hash_name(&(ompi_proc_local()->proc_name));
|
||||||
|
|
||||||
seed_prng();
|
seed_prng();
|
||||||
|
|
||||||
@ -375,7 +373,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
malloc(mca_btl_usnic_component.num_modules *
|
malloc(mca_btl_usnic_component.num_modules *
|
||||||
sizeof(ompi_btl_usnic_module_t*));
|
sizeof(ompi_btl_usnic_module_t*));
|
||||||
if (NULL == btls) {
|
if (NULL == btls) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
btls = NULL;
|
btls = NULL;
|
||||||
goto free_include_list;
|
goto free_include_list;
|
||||||
}
|
}
|
||||||
@ -386,7 +383,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
sizeof(ompi_btl_usnic_module_t));
|
sizeof(ompi_btl_usnic_module_t));
|
||||||
if (NULL == mca_btl_usnic_component.usnic_modules) {
|
if (NULL == mca_btl_usnic_component.usnic_modules) {
|
||||||
free(btls);
|
free(btls);
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
btls = NULL;
|
btls = NULL;
|
||||||
goto free_include_list;
|
goto free_include_list;
|
||||||
}
|
}
|
||||||
@ -415,7 +411,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
filter = NULL;
|
filter = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
num_local_procs = orte_process_info.num_local_peers;
|
num_local_procs = ompi_process_info.num_local_peers;
|
||||||
|
|
||||||
/* Go through the list of ports and determine if we want it or
|
/* Go through the list of ports and determine if we want it or
|
||||||
not. Create and (mostly) fill a module struct for each port
|
not. Create and (mostly) fill a module struct for each port
|
||||||
@ -467,9 +463,9 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
|
|
||||||
/* Query this device */
|
/* Query this device */
|
||||||
if (0 != ibv_query_device(module->device_context, &device_attr)) {
|
if (0 != ibv_query_device(module->device_context, &device_attr)) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_query_device", __FILE__, __LINE__,
|
"ibv_query_device", __FILE__, __LINE__,
|
||||||
@ -487,10 +483,10 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
char *str;
|
char *str;
|
||||||
asprintf(&str, "Not enough usNIC QPs (found %d, need %d)",
|
asprintf(&str, "Not enough usNIC QPs (found %d, need %d)",
|
||||||
device_attr.max_qp, num_local_procs * 2);
|
device_attr.max_qp, num_local_procs * 2);
|
||||||
orte_show_help("help-mpi-btl-usnic.txt",
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
"not enough usnic resources",
|
"not enough usnic resources",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
str);
|
str);
|
||||||
free(str);
|
free(str);
|
||||||
@ -501,10 +497,10 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
char *str;
|
char *str;
|
||||||
asprintf(&str, "Not enough usNIC CQs (found %d, need %d)",
|
asprintf(&str, "Not enough usNIC CQs (found %d, need %d)",
|
||||||
device_attr.max_cq, num_local_procs * 2);
|
device_attr.max_cq, num_local_procs * 2);
|
||||||
orte_show_help("help-mpi-btl-usnic.txt",
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
"not enough usnic resources",
|
"not enough usnic resources",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
str);
|
str);
|
||||||
free(str);
|
free(str);
|
||||||
@ -538,7 +534,8 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
* override.
|
* override.
|
||||||
*/
|
*/
|
||||||
if (-1 == mca_btl_usnic_component.prio_sd_num) {
|
if (-1 == mca_btl_usnic_component.prio_sd_num) {
|
||||||
module->prio_sd_num = max(128, 32*orte_process_info.num_procs) - 1;
|
module->prio_sd_num =
|
||||||
|
max(128, 32 * ompi_process_info.num_procs) - 1;
|
||||||
} else {
|
} else {
|
||||||
module->prio_sd_num = mca_btl_usnic_component.prio_sd_num;
|
module->prio_sd_num = mca_btl_usnic_component.prio_sd_num;
|
||||||
}
|
}
|
||||||
@ -546,7 +543,8 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
|
|||||||
module->prio_sd_num = device_attr.max_qp_wr;
|
module->prio_sd_num = device_attr.max_qp_wr;
|
||||||
}
|
}
|
||||||
if (-1 == mca_btl_usnic_component.prio_rd_num) {
|
if (-1 == mca_btl_usnic_component.prio_rd_num) {
|
||||||
module->prio_rd_num = max(128, 32*orte_process_info.num_procs) - 1;
|
module->prio_rd_num =
|
||||||
|
max(128, 32 * ompi_process_info.num_procs) - 1;
|
||||||
} else {
|
} else {
|
||||||
module->prio_rd_num = mca_btl_usnic_component.prio_rd_num;
|
module->prio_rd_num = mca_btl_usnic_component.prio_rd_num;
|
||||||
}
|
}
|
||||||
@ -908,7 +906,7 @@ static int usnic_component_progress(void)
|
|||||||
static void seed_prng(void)
|
static void seed_prng(void)
|
||||||
{
|
{
|
||||||
unsigned short seedv[3];
|
unsigned short seedv[3];
|
||||||
seedv[0] = ORTE_PROC_MY_NAME->vpid;
|
seedv[0] = OMPI_PROC_MY_NAME->vpid;
|
||||||
seedv[1] = opal_timer_base_get_cycles();
|
seedv[1] = opal_timer_base_get_cycles();
|
||||||
usleep(1);
|
usleep(1);
|
||||||
seedv[2] = opal_timer_base_get_cycles();
|
seedv[2] = opal_timer_base_get_cycles();
|
||||||
@ -963,9 +961,9 @@ static int init_module_from_port(ompi_btl_usnic_module_t *module,
|
|||||||
module->port_num,
|
module->port_num,
|
||||||
mca_btl_usnic_component.gid_index, &gid)) {
|
mca_btl_usnic_component.gid_index, &gid)) {
|
||||||
opal_memchecker_base_mem_defined(&gid, sizeof(gid));
|
opal_memchecker_base_mem_defined(&gid, sizeof(gid));
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_query_gid", __FILE__, __LINE__,
|
"ibv_query_gid", __FILE__, __LINE__,
|
||||||
@ -1011,9 +1009,9 @@ static int init_module_from_port(ompi_btl_usnic_module_t *module,
|
|||||||
/* If we don't get OMPI_SUCCESS, then we weren't able
|
/* If we don't get OMPI_SUCCESS, then we weren't able
|
||||||
to figure out what the bandwidth was of this port.
|
to figure out what the bandwidth was of this port.
|
||||||
That's a bad sign. Let's ignore this port. */
|
That's a bad sign. Let's ignore this port. */
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "verbs_port_bw failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "verbs_port_bw failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num);
|
module->port_num);
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
@ -1077,7 +1075,6 @@ static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
|
|||||||
/* Get a wrapper for the filter */
|
/* Get a wrapper for the filter */
|
||||||
filter = calloc(sizeof(*filter), 1);
|
filter = calloc(sizeof(*filter), 1);
|
||||||
if (NULL == filter) {
|
if (NULL == filter) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1090,7 +1087,6 @@ static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
|
|||||||
/* upper bound: each entry could be a mask */
|
/* upper bound: each entry could be a mask */
|
||||||
filter->elts = malloc(sizeof(*filter->elts) * n_argv);
|
filter->elts = malloc(sizeof(*filter->elts) * n_argv);
|
||||||
if (NULL == filter->elts) {
|
if (NULL == filter->elts) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
free(filter);
|
free(filter);
|
||||||
opal_argv_free(argv);
|
opal_argv_free(argv);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -1120,8 +1116,8 @@ static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
|
|||||||
tmp = strdup(argv[i]);
|
tmp = strdup(argv[i]);
|
||||||
str = strchr(argv[i], '/');
|
str = strchr(argv[i], '/');
|
||||||
if (NULL == str) {
|
if (NULL == str) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
|
opal_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
|
||||||
true, name, orte_process_info.nodename,
|
true, name, ompi_process_info.nodename,
|
||||||
tmp, "Invalid specification (missing \"/\")");
|
tmp, "Invalid specification (missing \"/\")");
|
||||||
free(tmp);
|
free(tmp);
|
||||||
continue;
|
continue;
|
||||||
@ -1129,8 +1125,8 @@ static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
|
|||||||
*str = '\0';
|
*str = '\0';
|
||||||
argv_prefix = atoi(str + 1);
|
argv_prefix = atoi(str + 1);
|
||||||
if (argv_prefix < 1 || argv_prefix > 32) {
|
if (argv_prefix < 1 || argv_prefix > 32) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
|
opal_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
|
||||||
true, name, orte_process_info.nodename,
|
true, name, ompi_process_info.nodename,
|
||||||
tmp, "Invalid specification (prefix < 1 or prefix >32)");
|
tmp, "Invalid specification (prefix < 1 or prefix >32)");
|
||||||
free(tmp);
|
free(tmp);
|
||||||
continue;
|
continue;
|
||||||
@ -1141,8 +1137,8 @@ static usnic_if_filter_t *parse_ifex_str(const char *orig_str,
|
|||||||
ret = inet_pton(AF_INET, argv[i],
|
ret = inet_pton(AF_INET, argv[i],
|
||||||
&((struct sockaddr_in*) &argv_inaddr)->sin_addr);
|
&((struct sockaddr_in*) &argv_inaddr)->sin_addr);
|
||||||
if (1 != ret) {
|
if (1 != ret) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
|
opal_show_help("help-mpi-btl-usnic.txt", "invalid if_inexclude",
|
||||||
true, name, orte_process_info.nodename, tmp,
|
true, name, ompi_process_info.nodename, tmp,
|
||||||
"Invalid specification (inet_pton() failed)");
|
"Invalid specification (inet_pton() failed)");
|
||||||
free(tmp);
|
free(tmp);
|
||||||
continue;
|
continue;
|
||||||
|
@ -30,8 +30,7 @@
|
|||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "opal/prefetch.h"
|
#include "opal/prefetch.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
|
|
||||||
#include "ompi/types.h"
|
#include "ompi/types.h"
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ common_send_seg_helper(
|
|||||||
bseg = &seg->ss_base;
|
bseg = &seg->ss_base;
|
||||||
|
|
||||||
bseg->us_btl_header = (ompi_btl_usnic_btl_header_t *)bseg->us_list.ptr;
|
bseg->us_btl_header = (ompi_btl_usnic_btl_header_t *)bseg->us_list.ptr;
|
||||||
bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_orte_name;
|
bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name;
|
||||||
|
|
||||||
/* build verbs work request descriptor */
|
/* build verbs work request descriptor */
|
||||||
seg->ss_send_desc.wr_id = (unsigned long) seg;
|
seg->ss_send_desc.wr_id = (unsigned long) seg;
|
||||||
|
@ -94,7 +94,7 @@ typedef enum {
|
|||||||
* holes.
|
* holes.
|
||||||
*/
|
*/
|
||||||
typedef struct {
|
typedef struct {
|
||||||
/* Hashed ORTE process name of the sender */
|
/* Hashed RTE process name of the sender */
|
||||||
uint64_t sender;
|
uint64_t sender;
|
||||||
|
|
||||||
/* Sliding window sequence number (echoed back in an ACK). This
|
/* Sliding window sequence number (echoed back in an ACK). This
|
||||||
|
@ -30,10 +30,9 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/datatype/opal_convertor.h"
|
#include "opal/datatype/opal_convertor.h"
|
||||||
#include "opal/include/opal_stdint.h"
|
#include "opal/include/opal_stdint.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
#include "orte/util/show_help.h"
|
#include "ompi/mca/rte/rte.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
|
|
||||||
#include "ompi/mca/btl/btl.h"
|
#include "ompi/mca/btl/btl.h"
|
||||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||||
#include "ompi/mca/mpool/base/base.h"
|
#include "ompi/mca/mpool/base/base.h"
|
||||||
@ -1314,9 +1313,9 @@ static void module_async_event_callback(int fd, short flags, void *arg)
|
|||||||
case IBV_EVENT_GID_CHANGE:
|
case IBV_EVENT_GID_CHANGE:
|
||||||
#endif
|
#endif
|
||||||
default:
|
default:
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "async event",
|
opal_show_help("help-mpi-btl-usnic.txt", "async event",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
ibv_event_type_str(event.event_type),
|
ibv_event_type_str(event.event_type),
|
||||||
@ -1376,9 +1375,9 @@ init_qp(
|
|||||||
job is consuming QPs. */
|
job is consuming QPs. */
|
||||||
channel->qp = ibv_create_qp(module->pd, &qp_init_attr);
|
channel->qp = ibv_create_qp(module->pd, &qp_init_attr);
|
||||||
if (NULL == channel->qp) {
|
if (NULL == channel->qp) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "create ibv resource failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "create ibv resource failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
"ibv_create_qp()", __FILE__, __LINE__,
|
"ibv_create_qp()", __FILE__, __LINE__,
|
||||||
"Failed to create a usNIC queue pair");
|
"Failed to create a usNIC queue pair");
|
||||||
@ -1395,9 +1394,9 @@ init_qp(
|
|||||||
|
|
||||||
if (ibv_modify_qp(channel->qp, &qp_attr,
|
if (ibv_modify_qp(channel->qp, &qp_attr,
|
||||||
IBV_QP_STATE | IBV_QP_PORT)) {
|
IBV_QP_STATE | IBV_QP_PORT)) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_modify_qp()", __FILE__, __LINE__,
|
"ibv_modify_qp()", __FILE__, __LINE__,
|
||||||
@ -1410,9 +1409,9 @@ init_qp(
|
|||||||
memset(&qp_init_attr, 0, sizeof(qp_init_attr));
|
memset(&qp_init_attr, 0, sizeof(qp_init_attr));
|
||||||
if (ibv_query_qp(channel->qp, &qp_attr, IBV_QP_CAP,
|
if (ibv_query_qp(channel->qp, &qp_attr, IBV_QP_CAP,
|
||||||
&qp_init_attr) != 0) {
|
&qp_init_attr) != 0) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_query_qp()", __FILE__, __LINE__,
|
"ibv_query_qp()", __FILE__, __LINE__,
|
||||||
@ -1438,9 +1437,9 @@ static int move_qp_to_rtr(ompi_btl_usnic_module_t *module,
|
|||||||
|
|
||||||
qp_attr.qp_state = IBV_QPS_RTR;
|
qp_attr.qp_state = IBV_QPS_RTR;
|
||||||
if (ibv_modify_qp(channel->qp, &qp_attr, IBV_QP_STATE)) {
|
if (ibv_modify_qp(channel->qp, &qp_attr, IBV_QP_STATE)) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_modify_qp", __FILE__, __LINE__,
|
"ibv_modify_qp", __FILE__, __LINE__,
|
||||||
@ -1461,9 +1460,9 @@ static int move_qp_to_rts(ompi_btl_usnic_module_t *module,
|
|||||||
|
|
||||||
qp_attr.qp_state = IBV_QPS_RTS;
|
qp_attr.qp_state = IBV_QPS_RTS;
|
||||||
if (ibv_modify_qp(channel->qp, &qp_attr, IBV_QP_STATE)) {
|
if (ibv_modify_qp(channel->qp, &qp_attr, IBV_QP_STATE)) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_modify_qp", __FILE__, __LINE__,
|
"ibv_modify_qp", __FILE__, __LINE__,
|
||||||
@ -1534,9 +1533,9 @@ ompi_btl_usnic_channel_init(
|
|||||||
job is consuming CQs. */
|
job is consuming CQs. */
|
||||||
channel->cq = ibv_create_cq(ctx, module->cq_num, NULL, NULL, 0);
|
channel->cq = ibv_create_cq(ctx, module->cq_num, NULL, NULL, 0);
|
||||||
if (NULL == channel->cq) {
|
if (NULL == channel->cq) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "create ibv resource failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "create ibv resource failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
"ibv_create_cq()", __FILE__, __LINE__,
|
"ibv_create_cq()", __FILE__, __LINE__,
|
||||||
"Failed to create a usNIC completion queue");
|
"Failed to create a usNIC completion queue");
|
||||||
@ -1570,10 +1569,10 @@ ompi_btl_usnic_channel_init(
|
|||||||
rseg = (ompi_btl_usnic_recv_segment_t*)item;
|
rseg = (ompi_btl_usnic_recv_segment_t*)item;
|
||||||
|
|
||||||
if (NULL == rseg) {
|
if (NULL == rseg) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt",
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
"internal error during init",
|
"internal error during init",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"get freelist buffer()", __FILE__, __LINE__,
|
"get freelist buffer()", __FILE__, __LINE__,
|
||||||
@ -1586,9 +1585,9 @@ ompi_btl_usnic_channel_init(
|
|||||||
rseg->rs_recv_desc.next = NULL;
|
rseg->rs_recv_desc.next = NULL;
|
||||||
|
|
||||||
if (ibv_post_recv(channel->qp, &rseg->rs_recv_desc, &bad_wr)) {
|
if (ibv_post_recv(channel->qp, &rseg->rs_recv_desc, &bad_wr)) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_post_recv", __FILE__, __LINE__,
|
"ibv_post_recv", __FILE__, __LINE__,
|
||||||
@ -1641,15 +1640,15 @@ int ompi_btl_usnic_module_init(ompi_btl_usnic_module_t *module)
|
|||||||
/* Setup the pointer array for the procs that will be used by this
|
/* Setup the pointer array for the procs that will be used by this
|
||||||
module */
|
module */
|
||||||
OBJ_CONSTRUCT(&module->all_procs, opal_pointer_array_t);
|
OBJ_CONSTRUCT(&module->all_procs, opal_pointer_array_t);
|
||||||
opal_pointer_array_init(&module->all_procs, orte_process_info.num_procs,
|
opal_pointer_array_init(&module->all_procs, ompi_process_info.num_procs,
|
||||||
INT_MAX, 32);
|
INT_MAX, 32);
|
||||||
|
|
||||||
/* Get a PD */
|
/* Get a PD */
|
||||||
module->pd = ibv_alloc_pd(ctx);
|
module->pd = ibv_alloc_pd(ctx);
|
||||||
if (NULL == module->pd) {
|
if (NULL == module->pd) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_alloc_pd()", __FILE__, __LINE__,
|
"ibv_alloc_pd()", __FILE__, __LINE__,
|
||||||
@ -1668,9 +1667,9 @@ int ompi_btl_usnic_module_init(ompi_btl_usnic_module_t *module)
|
|||||||
mca_mpool_base_module_create(mca_btl_usnic_component.usnic_mpool_name,
|
mca_mpool_base_module_create(mca_btl_usnic_component.usnic_mpool_name,
|
||||||
&module->super, &mpool_resources);
|
&module->super, &mpool_resources);
|
||||||
if (NULL == module->super.btl_mpool) {
|
if (NULL == module->super.btl_mpool) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"create mpool", __FILE__, __LINE__,
|
"create mpool", __FILE__, __LINE__,
|
||||||
|
@ -25,10 +25,9 @@
|
|||||||
|
|
||||||
#include "opal_stdint.h"
|
#include "opal_stdint.h"
|
||||||
#include "opal/util/arch.h"
|
#include "opal/util/arch.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "ompi/mca/rte/rte.h"
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
|
|
||||||
#include "ompi/runtime/ompi_module_exchange.h"
|
#include "ompi/runtime/ompi_module_exchange.h"
|
||||||
#include "ompi/constants.h"
|
#include "ompi/constants.h"
|
||||||
|
|
||||||
@ -107,12 +106,12 @@ ompi_btl_usnic_proc_lookup_ompi(ompi_proc_t* ompi_proc)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Look for an existing usnic proc based on a hashed ORTE process
|
* Look for an existing usnic proc based on a hashed RTE process
|
||||||
* name.
|
* name.
|
||||||
*/
|
*/
|
||||||
ompi_btl_usnic_endpoint_t *
|
ompi_btl_usnic_endpoint_t *
|
||||||
ompi_btl_usnic_proc_lookup_endpoint(ompi_btl_usnic_module_t *receiver,
|
ompi_btl_usnic_proc_lookup_endpoint(ompi_btl_usnic_module_t *receiver,
|
||||||
uint64_t sender_hashed_orte_name)
|
uint64_t sender_hashed_rte_name)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
uint32_t mynet, peernet;
|
uint32_t mynet, peernet;
|
||||||
@ -125,8 +124,8 @@ ompi_btl_usnic_proc_lookup_endpoint(ompi_btl_usnic_module_t *receiver,
|
|||||||
opal_list_get_end(&mca_btl_usnic_component.usnic_procs);
|
opal_list_get_end(&mca_btl_usnic_component.usnic_procs);
|
||||||
proc = (ompi_btl_usnic_proc_t*)
|
proc = (ompi_btl_usnic_proc_t*)
|
||||||
opal_list_get_next(proc)) {
|
opal_list_get_next(proc)) {
|
||||||
if (orte_util_hash_name(&proc->proc_ompi->proc_name) ==
|
if (ompi_rte_hash_name(&proc->proc_ompi->proc_name) ==
|
||||||
sender_hashed_orte_name) {
|
sender_hashed_rte_name) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -186,9 +185,9 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
|
|||||||
&size);
|
&size);
|
||||||
|
|
||||||
if (OMPI_SUCCESS != rc) {
|
if (OMPI_SUCCESS != rc) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "internal error during init",
|
opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
"<none>", 0,
|
"<none>", 0,
|
||||||
"ompi_modex_recv() failed", __FILE__, __LINE__,
|
"ompi_modex_recv() failed", __FILE__, __LINE__,
|
||||||
opal_strerror(rc));
|
opal_strerror(rc));
|
||||||
@ -200,11 +199,11 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
|
|||||||
|
|
||||||
snprintf(msg, sizeof(msg),
|
snprintf(msg, sizeof(msg),
|
||||||
"sizeof(modex for peer %s data) == %d, expected multiple of %d",
|
"sizeof(modex for peer %s data) == %d, expected multiple of %d",
|
||||||
ORTE_NAME_PRINT(&ompi_proc->proc_name),
|
OMPI_NAME_PRINT(&ompi_proc->proc_name),
|
||||||
(int) size, (int) sizeof(ompi_btl_usnic_addr_t));
|
(int) size, (int) sizeof(ompi_btl_usnic_addr_t));
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "internal error during init",
|
opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
"<none>", 0,
|
"<none>", 0,
|
||||||
"invalid modex data", __FILE__, __LINE__,
|
"invalid modex data", __FILE__, __LINE__,
|
||||||
msg);
|
msg);
|
||||||
@ -223,7 +222,7 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
|
|||||||
proc->proc_modex_claimed = (bool*)
|
proc->proc_modex_claimed = (bool*)
|
||||||
calloc(proc->proc_modex_count, sizeof(bool));
|
calloc(proc->proc_modex_count, sizeof(bool));
|
||||||
if (NULL == proc->proc_modex_claimed) {
|
if (NULL == proc->proc_modex_claimed) {
|
||||||
ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||||
OBJ_RELEASE(proc);
|
OBJ_RELEASE(proc);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@ -231,7 +230,7 @@ static ompi_btl_usnic_proc_t *create_proc(ompi_proc_t *ompi_proc)
|
|||||||
proc->proc_endpoints = (mca_btl_base_endpoint_t**)
|
proc->proc_endpoints = (mca_btl_base_endpoint_t**)
|
||||||
calloc(proc->proc_modex_count, sizeof(mca_btl_base_endpoint_t*));
|
calloc(proc->proc_modex_count, sizeof(mca_btl_base_endpoint_t*));
|
||||||
if (NULL == proc->proc_endpoints) {
|
if (NULL == proc->proc_endpoints) {
|
||||||
ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
|
||||||
OBJ_RELEASE(proc);
|
OBJ_RELEASE(proc);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@ -303,9 +302,9 @@ static int match_modex(ompi_btl_usnic_module_t *module,
|
|||||||
peer_hostname =
|
peer_hostname =
|
||||||
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
|
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
|
||||||
}
|
}
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
|
opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
module->if_mtu,
|
module->if_mtu,
|
||||||
@ -333,7 +332,7 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
|||||||
if (modex_index < 0) {
|
if (modex_index < 0) {
|
||||||
opal_output_verbose(5, USNIC_OUT,
|
opal_output_verbose(5, USNIC_OUT,
|
||||||
"btl:usnic:create_endpoint: did not find usnic modex info for peer %s",
|
"btl:usnic:create_endpoint: did not find usnic modex info for peer %s",
|
||||||
ORTE_NAME_PRINT(&proc->proc_ompi->proc_name));
|
OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
|
||||||
return OMPI_ERR_NOT_FOUND;
|
return OMPI_ERR_NOT_FOUND;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -358,9 +357,9 @@ ompi_btl_usnic_create_endpoint(ompi_btl_usnic_module_t *module,
|
|||||||
|
|
||||||
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
|
endpoint->endpoint_remote_ah = ibv_create_ah(module->pd, &ah_attr);
|
||||||
if (NULL == endpoint->endpoint_remote_ah) {
|
if (NULL == endpoint->endpoint_remote_ah) {
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
opal_show_help("help-mpi-btl-usnic.txt", "ibv API failed",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
module->port_num,
|
module->port_num,
|
||||||
"ibv_create_ah()", __FILE__, __LINE__,
|
"ibv_create_ah()", __FILE__, __LINE__,
|
||||||
|
@ -67,7 +67,7 @@ struct ompi_btl_usnic_module_t;
|
|||||||
|
|
||||||
ompi_btl_usnic_endpoint_t *
|
ompi_btl_usnic_endpoint_t *
|
||||||
ompi_btl_usnic_proc_lookup_endpoint(struct ompi_btl_usnic_module_t *receiver,
|
ompi_btl_usnic_proc_lookup_endpoint(struct ompi_btl_usnic_module_t *receiver,
|
||||||
uint64_t sender_hashed_orte_name);
|
uint64_t sender_hashed_rte_name);
|
||||||
|
|
||||||
int ompi_btl_usnic_proc_match(ompi_proc_t* ompi_proc,
|
int ompi_btl_usnic_proc_match(ompi_proc_t* ompi_proc,
|
||||||
struct ompi_btl_usnic_module_t *module,
|
struct ompi_btl_usnic_module_t *module,
|
||||||
|
@ -53,9 +53,9 @@ lookup_sender(ompi_btl_usnic_module_t *module, ompi_btl_usnic_segment_t *seg)
|
|||||||
int ret;
|
int ret;
|
||||||
ompi_btl_usnic_endpoint_t *sender;
|
ompi_btl_usnic_endpoint_t *sender;
|
||||||
|
|
||||||
/* Use the hashed ORTE process name in the BTL header to uniquely
|
/* Use the hashed RTE process name in the BTL header to uniquely
|
||||||
identify the sending process (using the MAC/hardware address
|
identify the sending process (using the MAC/hardware address
|
||||||
only identifies the sending server -- not the sending ORTE
|
only identifies the sending server -- not the sending RTE
|
||||||
process). */
|
process). */
|
||||||
/* JMS Cesare suggests using a handshake before sending any data
|
/* JMS Cesare suggests using a handshake before sending any data
|
||||||
so that instead of looking up a hash on the btl_header->sender,
|
so that instead of looking up a hash on the btl_header->sender,
|
||||||
|
@ -13,9 +13,9 @@
|
|||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <infiniband/verbs.h>
|
#include <infiniband/verbs.h>
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "opal/util/show_help.h"
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
|
|
||||||
|
#include "ompi/mca/rte/rte.h"
|
||||||
#include "ompi/constants.h"
|
#include "ompi/constants.h"
|
||||||
|
|
||||||
#include "btl_usnic_util.h"
|
#include "btl_usnic_util.h"
|
||||||
@ -24,7 +24,7 @@
|
|||||||
|
|
||||||
void ompi_btl_usnic_exit(void)
|
void ompi_btl_usnic_exit(void)
|
||||||
{
|
{
|
||||||
orte_errmgr.abort(1, NULL);
|
ompi_rte_abort(1, NULL);
|
||||||
|
|
||||||
/* If the error manager returns, wait to be killed */
|
/* If the error manager returns, wait to be killed */
|
||||||
while (1) {
|
while (1) {
|
||||||
@ -181,16 +181,16 @@ uint32_t ompi_btl_usnic_get_ipv4_subnet(uint32_t addrn, uint32_t cidr_len)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Simple utility in a .c file, mainly so that inline functions in .h
|
* Simple utility in a .c file, mainly so that inline functions in .h
|
||||||
* files don't need to include ORTE header files.
|
* files don't need to include RTE header files.
|
||||||
*/
|
*/
|
||||||
void ompi_btl_usnic_util_abort(const char *msg, const char *file, int line,
|
void ompi_btl_usnic_util_abort(const char *msg, const char *file, int line,
|
||||||
int ret)
|
int ret)
|
||||||
{
|
{
|
||||||
orte_show_help("help-mpi-btl-usnic.txt", "internal error after init",
|
opal_show_help("help-mpi-btl-usnic.txt", "internal error after init",
|
||||||
true,
|
true,
|
||||||
orte_process_info.nodename,
|
ompi_process_info.nodename,
|
||||||
msg, file, line, strerror(ret));
|
msg, file, line, strerror(ret));
|
||||||
|
|
||||||
orte_errmgr.abort(ret, NULL);
|
ompi_rte_abort(ret, NULL);
|
||||||
/* Never returns */
|
/* Never returns */
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user