1
1

oob ud: made component more user adaptive; opal outputs were replaced by help messages.

Этот коммит содержится в:
Nadezhda Kogteva 2015-04-28 14:54:40 +03:00
родитель 18b75bd40d
Коммит 01ce58391e
8 изменённых файлов: 246 добавлений и 105 удалений

Просмотреть файл

@ -6,13 +6,114 @@
# Copyright (c) 2004-2006 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2015 Mellanox Technologies, Inc.
# All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
[no-devices-available]
No available RDMA devices found:
Hostname: %s
Please contact your system administrator.
#
[no-devices-error]
Failed to get list of the available RDMA devices:
Hostname: %s
Error: %s
#
[no-devices-usable]
No usable devices found:
Hostname: %s
#
[no-ports-usable]
No usable ports found:
Hostname: %s
#
[reg-mr-failed]
Failed to register memory region (MR):
Hostname: %s
Address: %p
Length: %lu
Error: %s
#
[notify-cq-failed]
Failed to request completion notification on a completion queue (CQ):
Hostname: %s
Error: %s
#
[create-cq-failed]
Failed to create a completion queue (CQ):
Hostname: %s
Requested CQE: %d
Error: %s
Check the CQE attribute.
#
[create-qp-failed]
Failed to create a queue pair (QP):
Hostname: %s
Requested max number of SGEs in a WR in the SQ: %u
Requested max number of SGEs in a WR in the RQ: %u
Requested max number of outstanding WRs in the SQ: %u
Requested max number of outstanding WRs in the RQ: %u
Requested max amount of data (bytes) that can be posted inline to the SQ: %u
Error: %s
Check requested attributes.
#
[poll-cq-failed]
Failed to poll the completion queue (CQ) for work completions:
Hostname: %s
Number of entries: %d
Error: %s
#
[poll-cq-failed-wc]
Failed to poll the completion queue (CQ) for work completions:
Hostname: %s
Number of entries: %d
Entry ID : %d
WC status: %d
#
[post-send-failed]
Failed to post a list of work requests (WRs) to a send queue:
Hostname: %s
Error: %s
#
[post-recv-failed]
Failed to post a list of work requests (WRs) to a receive queue:
Hostname: %s
Error: %s
#
[modify-qp-failed]
Failed to modify the attributes of a queue pair (QP):
Hostname: %s
Mask for QP attributes to be modified: %d
Error: %s
#
[destroy-qp-failed]
Failed to destroy a queue pair (QP):
Hostname: %s
Error: %s
#

Просмотреть файл

@ -21,6 +21,7 @@
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/routed/routed.h"
@ -217,8 +218,9 @@ int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr
IBV_ACCESS_REMOTE_WRITE);
if (NULL == ib_mr[iov_index]) {
/* Ruh-roh */
opal_output (0, "%s oob:ud:register_iov error registering memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
orte_process_info.nodename, iov[iov_index].iov_base,
iov[iov_index].iov_len,strerror(errno));
return ORTE_ERR_OUT_OF_RESOURCE;
}
}
@ -264,8 +266,8 @@ int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE);
if (NULL == *ib_mr_buf) {
opal_output (0, "%s oob:ud:mca_oob_ud_register_buf error registering memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
orte_process_info.nodename, buf, size, strerror(errno));
return ORTE_ERR_OUT_OF_RESOURCE;
}
}

Просмотреть файл

@ -19,9 +19,11 @@
#include "orte/types.h"
#include "opal/types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "oob_ud_component.h"
@ -168,6 +170,47 @@ static int mca_oob_ud_component_register (void)
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_oob_ud_component.ud_timeout_usec);
/* Tunable queue pair (QP) capabilities. Each variable is given its default
 * value and then registered as an MCA parameter so it can be overridden. */

/* Scatter/gather element (SGE) limits per work request; these are the values
 * copied into init_attr.cap.max_send_sge / max_recv_sge at QP creation. */
mca_oob_ud_component.ud_qp_max_send_sge = 1;
(void)mca_base_component_var_register(component, "max_send_sge",
"Requested max number of scatter/gather (s/g) elements in a WR in the SQ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_oob_ud_component.ud_qp_max_send_sge);
mca_oob_ud_component.ud_qp_max_recv_sge = 2;
(void)mca_base_component_var_register(component, "max_recv_sge",
"Requested max number of scatter/gather (s/g) elements in a WR in the RQ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_oob_ud_component.ud_qp_max_recv_sge);
/* Outstanding work request (WR) limits per queue; these are the values
 * used for init_attr.cap.max_send_wr / max_recv_wr at QP creation. */
mca_oob_ud_component.ud_qp_max_send_wr = 4096;
(void)mca_base_component_var_register(component, "max_send_wr",
"Requested max number of outstanding WRs in the SQ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_oob_ud_component.ud_qp_max_send_wr);
mca_oob_ud_component.ud_qp_max_recv_wr = 4096;
(void)mca_base_component_var_register(component, "max_recv_wr",
"Requested max number of outstanding WRs in the RQ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_oob_ud_component.ud_qp_max_recv_wr);
/* Inline send data limit; the default of 0 requests no inline data. */
mca_oob_ud_component.ud_qp_max_inline_data = 0;
(void)mca_base_component_var_register(component, "max_inline_data",
"Requested max number of data (bytes) that can be posted inline to the SQ",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_oob_ud_component.ud_qp_max_inline_data);
return ORTE_SUCCESS;
}
@ -253,7 +296,7 @@ static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);
if (NULL == port) {
opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -298,10 +341,15 @@ static int mca_oob_ud_component_startup(void)
}
devices = ibv_get_device_list (&num_devices);
if (NULL == devices || 0 == num_devices) {
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:component_init no devices found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
if (NULL == devices) {
orte_show_help("help-oob-ud.txt", "no-devices-error", true,
orte_process_info.nodename, strerror(errno));
return ORTE_ERROR;
}
if (0 == num_devices) {
orte_show_help("help-oob-ud.txt", "no-devices-available", true,
orte_process_info.nodename);
return ORTE_ERROR;
}
@ -309,8 +357,7 @@ static int mca_oob_ud_component_startup(void)
mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t);
if (NULL == device) {
opal_output (0, "oob:ud:component_init malloc failure. errno = %d",
errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERROR;
}
@ -330,9 +377,8 @@ static int mca_oob_ud_component_startup(void)
ibv_free_device_list (devices);
if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:component_init no usable devices found.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
orte_show_help("help-oob-ud.txt", "no-devices-usable", true,
orte_process_info.nodename);
return ORTE_ERROR;
}
@ -391,6 +437,8 @@ static int mca_oob_ud_component_startup(void)
}
if (!found_one) {
orte_show_help("help-oob-ud.txt", "no-ports-usable", true,
orte_process_info.nodename);
return ORTE_ERR_NOT_FOUND;
}
@ -550,8 +598,7 @@ static bool mca_oob_ud_component_is_reachable(orte_process_name_t *peer_name)
hop = orte_routed.get_route(peer_name);
if (ORTE_JOBID_INVALID == hop.jobid ||
ORTE_VPID_INVALID == hop.vpid) {
opal_output (0, "%s oob:ud:component_is_reachable peer %s is unreachable",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer_name));
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return false;
}
return true;
@ -621,9 +668,8 @@ static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port)
rc = ibv_req_notify_cq (port->listen_qp.ib_recv_cq, 0);
if (0 != rc) {
opal_output (0, "%s oob:ud:port_recv_start error requesting completion"
"notifications. rc = %d, errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc, errno);
orte_show_help("help-oob-ud.txt", "notify-cq-failed", true,
orte_process_info.nodename, strerror(errno));
return ORTE_ERROR;
}
@ -643,8 +689,7 @@ static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_me
posix_memalign ((void **)&reg_mem->ptr, sysconf(_SC_PAGESIZE), buffer_len);
if (NULL == reg_mem->ptr) {
opal_output (0, "%s oob:ud:alloc_reg_mem malloc failed! errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -652,8 +697,8 @@ static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_me
reg_mem->mr = ibv_reg_mr (pd, reg_mem->ptr, buffer_len, IBV_ACCESS_LOCAL_WRITE);
if (NULL == reg_mem->mr) {
opal_output (0, "%s oob:ud:alloc_reg_mem failed to register memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
orte_process_info.nodename, reg_mem->ptr, buffer_len, strerror(errno));
return ORTE_ERROR;
}

Просмотреть файл

@ -56,6 +56,11 @@ typedef struct {
int ud_max_retries; /**< max number of retries before declaring peer gone */
int ud_timeout_usec; /**< timeout in microsecond between peer retries */
int ud_qp_max_send_sge;
int ud_qp_max_recv_sge;
int ud_qp_max_send_wr;
int ud_qp_max_recv_wr;
int ud_qp_max_inline_data;
} mca_oob_ud_component_t;
ORTE_MODULE_DECLSPEC extern mca_oob_ud_component_t mca_oob_ud_component;

Просмотреть файл

@ -54,8 +54,7 @@ static inline int mca_oob_ud_parse_uri (const char *uri, uint32_t *qp_num,
rc = sscanf (uri, "ud://%u.%hu.%hu", qp_num, lid, port_num);
if (3 != rc) {
opal_output (0, "%s oob:ud:parse_uri error parsing uri. expected 3 elements. got %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
}

Просмотреть файл

@ -15,6 +15,7 @@
#include "oob_ud_component.h"
#include "oob_ud_qp.h"
#include "oob_ud.h"
#include "orte/util/show_help.h"
static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp);
static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp);
@ -49,21 +50,20 @@ int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
init_attr.qp_type = IBV_QPT_UD;
qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, 16384,
int cqe = 16384;
qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, cqe,
port, recv_channel, 0);
if (NULL == qp->ib_recv_cq) {
opal_output(orte_oob_base_framework.framework_output,
"%s oob:ud:qp_init could not create recv completion queue. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "create-cq-failed", true,
orte_process_info.nodename, cqe, strerror(errno));
return ORTE_ERROR;
}
if (false == onecq) {
qp->ib_send_cq = ibv_create_cq (port->device->ib_context, 16384,
qp->ib_send_cq = ibv_create_cq (port->device->ib_context, cqe,
port, send_channel, 0);
if (NULL == qp->ib_send_cq) {
opal_output(orte_oob_base_framework.framework_output,
"%s oob:ud:qp_init could not create send completion queue. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "create-cq-failed", true,
orte_process_info.nodename, cqe, strerror(errno));
return ORTE_ERROR;
}
} else {
@ -78,17 +78,18 @@ int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
"%s oob:ud:qp_init create queue pair for device: device->attr.max_sge = %d, device->attr.max_qp_wr = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), device->attr.max_sge, device->attr.max_qp_wr);
init_attr.cap.max_send_sge = 1;
init_attr.cap.max_recv_sge = 2; /* GRH, data */
init_attr.cap.max_inline_data = 0; /* don't use inline data for now */
init_attr.cap.max_recv_wr = min(4096, device->attr.max_qp_wr);
init_attr.cap.max_send_wr = min(4096, device->attr.max_qp_wr);
init_attr.cap.max_send_sge = mca_oob_ud_component.ud_qp_max_send_sge;
init_attr.cap.max_recv_sge = mca_oob_ud_component.ud_qp_max_recv_sge; /* GRH, data */
init_attr.cap.max_inline_data = mca_oob_ud_component.ud_qp_max_inline_data;
init_attr.cap.max_recv_wr = min(mca_oob_ud_component.ud_qp_max_recv_wr, device->attr.max_qp_wr);
init_attr.cap.max_send_wr = min(mca_oob_ud_component.ud_qp_max_send_wr, device->attr.max_qp_wr);
qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr);
if (NULL == qp->ib_qp) {
opal_output_verbose(1, orte_oob_base_framework.framework_output,
"%s oob:ud:qp_init could not create queue pair. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "create-qp-failed", true,
orte_process_info.nodename, init_attr.cap.max_send_sge, init_attr.cap.max_recv_sge,
init_attr.cap.max_send_wr, init_attr.cap.max_recv_wr, init_attr.cap.max_inline_data,
strerror(errno));
return ORTE_ERROR;
}
/* end: create the UD queue pair */
@ -107,8 +108,8 @@ int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp)
attr.qp_state = IBV_QPS_ERR;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to ERR. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
orte_process_info.nodename, IBV_QP_STATE, strerror(errno));
return ORTE_ERROR;
}
@ -121,8 +122,8 @@ int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp)
attr.qp_state = IBV_QPS_RESET;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RESET. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
orte_process_info.nodename, IBV_QP_STATE, strerror(errno));
return ORTE_ERROR;
}
@ -145,8 +146,8 @@ int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp)
attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to INIT. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
orte_process_info.nodename, attr_mask, strerror(errno));
return ORTE_ERROR;
}
@ -154,8 +155,8 @@ int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp)
attr.qp_state = IBV_QPS_RTR;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RTR. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
orte_process_info.nodename, attr_mask, strerror(errno));
return ORTE_ERROR;
}
@ -166,8 +167,8 @@ int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp)
attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;
if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RTS. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
orte_process_info.nodename, attr_mask, strerror(errno));
return ORTE_ERROR;
}
@ -203,7 +204,8 @@ static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp)
/* destroy qp */
rc = ibv_destroy_qp (qp->ib_qp);
if (0 != rc) {
opal_output (0, "IBV_DESTROY_QP FAILED! rc = %d, errno = %d", rc, errno);
orte_show_help("help-oob-ud.txt", "destroy-qp-failed", true,
orte_process_info.nodename, strerror(errno));
}
}
@ -232,13 +234,14 @@ static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
for (count = 0 ; count < num_completions ; ) {
ret = ibv_poll_cq (qp->ib_send_cq, 1, wc);
if (ret < 0) {
opal_output (0, "%s oob:ud:qp_process_send_completions error polling for completions. "
"errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "poll-cq-failed", true,
orte_process_info.nodename, 1, strerror(errno));
return ORTE_ERROR;
}
for (i = 0 ; i < ret ; ++i) {
if (IBV_WC_SUCCESS != wc[i].status) {
opal_output (0, "wc status = %d", wc[i].status);
orte_show_help("help-oob-ud.txt", "poll-cq-failed-wc", true,
orte_process_info.nodename, 1, i, wc[i].status);
rc = ORTE_ERROR;
}
}
@ -255,23 +258,22 @@ int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr,
rc = ibv_post_send (qp->ib_qp, wr, &bad_wr);
if (0 != rc) {
opal_output (0, "%s oob:ud:qp_post_send ibv_post_send failed. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "post-send-failed", true,
orte_process_info.nodename, strerror(errno));
return ORTE_ERROR;
}
return mca_oob_ud_qp_process_send_completions (qp, num_completions);
}
int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) {
struct ibv_recv_wr *bad_wr;
int rc;
rc = ibv_post_recv (qp->ib_qp, wr, &bad_wr);
if (0 != rc) {
opal_output (0, "%s oob:ud:qp_post_recv failed. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "post-recv-failed", true,
orte_process_info.nodename, strerror(errno));
return ORTE_ERROR;
}
return ORTE_SUCCESS;
@ -287,7 +289,7 @@ int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t *
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s oob:ud:qp_data_aquire error allocating new data qp. error = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
break;
}

Просмотреть файл

@ -16,8 +16,10 @@
#include "orte/types.h"
#include "opal/types.h"
#include "orte/util/name_fns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "math.h"
@ -147,8 +149,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
/* allocate space for memory registers */
recv_req->req_data.iov.mr = (struct ibv_mr **) calloc (recv_req->req_data.iov.count, sizeof (struct ibv_mr *));
if (NULL == recv_req->req_data.iov.mr) {
opal_output (0, "%s oob:ud:recv_try error allocating space for memory registers. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
@ -196,8 +197,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
/* allocate work requests */
recv_req->req_wr.recv = (struct ibv_recv_wr *) calloc (wr_count, sizeof (struct ibv_recv_wr));
if (NULL == recv_req->req_wr.recv) {
opal_output (0, "%s oob:ud:recv_try error allocating work requests. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
@ -207,8 +207,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
/* allocate scatter-gather lists. we need more to hold the grh */
recv_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
if (NULL == recv_req->req_sge) {
opal_output (0, "%s oob:ud:recv_try error allocating sges. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
@ -218,8 +217,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
/* allocate grh buffers */
recv_req->req_grh = (struct ibv_grh *) calloc (wr_count, sizeof (struct ibv_grh));
if (NULL == recv_req->req_grh) {
opal_output (0, "%s oob:ud:recv_try error allocating space for GRHs. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
}
@ -231,8 +229,9 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req)
wr_count * sizeof (struct ibv_grh),
IBV_ACCESS_LOCAL_WRITE);
if (NULL == recv_req->req_grh_mr) {
opal_output (0, "%s oob:ud:recv_try error allocating registering GRH memory. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
orte_process_info.nodename, recv_req->req_grh,
wr_count * sizeof (struct ibv_grh), strerror(errno));
/* could not register memory */
rc = ORTE_ERR_OUT_OF_RESOURCE;
break;
@ -481,8 +480,7 @@ int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer
rc = mca_oob_ud_get_recv_req (msg_hdr->msg_origin, msg_hdr->msg_data.req.tag, &req, msg_hdr->msg_data.req.data_iovec_used);
if (ORTE_SUCCESS != rc) {
opal_output(0, "%s oob:ud:recv_start mca_oob_ud_get_recv_req failed %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(rc);
return rc;
}
@ -496,7 +494,7 @@ int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer
do {
rc = mca_oob_ud_recv_alloc (req);
if (ORTE_SUCCESS != rc) {
opal_output (0, "%s oob:ud:recv_start malloc failed!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
ORTE_ERROR_LOG(rc);
free (req->req_data.iov.uiov);
OBJ_RELEASE(req);
req = NULL;

Просмотреть файл

@ -12,6 +12,7 @@
*
*/
#include "oob_ud_send.h"
#include "orte/mca/errmgr/errmgr.h"
static void mca_oob_ud_send_cb (mca_oob_ud_msg_t *msg, int rc)
{
@ -41,7 +42,7 @@ static int mca_oob_ud_send_self (orte_rml_send_t *msg)
rc = mca_oob_ud_recv_alloc (req);
if (ORTE_SUCCESS != rc) {
opal_output (0, "%s oob:ud:mca_oob_ud_send_self malloc failed!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
ORTE_ERROR_LOG(rc);
if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
free (req->req_data.iov.uiov);
}
@ -80,15 +81,13 @@ static int mca_oob_ud_send_self (orte_rml_send_t *msg)
buffer = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) {
opal_output (0, "%s oob:ud:mca_oob_ud_send_self copy_payload failed %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, &req->req_data.buf.size)))
{
opal_output (0, "%s oob:ud:mca_oob_ud_send_self unload buffer failed %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
free(req->req_data.buf.p);
return rc;
@ -134,16 +133,13 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
hop = orte_routed.get_route(&op->msg->dst);
if (ORTE_JOBID_INVALID == hop.jobid ||
ORTE_VPID_INVALID == hop.vpid) {
opal_output (0, "%s oob:ud:send_nb peer %s is unreachable",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&op->msg->dst));
ORTE_ERROR_LOG(ORTE_ERR_UNREACH);
return ORTE_ERR_UNREACH;
}
rc = mca_oob_ud_peer_lookup (&hop, &peer);
if(ORTE_SUCCESS != rc || NULL == peer) {
opal_output (0, "%s oob:ud:send_nb peer %s not found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&hop));
ORTE_ERROR_LOG((NULL == peer) ? ORTE_ERR_UNREACH : rc);
return (NULL == peer) ? ORTE_ERR_UNREACH : rc;
}
@ -157,7 +153,7 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
send_req = OBJ_NEW(mca_oob_ud_req_t);
if (!send_req) {
opal_output(0, "oob:ud:send_nb malloc failed! errno = %d", errno);
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -188,16 +184,14 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
buffer = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, op->msg->buffer))) {
opal_output (0, "%s oob:ud:send_nb copy_payload failed %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
return rc;
}
if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&send_req->req_data.buf.p, &send_req->req_data.buf.size)))
{
opal_output (0, "%s oob:ud:send_nb unload buffer failed %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
free(send_req->req_data.buf.p);
return rc;
@ -288,7 +282,7 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata)
/* send request */
rc = mca_oob_ud_msg_post_send (req_msg);
if (ORTE_SUCCESS != rc) {
opal_output (0, "msg send failed with status = %d", rc);
ORTE_ERROR_LOG(rc);
break;
}
} while (0);
@ -340,9 +334,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
/* allocate space for memory registers */
send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count, sizeof (struct ibv_mr *));
if (NULL == send_req->req_data.iov.mr) {
opal_output (0, "%s oob:ud:send_try error allocating space for memory registers. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
break;
}
}
@ -386,9 +379,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
if (wr_count && NULL == send_req->req_wr.send) {
send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr));
if (NULL == send_req->req_wr.send) {
opal_output (0, "%s oob:ud:send_try error allocating work requests. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
break;
}
}
@ -397,9 +389,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge));
if (NULL == send_req->req_sge) {
opal_output (0, "%s oob:ud:send_try error allocating sges. errno = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno);
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(rc);
break;
}
}
@ -505,8 +496,7 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
/* send data */
rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0);
if (ORTE_SUCCESS != rc) {
opal_output (0, "%s oob:ud:send_try error posting send!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
ORTE_ERROR_LOG(rc);
break;
}
@ -534,8 +524,7 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) {
}
if (ORTE_SUCCESS != rc) {
opal_output (0, "%s oob:ud:send_try send error! rc = %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
ORTE_ERROR_LOG(rc);
/* damn */
return mca_oob_ud_send_complete (send_req, rc);
}