diff --git a/orte/mca/oob/ud/help-oob-ud.txt b/orte/mca/oob/ud/help-oob-ud.txt index da489ee89a..51a76dfc9b 100644 --- a/orte/mca/oob/ud/help-oob-ud.txt +++ b/orte/mca/oob/ud/help-oob-ud.txt @@ -6,13 +6,114 @@ # Copyright (c) 2004-2006 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# 2015 Mellanox Technologies, Inc. +# All rights reserved. # $COPYRIGHT$ -# +# # Additional copyrights may follow -# +# # $HEADER$ # +[no-devices-available] +No available RDMA devices found: + +Hostname: %s + +Please contact your system administrator. +# +[no-devices-error] +Failed to get list of the available RDMA devices: + +Hostname: %s +Error: %s +# +[no-devices-usable] +No usable devices found: + +Hostname: %s +# +[no-ports-usable] +No usable ports found: + +Hostname: %s +# +[reg-mr-failed] +Failed to register memory region (MR): + +Hostname: %s +Address: %p +Length: %lu +Error: %s +# +[notify-cq-failed] +Failed to request completion notification on a completion queue (CQ): + +Hostname: %s +Error: %s +# +[create-cq-failed] +Failed to create a completion queue (CQ): + +Hostname: %s +Requested CQE: %d +Error: %s + +Check the CQE attribute. +# +[create-qp-failed] +Failed to create a queue pair (QP): + +Hostname: %s +Requested max number of SGEs in a WR in the SQ: %u +Requested max number of SGEs in a WR in the RQ: %u +Requested max number of outstanding WRs in the SQ: %u +Requested max number of outstanding WRs in the RQ: %u +Requested max amount of data (bytes) that can be posted inline to the SQ: %u +Error: %s + +Check requested attributes. 
+# +[poll-cq-failed] +Failed to poll the completion queue (CQ) for work completions: + +Hostname: %s +Number of entries: %d +Error: %s +# +[poll-cq-failed-wc] +A work completion (WC) polled from the completion queue (CQ) reported a failure: + +Hostname: %s +Number of entries: %d +Entry ID : %d +WC status: %d +# +[post-send-failed] +Failed to post a list of work requests (WRs) to a send queue: + +Hostname: %s +Error: %s +# +[post-recv-failed] +Failed to post a list of work requests (WRs) to a receive queue: + +Hostname: %s +Error: %s +# +[modify-qp-failed] +Failed to modify the attributes of a queue pair (QP): + +Hostname: %s +Mask for QP attributes to be modified: %d +Error: %s +# +[destroy-qp-failed] +Failed to destroy a queue pair (QP): + +Hostname: %s +Error: %s +# diff --git a/orte/mca/oob/ud/oob_ud.c b/orte/mca/oob/ud/oob_ud.c index e47fc07df3..247ef4a275 100644 --- a/orte/mca/oob/ud/oob_ud.c +++ b/orte/mca/oob/ud/oob_ud.c @@ -21,6 +21,7 @@ #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/util/proc_info.h" +#include "orte/util/show_help.h" #include "orte/mca/routed/routed.h" @@ -217,8 +218,9 @@ int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr IBV_ACCESS_REMOTE_WRITE); if (NULL == ib_mr[iov_index]) { /* Ruh-roh */ - opal_output (0, "%s oob:ud:register_iov error registering memory. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, + orte_process_info.nodename, iov[iov_index].iov_base, + iov[iov_index].iov_len,strerror(errno)); return ORTE_ERR_OUT_OF_RESOURCE; } } @@ -264,8 +266,8 @@ int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); if (NULL == *ib_mr_buf) { - opal_output (0, "%s oob:ud:mca_oob_ud_register_buf error registering memory. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, + orte_process_info.nodename, buf, size, strerror(errno)); return ORTE_ERR_OUT_OF_RESOURCE; } } diff --git a/orte/mca/oob/ud/oob_ud_component.c b/orte/mca/oob/ud/oob_ud_component.c index f60cc44f90..b97a445522 100644 --- a/orte/mca/oob/ud/oob_ud_component.c +++ b/orte/mca/oob/ud/oob_ud_component.c @@ -19,9 +19,11 @@ #include "orte/types.h" #include "opal/types.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/orte_globals.h" #include "orte/util/name_fns.h" #include "orte/util/proc_info.h" +#include "orte/util/show_help.h" #include "oob_ud_component.h" @@ -168,6 +170,47 @@ static int mca_oob_ud_component_register (void) MCA_BASE_VAR_SCOPE_LOCAL, &mca_oob_ud_component.ud_timeout_usec); + + mca_oob_ud_component.ud_qp_max_send_sge = 1; + (void)mca_base_component_var_register(component, "max_send_sge", + "Requested max number of outstanding WRs in the SQ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_oob_ud_component.ud_qp_max_send_sge); + + mca_oob_ud_component.ud_qp_max_recv_sge = 2; + (void)mca_base_component_var_register(component, "max_recv_sge", + "Requested max number of outstanding WRs in the RQ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_oob_ud_component.ud_qp_max_recv_sge); + + + mca_oob_ud_component.ud_qp_max_send_wr = 4096; + (void)mca_base_component_var_register(component, "max_send_wr", + "Requested max number of scatter/gather (s/g) elements in a WR in the SQ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_oob_ud_component.ud_qp_max_send_wr); + + mca_oob_ud_component.ud_qp_max_recv_wr = 4096; + (void)mca_base_component_var_register(component, "max_recv_wr", + "Requested max number of scatter/gather (s/g) elements in a WR in the RQ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + 
MCA_BASE_VAR_SCOPE_LOCAL, + &mca_oob_ud_component.ud_qp_max_recv_wr); + + mca_oob_ud_component.ud_qp_max_inline_data = 0; + (void)mca_base_component_var_register(component, "max_inline_data", + "Requested max number of data (bytes) that can be posted inline to the SQ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, + &mca_oob_ud_component.ud_qp_max_inline_data); return ORTE_SUCCESS; } @@ -253,7 +296,7 @@ static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device, mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t); if (NULL == port) { - opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } @@ -298,10 +341,15 @@ static int mca_oob_ud_component_startup(void) } devices = ibv_get_device_list (&num_devices); - if (NULL == devices || 0 == num_devices) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:component_init no devices found", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + if (NULL == devices) { + orte_show_help("help-oob-ud.txt", "no-devices-error", true, + orte_process_info.nodename, strerror(errno)); + return ORTE_ERROR; + } + + if (0 == num_devices) { + orte_show_help("help-oob-ud.txt", "no-devices-available", true, + orte_process_info.nodename); return ORTE_ERROR; } @@ -309,8 +357,7 @@ static int mca_oob_ud_component_startup(void) mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t); if (NULL == device) { - opal_output (0, "oob:ud:component_init malloc failure. 
errno = %d", - errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERROR; } @@ -330,9 +377,8 @@ static int mca_oob_ud_component_startup(void) ibv_free_device_list (devices); if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) { - opal_output_verbose(5, orte_oob_base_framework.framework_output, - "%s oob:ud:component_init no usable devices found.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + orte_show_help("help-oob-ud.txt", "no-devices-usable", true, + orte_process_info.nodename); return ORTE_ERROR; } @@ -391,6 +437,8 @@ static int mca_oob_ud_component_startup(void) } if (!found_one) { + orte_show_help("help-oob-ud.txt", "no-ports-usable", true, + orte_process_info.nodename); return ORTE_ERR_NOT_FOUND; } @@ -550,8 +598,7 @@ static bool mca_oob_ud_component_is_reachable(orte_process_name_t *peer_name) hop = orte_routed.get_route(peer_name); if (ORTE_JOBID_INVALID == hop.jobid || ORTE_VPID_INVALID == hop.vpid) { - opal_output (0, "%s oob:ud:component_is_reachable peer %s is unreachable", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer_name)); + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); return false; } return true; @@ -621,9 +668,8 @@ static inline int mca_oob_ud_port_recv_start (mca_oob_ud_port_t *port) rc = ibv_req_notify_cq (port->listen_qp.ib_recv_cq, 0); if (0 != rc) { - opal_output (0, "%s oob:ud:port_recv_start error requesting completion" - "notifications. rc = %d, errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc, errno); + orte_show_help("help-oob-ud.txt", "notify-cq-failed", true, + orte_process_info.nodename, strerror(errno)); return ORTE_ERROR; } @@ -643,8 +689,7 @@ static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_me posix_memalign ((void **)®_mem->ptr, sysconf(_SC_PAGESIZE), buffer_len); if (NULL == reg_mem->ptr) { - opal_output (0, "%s oob:ud:alloc_reg_mem malloc failed! 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } @@ -652,8 +697,8 @@ static inline int mca_oob_ud_alloc_reg_mem (struct ibv_pd *pd, mca_oob_ud_reg_me reg_mem->mr = ibv_reg_mr (pd, reg_mem->ptr, buffer_len, IBV_ACCESS_LOCAL_WRITE); if (NULL == reg_mem->mr) { - opal_output (0, "%s oob:ud:alloc_reg_mem failed to register memory. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, + orte_process_info.nodename, reg_mem->ptr, buffer_len, strerror(errno)); return ORTE_ERROR; } diff --git a/orte/mca/oob/ud/oob_ud_component.h b/orte/mca/oob/ud/oob_ud_component.h index a0db4aaf69..2c6aac1469 100644 --- a/orte/mca/oob/ud/oob_ud_component.h +++ b/orte/mca/oob/ud/oob_ud_component.h @@ -56,6 +56,11 @@ typedef struct { int ud_max_retries; /**< max number of retries before declaring peer gone */ int ud_timeout_usec; /**< timeout in microsecond between peer retries */ + int ud_qp_max_send_sge; + int ud_qp_max_recv_sge; + int ud_qp_max_send_wr; + int ud_qp_max_recv_wr; + int ud_qp_max_inline_data; } mca_oob_ud_component_t; ORTE_MODULE_DECLSPEC extern mca_oob_ud_component_t mca_oob_ud_component; diff --git a/orte/mca/oob/ud/oob_ud_peer.c b/orte/mca/oob/ud/oob_ud_peer.c index 04879ce73d..bda4b9a1e0 100644 --- a/orte/mca/oob/ud/oob_ud_peer.c +++ b/orte/mca/oob/ud/oob_ud_peer.c @@ -54,8 +54,7 @@ static inline int mca_oob_ud_parse_uri (const char *uri, uint32_t *qp_num, rc = sscanf (uri, "ud://%u.%hu.%hu", qp_num, lid, port_num); if (3 != rc) { - opal_output (0, "%s oob:ud:parse_uri error parsing uri. expected 3 elements. 
got %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } diff --git a/orte/mca/oob/ud/oob_ud_qp.c b/orte/mca/oob/ud/oob_ud_qp.c index 9700265f27..4718a3b569 100644 --- a/orte/mca/oob/ud/oob_ud_qp.c +++ b/orte/mca/oob/ud/oob_ud_qp.c @@ -15,6 +15,7 @@ #include "oob_ud_component.h" #include "oob_ud_qp.h" #include "oob_ud.h" +#include "orte/util/show_help.h" static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp); static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp); @@ -49,21 +50,20 @@ int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port, init_attr.qp_type = IBV_QPT_UD; - qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, 16384, + int cqe = 16384; + qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, cqe, port, recv_channel, 0); if (NULL == qp->ib_recv_cq) { - opal_output(orte_oob_base_framework.framework_output, - "%s oob:ud:qp_init could not create recv completion queue. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "create-cq-failed", true, + orte_process_info.nodename, cqe, strerror(errno)); return ORTE_ERROR; } if (false == onecq) { - qp->ib_send_cq = ibv_create_cq (port->device->ib_context, 16384, + qp->ib_send_cq = ibv_create_cq (port->device->ib_context, cqe, port, send_channel, 0); if (NULL == qp->ib_send_cq) { - opal_output(orte_oob_base_framework.framework_output, - "%s oob:ud:qp_init could not create send completion queue. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "create-cq-failed", true, + orte_process_info.nodename, cqe, strerror(errno)); return ORTE_ERROR; } } else { @@ -78,17 +78,18 @@ int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port, "%s oob:ud:qp_init create queue pair for device: device->attr.max_sge = %d, device->attr.max_qp_wr = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), device->attr.max_sge, device->attr.max_qp_wr); - init_attr.cap.max_send_sge = 1; - init_attr.cap.max_recv_sge = 2; /* GRH, data */ - init_attr.cap.max_inline_data = 0; /* don't use inline data for now */ - init_attr.cap.max_recv_wr = min(4096, device->attr.max_qp_wr); - init_attr.cap.max_send_wr = min(4096, device->attr.max_qp_wr); + init_attr.cap.max_send_sge = mca_oob_ud_component.ud_qp_max_send_sge; + init_attr.cap.max_recv_sge = mca_oob_ud_component.ud_qp_max_recv_sge; /* GRH, data */ + init_attr.cap.max_inline_data = mca_oob_ud_component.ud_qp_max_inline_data; + init_attr.cap.max_recv_wr = min(mca_oob_ud_component.ud_qp_max_recv_wr, device->attr.max_qp_wr); + init_attr.cap.max_send_wr = min(mca_oob_ud_component.ud_qp_max_send_wr, device->attr.max_qp_wr); qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr); if (NULL == qp->ib_qp) { - opal_output_verbose(1, orte_oob_base_framework.framework_output, - "%s oob:ud:qp_init could not create queue pair. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "create-qp-failed", true, + orte_process_info.nodename, init_attr.cap.max_send_sge, init_attr.cap.max_recv_sge, + init_attr.cap.max_send_wr, init_attr.cap.max_recv_wr, init_attr.cap.max_inline_data, + strerror(errno)); return ORTE_ERROR; } /* end: create the UD queue pair */ @@ -107,8 +108,8 @@ int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp) attr.qp_state = IBV_QPS_ERR; if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) { - opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to ERR. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, + orte_process_info.nodename, IBV_QP_STATE, strerror(errno)); return ORTE_ERROR; } @@ -121,8 +122,8 @@ int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp) attr.qp_state = IBV_QPS_RESET; if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) { - opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RESET. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, + orte_process_info.nodename, IBV_QP_STATE, strerror(errno)); return ORTE_ERROR; } @@ -145,8 +146,8 @@ int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp) attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY; if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) { - opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to INIT. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, + orte_process_info.nodename, attr_mask, strerror(errno)); return ORTE_ERROR; } @@ -154,8 +155,8 @@ int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp) attr.qp_state = IBV_QPS_RTR; if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) { - opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RTR. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, + orte_process_info.nodename, attr_mask, strerror(errno)); return ORTE_ERROR; } @@ -166,8 +167,8 @@ int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp) attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN; if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) { - opal_output(0, "%s oob:ud:qp_to_reset error modifying qp to RTS. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "modify-qp-failed", true, + orte_process_info.nodename, attr_mask, strerror(errno)); return ORTE_ERROR; } @@ -203,7 +204,8 @@ static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp) /* destroy qp */ rc = ibv_destroy_qp (qp->ib_qp); if (0 != rc) { - opal_output (0, "IBV_DESTROY_QP FAILED! rc = %d, errno = %d", rc, errno); + orte_show_help("help-oob-ud.txt", "destroy-qp-failed", true, + orte_process_info.nodename, strerror(errno)); } } @@ -232,13 +234,14 @@ static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp, for (count = 0 ; count < num_completions ; ) { ret = ibv_poll_cq (qp->ib_send_cq, 1, wc); if (ret < 0) { - opal_output (0, "%s oob:ud:qp_process_send_completions error polling for completions. " - "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "poll-cq-failed", true, + orte_process_info.nodename, 1, strerror(errno)); return ORTE_ERROR; } for (i = 0 ; i < ret ; ++i) { if (IBV_WC_SUCCESS != wc[i].status) { - opal_output (0, "wc status = %d", wc[i].status); + orte_show_help("help-oob-ud.txt", "poll-cq-failed-wc", true, + orte_process_info.nodename, 1, i, wc[i].status); rc = ORTE_ERROR; } } @@ -255,23 +258,22 @@ int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr, rc = ibv_post_send (qp->ib_qp, wr, &bad_wr); if (0 != rc) { - opal_output (0, "%s oob:ud:qp_post_send ibv_post_send failed. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "post-send-failed", true, + orte_process_info.nodename, strerror(errno)); return ORTE_ERROR; } return mca_oob_ud_qp_process_send_completions (qp, num_completions); } int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) { - + struct ibv_recv_wr *bad_wr; int rc; rc = ibv_post_recv (qp->ib_qp, wr, &bad_wr); if (0 != rc) { - opal_output (0, "%s oob:ud:qp_post_recv failed. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); - + orte_show_help("help-oob-ud.txt", "post-recv-failed", true, + orte_process_info.nodename, strerror(errno)); return ORTE_ERROR; } return ORTE_SUCCESS; @@ -287,7 +289,7 @@ int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t * opal_output_verbose(5, orte_oob_base_framework.framework_output, "%s oob:ud:qp_data_aquire error allocating new data qp. error = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); - rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE; + rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE; break; } diff --git a/orte/mca/oob/ud/oob_ud_recv.c b/orte/mca/oob/ud/oob_ud_recv.c index 7608488311..02b76c920f 100644 --- a/orte/mca/oob/ud/oob_ud_recv.c +++ b/orte/mca/oob/ud/oob_ud_recv.c @@ -16,8 +16,10 @@ #include "orte/types.h" #include "opal/types.h" -#include "orte/util/name_fns.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/runtime/orte_globals.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" #include "math.h" @@ -147,8 +149,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req) /* allocate space for memory registers */ recv_req->req_data.iov.mr = (struct ibv_mr **) calloc (recv_req->req_data.iov.count, sizeof (struct ibv_mr *)); if (NULL == recv_req->req_data.iov.mr) { - opal_output (0, "%s oob:ud:recv_try error allocating space for memory registers. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; break; } @@ -196,8 +197,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req) /* allocate work requests */ recv_req->req_wr.recv = (struct ibv_recv_wr *) calloc (wr_count, sizeof (struct ibv_recv_wr)); if (NULL == recv_req->req_wr.recv) { - opal_output (0, "%s oob:ud:recv_try error allocating work requests. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; break; } @@ -207,8 +207,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req) /* allocate scatter-gather lists. we need more to hold the grh */ recv_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge)); if (NULL == recv_req->req_sge) { - opal_output (0, "%s oob:ud:recv_try error allocating sges. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; break; } @@ -218,8 +217,7 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req) /* allocate grh buffers */ recv_req->req_grh = (struct ibv_grh *) calloc (wr_count, sizeof (struct ibv_grh)); if (NULL == recv_req->req_grh) { - opal_output (0, "%s oob:ud:recv_try error allocating space for GRHs. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; break; } @@ -231,8 +229,9 @@ int mca_oob_ud_recv_try (mca_oob_ud_req_t *recv_req) wr_count * sizeof (struct ibv_grh), IBV_ACCESS_LOCAL_WRITE); if (NULL == recv_req->req_grh_mr) { - opal_output (0, "%s oob:ud:recv_try error allocating registering GRH memory. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); + orte_show_help("help-oob-ud.txt", "reg-mr-failed", true, + orte_process_info.nodename, recv_req->req_grh, + wr_count * sizeof (struct ibv_grh), strerror(errno)); /* could not register memory */ rc = ORTE_ERR_OUT_OF_RESOURCE; break; @@ -481,8 +480,7 @@ int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer rc = mca_oob_ud_get_recv_req (msg_hdr->msg_origin, msg_hdr->msg_data.req.tag, &req, msg_hdr->msg_data.req.data_iovec_used); if (ORTE_SUCCESS != rc) { - opal_output(0, "%s oob:ud:recv_start mca_oob_ud_get_recv_req failed %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(rc); return rc; } @@ -496,7 +494,7 @@ int mca_oob_ud_recv_match_send (mca_oob_ud_port_t *port, mca_oob_ud_peer_t *peer do { rc = mca_oob_ud_recv_alloc (req); if (ORTE_SUCCESS != rc) { - opal_output (0, "%s oob:ud:recv_start malloc failed!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + ORTE_ERROR_LOG(rc); free (req->req_data.iov.uiov); OBJ_RELEASE(req); req = NULL; diff --git a/orte/mca/oob/ud/oob_ud_send.c b/orte/mca/oob/ud/oob_ud_send.c index 748fcfe36c..13f4a6bfb0 100644 --- a/orte/mca/oob/ud/oob_ud_send.c +++ b/orte/mca/oob/ud/oob_ud_send.c @@ -12,6 +12,7 @@ * */ #include "oob_ud_send.h" +#include "orte/mca/errmgr/errmgr.h" static void mca_oob_ud_send_cb (mca_oob_ud_msg_t *msg, int rc) { @@ -41,7 +42,7 @@ static int mca_oob_ud_send_self (orte_rml_send_t *msg) rc = mca_oob_ud_recv_alloc (req); if (ORTE_SUCCESS != rc) { - opal_output (0, "%s oob:ud:mca_oob_ud_send_self malloc failed!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + ORTE_ERROR_LOG(rc); if (MCA_OOB_UD_REQ_IOV == req->req_data_type) { free (req->req_data.iov.uiov); } @@ -80,15 +81,13 @@ static int mca_oob_ud_send_self (orte_rml_send_t *msg) buffer = OBJ_NEW(opal_buffer_t); if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, msg->buffer))) { - opal_output (0, "%s oob:ud:mca_oob_ud_send_self copy_payload failed %d", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); return rc; } if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&req->req_data.buf.p, &req->req_data.buf.size))) { - opal_output (0, "%s oob:ud:mca_oob_ud_send_self unload buffer failed %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); free(req->req_data.buf.p); return rc; @@ -134,16 +133,13 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata) hop = orte_routed.get_route(&op->msg->dst); if (ORTE_JOBID_INVALID == hop.jobid || ORTE_VPID_INVALID == hop.vpid) { - opal_output (0, "%s oob:ud:send_nb peer %s is unreachable", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&op->msg->dst)); + ORTE_ERROR_LOG(ORTE_ERR_UNREACH); return ORTE_ERR_UNREACH; } rc = mca_oob_ud_peer_lookup (&hop, &peer); if(ORTE_SUCCESS != rc || NULL == peer) { - opal_output (0, "%s oob:ud:send_nb peer %s not found", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&hop)); + ORTE_ERROR_LOG((NULL == peer) ? ORTE_ERR_UNREACH : rc); return (NULL == peer) ? ORTE_ERR_UNREACH : rc; } @@ -157,7 +153,7 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata) send_req = OBJ_NEW(mca_oob_ud_req_t); if (!send_req) { - opal_output(0, "oob:ud:send_nb malloc failed! 
errno = %d", errno); + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } @@ -188,16 +184,14 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata) buffer = OBJ_NEW(opal_buffer_t); if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(buffer, op->msg->buffer))) { - opal_output (0, "%s oob:ud:send_nb copy_payload failed %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); return rc; } if (OPAL_SUCCESS != (rc = opal_dss.unload(buffer, (void **)&send_req->req_data.buf.p, &send_req->req_data.buf.size))) { - opal_output (0, "%s oob:ud:send_nb unload buffer failed %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); free(send_req->req_data.buf.p); return rc; @@ -288,7 +282,7 @@ int mca_oob_ud_process_send_nb(int fd, short args, void *cbdata) /* send request */ rc = mca_oob_ud_msg_post_send (req_msg); if (ORTE_SUCCESS != rc) { - opal_output (0, "msg send failed with status = %d", rc); + ORTE_ERROR_LOG(rc); break; } } while (0); @@ -340,9 +334,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) { /* allocate space for memory registers */ send_req->req_data.iov.mr = (struct ibv_mr **) calloc (send_req->req_data.iov.count, sizeof (struct ibv_mr *)); if (NULL == send_req->req_data.iov.mr) { - opal_output (0, "%s oob:ud:send_try error allocating space for memory registers. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); break; } } @@ -386,9 +379,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) { if (wr_count && NULL == send_req->req_wr.send) { send_req->req_wr.send = (struct ibv_send_wr *) calloc (wr_count, sizeof (struct ibv_send_wr)); if (NULL == send_req->req_wr.send) { - opal_output (0, "%s oob:ud:send_try error allocating work requests. 
errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); break; } } @@ -397,9 +389,8 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) { send_req->req_sge = (struct ibv_sge *) calloc (sge_count, sizeof (struct ibv_sge)); if (NULL == send_req->req_sge) { - opal_output (0, "%s oob:ud:send_try error allocating sges. errno = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno); rc = ORTE_ERR_OUT_OF_RESOURCE; + ORTE_ERROR_LOG(rc); break; } } @@ -505,8 +496,7 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) { /* send data */ rc = mca_oob_ud_qp_post_send (send_req->req_qp, send_req->req_wr.send, 0); if (ORTE_SUCCESS != rc) { - opal_output (0, "%s oob:ud:send_try error posting send!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + ORTE_ERROR_LOG(rc); break; } @@ -534,8 +524,7 @@ int mca_oob_ud_send_try (mca_oob_ud_req_t *send_req) { } if (ORTE_SUCCESS != rc) { - opal_output (0, "%s oob:ud:send_try send error! rc = %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc); + ORTE_ERROR_LOG(rc); /* damn */ return mca_oob_ud_send_complete (send_req, rc); }