Merge pull request #1126 from alex-mikheev/topic/ikrit_err_fix
Topic/ikrit err fix
Этот коммит содержится в:
Коммит
8ec5c99412
@ -22,6 +22,7 @@
|
||||
|
||||
#include "oshmem_config.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/mca/memchecker/base/base.h"
|
||||
#include "orte/include/orte/types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "oshmem/mca/spml/ikrit/spml_ikrit.h"
|
||||
@ -41,6 +42,19 @@
|
||||
#define SPML_IKRIT_PUT_DEBUG 0
|
||||
#endif
|
||||
|
||||
/*
 * Post an MXM send request; on failure log, abort, and return from the
 * enclosing function.
 *
 * sreq must be an lvalue send request: the macro takes its address for
 * mxm_req_send() and, on the error path only, reads its opcode field.
 *
 * If mxm_req_send() returns anything other than MXM_OK, the error is
 * reported through SPML_ERROR, oshmem_shmem_abort(-1) is called, and the
 * ENCLOSING function executes `return OSHMEM_ERROR;`. The macro can
 * therefore only be used inside functions whose return type accepts
 * OSHMEM_ERROR.
 *
 * The argument is parenthesized at every use so that expressions such as
 * *ptr expand correctly, and the status local has a macro-specific name so
 * it does not shadow a caller variable called err.
 */
#define SPML_IKRIT_MXM_POST_SEND(sreq)                                     \
    do {                                                                   \
        mxm_error_t spml_ikrit_post_err_ = mxm_req_send(&(sreq));          \
        if (MXM_OK != spml_ikrit_post_err_) {                              \
            SPML_ERROR("mxm_req_send (op=%d) failed: %s - aborting",       \
                       (sreq).opcode,                                      \
                       mxm_error_string(spml_ikrit_post_err_));            \
            oshmem_shmem_abort(-1);                                        \
            return OSHMEM_ERROR;                                           \
        }                                                                  \
    } while (0)
|
||||
|
||||
/*
 * Header record holding a single 64-bit value, va.
 * NOTE(review): the name suggests this is the header of an active message
 * and that va is a virtual address -- confirm against the users of this type.
 */
struct spml_ikrit_am_hdr {
    uint64_t va;
};
typedef struct spml_ikrit_am_hdr spml_ikrit_am_hdr_t;
|
||||
@ -88,6 +102,7 @@ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) {
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req)
|
||||
{
|
||||
while (!mxm_req_test(req))
|
||||
@ -99,11 +114,12 @@ static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request)
|
||||
mca_spml_ikrit_put_request_t *put_req =
|
||||
*(mca_spml_ikrit_put_request_t **) request;
|
||||
|
||||
assert(false == put_req->req_put.req_base.req_free_called);
|
||||
OPAL_THREAD_LOCK(&oshmem_request_lock);
|
||||
assert(false == put_req->req_put.req_base.req_free_called);
|
||||
put_req->req_put.req_base.req_free_called = true;
|
||||
opal_free_list_return (&mca_spml_base_put_requests,
|
||||
(opal_free_list_item_t*)put_req);
|
||||
opal_memchecker_base_mem_noaccess(put_req, sizeof(*put_req));
|
||||
OPAL_THREAD_UNLOCK(&oshmem_request_lock);
|
||||
|
||||
*request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
|
||||
@ -147,11 +163,12 @@ static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request)
|
||||
mca_spml_ikrit_get_request_t *get_req =
|
||||
*(mca_spml_ikrit_get_request_t **) request;
|
||||
|
||||
assert(false == get_req->req_get.req_base.req_free_called);
|
||||
OPAL_THREAD_LOCK(&oshmem_request_lock);
|
||||
assert(false == get_req->req_get.req_base.req_free_called);
|
||||
get_req->req_get.req_base.req_free_called = true;
|
||||
opal_free_list_return (&mca_spml_base_get_requests,
|
||||
(opal_free_list_item_t*)get_req);
|
||||
opal_memchecker_base_mem_noaccess(get_req, sizeof(*get_req));
|
||||
OPAL_THREAD_UNLOCK(&oshmem_request_lock);
|
||||
|
||||
*request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
|
||||
@ -167,7 +184,7 @@ static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request,
|
||||
|
||||
static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req)
|
||||
{
|
||||
req->req_get.req_base.req_type = MCA_SPML_REQUEST_PUT;
|
||||
req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET;
|
||||
req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free;
|
||||
req->req_get.req_base.req_oshmem.req_cancel =
|
||||
mca_spml_ikrit_get_request_cancel;
|
||||
@ -245,6 +262,10 @@ static inline mca_spml_ikrit_put_request_t *alloc_put_req(void)
|
||||
item = opal_free_list_wait (&mca_spml_base_put_requests);
|
||||
|
||||
req = (mca_spml_ikrit_put_request_t *) item;
|
||||
opal_memchecker_base_mem_undefined(req, sizeof(*req));
|
||||
opal_memchecker_base_mem_defined(&req->req_put.req_base,
|
||||
sizeof(req->req_put.req_base));
|
||||
|
||||
req->req_put.req_base.req_free_called = false;
|
||||
req->req_put.req_base.req_oshmem.req_complete = false;
|
||||
|
||||
@ -259,6 +280,10 @@ static inline mca_spml_ikrit_get_request_t *alloc_get_req(void)
|
||||
item = opal_free_list_wait (&mca_spml_base_get_requests);
|
||||
|
||||
req = (mca_spml_ikrit_get_request_t *) item;
|
||||
opal_memchecker_base_mem_undefined(req, sizeof(*req));
|
||||
opal_memchecker_base_mem_defined(&req->req_get.req_base,
|
||||
sizeof(req->req_get.req_base));
|
||||
|
||||
req->req_get.req_base.req_free_called = false;
|
||||
req->req_get.req_base.req_oshmem.req_complete = false;
|
||||
|
||||
@ -363,19 +388,15 @@ int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
|
||||
for (n = 0; n < nprocs; n++) {
|
||||
i = (my_rank + n) % nprocs;
|
||||
if (mca_spml_ikrit.mxm_peers[i]->mxm_conn) {
|
||||
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn);
|
||||
}
|
||||
if (mca_spml_ikrit.hw_rdma_channel && mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn) {
|
||||
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn);
|
||||
if (mca_spml_ikrit.hw_rdma_channel) {
|
||||
assert(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i]->mxm_conn);
|
||||
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn);
|
||||
}
|
||||
destroy_ptl_idx(i);
|
||||
if (mca_spml_ikrit.mxm_peers[i]) {
|
||||
OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
|
||||
}
|
||||
OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
|
||||
}
|
||||
if (mca_spml_ikrit.mxm_peers)
|
||||
free(mca_spml_ikrit.mxm_peers);
|
||||
free(mca_spml_ikrit.mxm_peers);
|
||||
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
@ -407,20 +428,18 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
}
|
||||
memset(conn_reqs, 0x0, sizeof(mxm_conn_req_t));
|
||||
#endif
|
||||
ep_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
|
||||
if (NULL == ep_info) {
|
||||
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
|
||||
goto bail;
|
||||
}
|
||||
memset(ep_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
|
||||
if (mca_spml_ikrit.hw_rdma_channel) {
|
||||
ep_hw_rdma_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
|
||||
if (NULL == ep_hw_rdma_info) {
|
||||
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
|
||||
goto bail;
|
||||
}
|
||||
memset(ep_hw_rdma_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
}
|
||||
|
||||
mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs
|
||||
@ -529,8 +548,10 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
/* Save returned connections */
|
||||
for (i = 0; i < nprocs; ++i) {
|
||||
mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn;
|
||||
if (OSHMEM_SUCCESS != create_ptl_idx(i))
|
||||
if (OSHMEM_SUCCESS != create_ptl_idx(i)) {
|
||||
rc = OSHMEM_ERR_CONNECTION_FAILED;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]);
|
||||
}
|
||||
@ -559,7 +580,7 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
continue;
|
||||
}
|
||||
if (procs[i] == proc_self)
|
||||
continue;
|
||||
continue;
|
||||
|
||||
/* use zcopy for put/get via sysv shared memory */
|
||||
procs[i]->transport_ids[0] = MXM_PTL_SHM;
|
||||
@ -629,7 +650,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr,
|
||||
#if MXM_API < MXM_VERSION(2,0)
|
||||
mkeys[i].len = 0;
|
||||
#else
|
||||
if (mca_spml_ikrit.ud_only && !mca_spml_ikrit.hw_rdma_channel) {
|
||||
if (mca_spml_ikrit.ud_only) {
|
||||
mkeys[i].len = 0;
|
||||
break;
|
||||
}
|
||||
@ -848,6 +869,7 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr,
|
||||
int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
|
||||
{
|
||||
mxm_send_req_t sreq;
|
||||
mxm_error_t err;
|
||||
|
||||
if (0 >= size) {
|
||||
return OSHMEM_SUCCESS;
|
||||
@ -873,10 +895,9 @@ int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
|
||||
#endif
|
||||
sreq.base.completed_cb = NULL;
|
||||
|
||||
mxm_req_send(&sreq);
|
||||
opal_progress();
|
||||
mca_spml_irkit_req_wait(&sreq.base);
|
||||
SPML_IKRIT_MXM_POST_SEND(sreq);
|
||||
|
||||
mca_spml_irkit_req_wait(&sreq.base);
|
||||
if (MXM_OK != sreq.base.error) {
|
||||
SPML_ERROR("get request failed: %s - aborting",
|
||||
mxm_error_string(sreq.base.error));
|
||||
@ -935,14 +956,8 @@ int mca_spml_ikrit_get_async(void *src_addr,
|
||||
get_req->mxm_req.base.context = get_req;
|
||||
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1);
|
||||
|
||||
mxm_req_send(&get_req->mxm_req);
|
||||
SPML_IKRIT_MXM_POST_SEND(get_req->mxm_req);
|
||||
|
||||
if (MXM_OK != get_req->mxm_req.base.error) {
|
||||
SPML_ERROR("get request failed: %s - aborting",
|
||||
mxm_error_string(get_req->mxm_req.base.error));
|
||||
oshmem_shmem_abort(-1);
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
@ -989,7 +1004,7 @@ static int mca_spml_ikrit_mxm_fence(int dst)
|
||||
fence_req->mxm_req.base.context = fence_req;
|
||||
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, 1);
|
||||
|
||||
mxm_req_send(&fence_req->mxm_req);
|
||||
SPML_IKRIT_MXM_POST_SEND(fence_req->mxm_req);
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1162,15 +1177,8 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr,
|
||||
|
||||
mca_spml_ikrit.mxm_peers[dst]->n_active_puts++;
|
||||
|
||||
mxm_req_send(&put_req->mxm_req);
|
||||
SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req);
|
||||
|
||||
if (MXM_OK != put_req->mxm_req.base.error) {
|
||||
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1);
|
||||
SPML_ERROR("put request %p failed: %s - aborting",
|
||||
(void*)put_req, mxm_error_string(put_req->mxm_req.base.error));
|
||||
oshmem_shmem_abort(-1);
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
if (need_progress)
|
||||
mxm_progress(mca_spml_ikrit.mxm_context);
|
||||
|
||||
@ -1269,13 +1277,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr,
|
||||
mca_spml_ikrit.mxm_peers[dst]->need_fence = 1;
|
||||
}
|
||||
|
||||
mxm_req_send(&mxm_req);
|
||||
if (MXM_OK != mxm_req.base.error) {
|
||||
SPML_ERROR("put request failed: %s(%d) - aborting",
|
||||
mxm_error_string(mxm_req.base.error), mxm_req.base.error);
|
||||
oshmem_shmem_abort(-1);
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
SPML_IKRIT_MXM_POST_SEND(mxm_req);
|
||||
|
||||
wait.req = &mxm_req.base;
|
||||
wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
|
||||
@ -1432,7 +1434,8 @@ int mca_spml_ikrit_send(void* buf,
|
||||
req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size;
|
||||
req.base.data.buffer.memh = NULL;
|
||||
|
||||
mxm_req_send(&req);
|
||||
SPML_IKRIT_MXM_POST_SEND(req);
|
||||
|
||||
mca_spml_irkit_req_wait(&req.base);
|
||||
if (req.base.error != MXM_OK) {
|
||||
return OSHMEM_ERROR;
|
||||
|
@ -106,11 +106,11 @@ static inline int set_mxm_tls()
|
||||
|
||||
tls = getenv("MXM_TLS");
|
||||
if (NULL == tls) {
|
||||
setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1);
|
||||
return OSHMEM_SUCCESS;
|
||||
opal_setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1, &environ);
|
||||
return check_mxm_tls("MXM_OSHMEM_TLS");
|
||||
}
|
||||
if (OSHMEM_SUCCESS == check_mxm_tls("MXM_TLS")) {
|
||||
setenv("MXM_OSHMEM_TLS", tls, 1);
|
||||
opal_setenv("MXM_OSHMEM_TLS", tls, 1, &environ);
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
return OSHMEM_ERROR;
|
||||
@ -120,12 +120,14 @@ static inline int check_mxm_hw_tls(char *v, char *tls)
|
||||
{
|
||||
if (v && tls) {
|
||||
if ((0 == strcmp(tls, "rc") || 0 == strcmp(tls, "dc"))) {
|
||||
mca_spml_ikrit.ud_only = 0;
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
if (strstr(tls, "ud") &&
|
||||
(NULL == strstr(tls, "rc") && NULL == strstr(tls, "dc") &&
|
||||
NULL == strstr(tls, "shm"))) {
|
||||
mca_spml_ikrit.ud_only = 1;
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -140,8 +142,10 @@ static inline int set_mxm_hw_rdma_tls()
|
||||
if (!mca_spml_ikrit.hw_rdma_channel) {
|
||||
return check_mxm_hw_tls("MXM_OSHMEM_TLS", getenv("MXM_OSHMEM_TLS"));
|
||||
}
|
||||
setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0);
|
||||
setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0);
|
||||
opal_setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0, &environ);
|
||||
opal_setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0, &environ);
|
||||
SPML_VERBOSE(5, "Additional communication channel is enabled. Transports are: %s",
|
||||
getenv("MXM_OSHMEM_HW_RDMA_TLS"));
|
||||
|
||||
return check_mxm_hw_tls("MXM_OSHMEM_HW_RDMA_TLS",
|
||||
getenv("MXM_OSHMEM_HW_RDMA_TLS"));
|
||||
@ -295,6 +299,8 @@ static int mca_spml_ikrit_component_open(void)
|
||||
mca_spml_ikrit.ud_only = 1;
|
||||
mca_spml_ikrit.mxm_ctx_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA));
|
||||
#endif
|
||||
SPML_VERBOSE(5, "UD only mode is %s",
|
||||
mca_spml_ikrit.ud_only ? "enabled" : "disabled");
|
||||
|
||||
err = mxm_init(mca_spml_ikrit.mxm_ctx_opts, &mca_spml_ikrit.mxm_context);
|
||||
if (MXM_OK != err) {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user