1
1

Merge pull request #1126 from alex-mikheev/topic/ikrit_err_fix

Topic/ikrit err fix
Этот коммит содержится в:
Mike Dubman 2015-11-11 15:31:06 +02:00
родитель 93847e4ca9 cd8ea438d3
Коммит 8ec5c99412
2 изменённых файлов: 60 добавлений и 51 удалений

Просмотреть файл

@ -22,6 +22,7 @@
#include "oshmem_config.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/mca/memchecker/base/base.h"
#include "orte/include/orte/types.h"
#include "orte/runtime/orte_globals.h"
#include "oshmem/mca/spml/ikrit/spml_ikrit.h"
@ -41,6 +42,19 @@
#define SPML_IKRIT_PUT_DEBUG 0
#endif
#define SPML_IKRIT_MXM_POST_SEND(sreq) \
do { \
mxm_error_t err; \
err = mxm_req_send(&sreq); \
if (MXM_OK != err) { \
SPML_ERROR("mxm_req_send (op=%d) failed: %s - aborting", \
sreq.opcode, \
mxm_error_string(err)); \
oshmem_shmem_abort(-1); \
return OSHMEM_ERROR; \
} \
} while(0)
typedef struct spml_ikrit_am_hdr {
uint64_t va;
} spml_ikrit_am_hdr_t;
@ -88,6 +102,7 @@ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) {
}
#endif
static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req)
{
while (!mxm_req_test(req))
@ -99,11 +114,12 @@ static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request)
mca_spml_ikrit_put_request_t *put_req =
*(mca_spml_ikrit_put_request_t **) request;
assert(false == put_req->req_put.req_base.req_free_called);
OPAL_THREAD_LOCK(&oshmem_request_lock);
assert(false == put_req->req_put.req_base.req_free_called);
put_req->req_put.req_base.req_free_called = true;
opal_free_list_return (&mca_spml_base_put_requests,
(opal_free_list_item_t*)put_req);
opal_memchecker_base_mem_noaccess(put_req, sizeof(*put_req));
OPAL_THREAD_UNLOCK(&oshmem_request_lock);
*request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
@ -147,11 +163,12 @@ static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request)
mca_spml_ikrit_get_request_t *get_req =
*(mca_spml_ikrit_get_request_t **) request;
assert(false == get_req->req_get.req_base.req_free_called);
OPAL_THREAD_LOCK(&oshmem_request_lock);
assert(false == get_req->req_get.req_base.req_free_called);
get_req->req_get.req_base.req_free_called = true;
opal_free_list_return (&mca_spml_base_get_requests,
(opal_free_list_item_t*)get_req);
opal_memchecker_base_mem_noaccess(get_req, sizeof(*get_req));
OPAL_THREAD_UNLOCK(&oshmem_request_lock);
*request = SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/
@ -167,7 +184,7 @@ static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request,
static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req)
{
req->req_get.req_base.req_type = MCA_SPML_REQUEST_PUT;
req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET;
req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free;
req->req_get.req_base.req_oshmem.req_cancel =
mca_spml_ikrit_get_request_cancel;
@ -245,6 +262,10 @@ static inline mca_spml_ikrit_put_request_t *alloc_put_req(void)
item = opal_free_list_wait (&mca_spml_base_put_requests);
req = (mca_spml_ikrit_put_request_t *) item;
opal_memchecker_base_mem_undefined(req, sizeof(*req));
opal_memchecker_base_mem_defined(&req->req_put.req_base,
sizeof(req->req_put.req_base));
req->req_put.req_base.req_free_called = false;
req->req_put.req_base.req_oshmem.req_complete = false;
@ -259,6 +280,10 @@ static inline mca_spml_ikrit_get_request_t *alloc_get_req(void)
item = opal_free_list_wait (&mca_spml_base_get_requests);
req = (mca_spml_ikrit_get_request_t *) item;
opal_memchecker_base_mem_undefined(req, sizeof(*req));
opal_memchecker_base_mem_defined(&req->req_get.req_base,
sizeof(req->req_get.req_base));
req->req_get.req_base.req_free_called = false;
req->req_get.req_base.req_oshmem.req_complete = false;
@ -363,19 +388,15 @@ int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs)
for (n = 0; n < nprocs; n++) {
i = (my_rank + n) % nprocs;
if (mca_spml_ikrit.mxm_peers[i]->mxm_conn) {
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn);
}
if (mca_spml_ikrit.hw_rdma_channel && mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn) {
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn);
if (mca_spml_ikrit.hw_rdma_channel) {
assert(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i]->mxm_conn);
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn);
}
destroy_ptl_idx(i);
if (mca_spml_ikrit.mxm_peers[i]) {
OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
}
OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]);
}
if (mca_spml_ikrit.mxm_peers)
free(mca_spml_ikrit.mxm_peers);
free(mca_spml_ikrit.mxm_peers);
return OSHMEM_SUCCESS;
}
@ -407,20 +428,18 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
}
memset(conn_reqs, 0x0, sizeof(mxm_conn_req_t));
#endif
ep_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
if (NULL == ep_info) {
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
goto bail;
}
memset(ep_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t));
if (mca_spml_ikrit.hw_rdma_channel) {
ep_hw_rdma_info = malloc(nprocs * sizeof(spml_ikrit_mxm_ep_conn_info_t));
ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
if (NULL == ep_hw_rdma_info) {
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
goto bail;
}
memset(ep_hw_rdma_info, 0x0, sizeof(spml_ikrit_mxm_ep_conn_info_t));
}
mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs
@ -529,8 +548,10 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
/* Save returned connections */
for (i = 0; i < nprocs; ++i) {
mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn;
if (OSHMEM_SUCCESS != create_ptl_idx(i))
if (OSHMEM_SUCCESS != create_ptl_idx(i)) {
rc = OSHMEM_ERR_CONNECTION_FAILED;
goto bail;
}
mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]);
}
@ -559,7 +580,7 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
continue;
}
if (procs[i] == proc_self)
continue;
continue;
/* use zcopy for put/get via sysv shared memory */
procs[i]->transport_ids[0] = MXM_PTL_SHM;
@ -629,7 +650,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr,
#if MXM_API < MXM_VERSION(2,0)
mkeys[i].len = 0;
#else
if (mca_spml_ikrit.ud_only && !mca_spml_ikrit.hw_rdma_channel) {
if (mca_spml_ikrit.ud_only) {
mkeys[i].len = 0;
break;
}
@ -848,6 +869,7 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr,
int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
{
mxm_send_req_t sreq;
mxm_error_t err;
if (0 >= size) {
return OSHMEM_SUCCESS;
@ -873,10 +895,9 @@ int mca_spml_ikrit_get(void *src_addr, size_t size, void *dst_addr, int src)
#endif
sreq.base.completed_cb = NULL;
mxm_req_send(&sreq);
opal_progress();
mca_spml_irkit_req_wait(&sreq.base);
SPML_IKRIT_MXM_POST_SEND(sreq);
mca_spml_irkit_req_wait(&sreq.base);
if (MXM_OK != sreq.base.error) {
SPML_ERROR("get request failed: %s - aborting",
mxm_error_string(sreq.base.error));
@ -935,14 +956,8 @@ int mca_spml_ikrit_get_async(void *src_addr,
get_req->mxm_req.base.context = get_req;
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1);
mxm_req_send(&get_req->mxm_req);
SPML_IKRIT_MXM_POST_SEND(get_req->mxm_req);
if (MXM_OK != get_req->mxm_req.base.error) {
SPML_ERROR("get request failed: %s - aborting",
mxm_error_string(get_req->mxm_req.base.error));
oshmem_shmem_abort(-1);
return OSHMEM_ERROR;
}
return OSHMEM_SUCCESS;
}
@ -989,7 +1004,7 @@ static int mca_spml_ikrit_mxm_fence(int dst)
fence_req->mxm_req.base.context = fence_req;
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, 1);
mxm_req_send(&fence_req->mxm_req);
SPML_IKRIT_MXM_POST_SEND(fence_req->mxm_req);
return OSHMEM_SUCCESS;
}
@ -1162,15 +1177,8 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr,
mca_spml_ikrit.mxm_peers[dst]->n_active_puts++;
mxm_req_send(&put_req->mxm_req);
SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req);
if (MXM_OK != put_req->mxm_req.base.error) {
OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1);
SPML_ERROR("put request %p failed: %s - aborting",
(void*)put_req, mxm_error_string(put_req->mxm_req.base.error));
oshmem_shmem_abort(-1);
return OSHMEM_ERROR;
}
if (need_progress)
mxm_progress(mca_spml_ikrit.mxm_context);
@ -1269,13 +1277,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr,
mca_spml_ikrit.mxm_peers[dst]->need_fence = 1;
}
mxm_req_send(&mxm_req);
if (MXM_OK != mxm_req.base.error) {
SPML_ERROR("put request failed: %s(%d) - aborting",
mxm_error_string(mxm_req.base.error), mxm_req.base.error);
oshmem_shmem_abort(-1);
return OSHMEM_ERROR;
}
SPML_IKRIT_MXM_POST_SEND(mxm_req);
wait.req = &mxm_req.base;
wait.state = (mxm_req_state_t)(MXM_REQ_SENT | MXM_REQ_COMPLETED);
@ -1432,7 +1434,8 @@ int mca_spml_ikrit_send(void* buf,
req.base.data.buffer.length = size == 0 ? sizeof(dummy_buf) : size;
req.base.data.buffer.memh = NULL;
mxm_req_send(&req);
SPML_IKRIT_MXM_POST_SEND(req);
mca_spml_irkit_req_wait(&req.base);
if (req.base.error != MXM_OK) {
return OSHMEM_ERROR;

Просмотреть файл

@ -106,11 +106,11 @@ static inline int set_mxm_tls()
tls = getenv("MXM_TLS");
if (NULL == tls) {
setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1);
return OSHMEM_SUCCESS;
opal_setenv("MXM_OSHMEM_TLS", mca_spml_ikrit.mxm_tls, 1, &environ);
return check_mxm_tls("MXM_OSHMEM_TLS");
}
if (OSHMEM_SUCCESS == check_mxm_tls("MXM_TLS")) {
setenv("MXM_OSHMEM_TLS", tls, 1);
opal_setenv("MXM_OSHMEM_TLS", tls, 1, &environ);
return OSHMEM_SUCCESS;
}
return OSHMEM_ERROR;
@ -120,12 +120,14 @@ static inline int check_mxm_hw_tls(char *v, char *tls)
{
if (v && tls) {
if ((0 == strcmp(tls, "rc") || 0 == strcmp(tls, "dc"))) {
mca_spml_ikrit.ud_only = 0;
return OSHMEM_SUCCESS;
}
if (strstr(tls, "ud") &&
(NULL == strstr(tls, "rc") && NULL == strstr(tls, "dc") &&
NULL == strstr(tls, "shm"))) {
mca_spml_ikrit.ud_only = 1;
return OSHMEM_SUCCESS;
}
}
@ -140,8 +142,10 @@ static inline int set_mxm_hw_rdma_tls()
if (!mca_spml_ikrit.hw_rdma_channel) {
return check_mxm_hw_tls("MXM_OSHMEM_TLS", getenv("MXM_OSHMEM_TLS"));
}
setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0);
setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0);
opal_setenv("MXM_OSHMEM_HW_RDMA_RC_QP_LIMIT", "-1", 0, &environ);
opal_setenv("MXM_OSHMEM_HW_RDMA_TLS", "rc", 0, &environ);
SPML_VERBOSE(5, "Additional communication channel is enabled. Transports are: %s",
getenv("MXM_OSHMEM_HW_RDMA_TLS"));
return check_mxm_hw_tls("MXM_OSHMEM_HW_RDMA_TLS",
getenv("MXM_OSHMEM_HW_RDMA_TLS"));
@ -295,6 +299,8 @@ static int mca_spml_ikrit_component_open(void)
mca_spml_ikrit.ud_only = 1;
mca_spml_ikrit.mxm_ctx_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA));
#endif
SPML_VERBOSE(5, "UD only mode is %s",
mca_spml_ikrit.ud_only ? "enabled" : "disabled");
err = mxm_init(mca_spml_ikrit.mxm_ctx_opts, &mca_spml_ikrit.mxm_context);
if (MXM_OK != err) {