Merge pull request #6163 from artpol84/osc/mt_submission
Refactoring of osc/ucx component for MT
This commit is contained in:
Commit
13a8e42108
@ -15,6 +15,7 @@
|
||||
#include "ompi/group/group.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "opal/mca/common/ucx/common_ucx.h"
|
||||
#include "opal/mca/common/ucx/common_ucx_wpool.h"
|
||||
|
||||
#define OSC_UCX_ASSERT MCA_COMMON_UCX_ASSERT
|
||||
#define OSC_UCX_ERROR MCA_COMMON_UCX_ERROR
|
||||
@ -22,18 +23,11 @@
|
||||
|
||||
#define OMPI_OSC_UCX_POST_PEER_MAX 32
|
||||
#define OMPI_OSC_UCX_ATTACH_MAX 32
|
||||
#define OMPI_OSC_UCX_RKEY_BUF_MAX 1024
|
||||
|
||||
/*
 * Per-peer remote window descriptor: a UCX remote key together with the
 * peer's base address. Code elsewhere in this diff indexes
 * win_info_array[target] / state_info_array[target] with it, uses .addr as
 * the base for remote-address arithmetic, and uses .rkey_init to decide
 * whether ucp_rkey_destroy() must be called before the key is replaced.
 *
 * NOTE(review): this diff view has no +/- markers. The hunk header
 * (-22,18 +23,11) and the move to module->addrs / module->state_addrs suggest
 * this typedef is on the removed side of the hunk -- confirm against the
 * post-commit header.
 */
typedef struct ompi_osc_ucx_win_info {
|
||||
ucp_rkey_h rkey; /* remote key handle; released with ucp_rkey_destroy() while rkey_init is true */
|
||||
uint64_t addr; /* remote base address that displacements/offsets are added to */
|
||||
bool rkey_init; /* set to true after rkey is unpacked, reset to false after it is destroyed */
|
||||
} ompi_osc_ucx_win_info_t;
|
||||
#define OMPI_OSC_UCX_MEM_ADDR_MAX_LEN 1024
|
||||
|
||||
typedef struct ompi_osc_ucx_component {
|
||||
ompi_osc_base_component_t super;
|
||||
ucp_context_h ucp_context;
|
||||
ucp_worker_h ucp_worker;
|
||||
opal_common_ucx_wpool_t *wpool;
|
||||
bool enable_mpi_threads;
|
||||
opal_free_list_t requests; /* request free list for the r* communication variants */
|
||||
bool env_initialized; /* UCX environment is initialized or not */
|
||||
@ -62,7 +56,6 @@ typedef struct ompi_osc_ucx_epoch_type {
|
||||
#define TARGET_LOCK_EXCLUSIVE ((uint64_t)(0x0000000100000000ULL))
|
||||
|
||||
#define OSC_UCX_IOVEC_MAX 128
|
||||
#define OSC_UCX_OPS_THRESHOLD 1000000
|
||||
|
||||
#define OSC_UCX_STATE_LOCK_OFFSET 0
|
||||
#define OSC_UCX_STATE_REQ_FLAG_OFFSET sizeof(uint64_t)
|
||||
@ -75,11 +68,13 @@ typedef struct ompi_osc_ucx_epoch_type {
|
||||
/*
 * Descriptor of one attached region of a dynamic window.
 * get_dynamic_win_info() fetches a buffer sized for a uint64_t plus
 * OMPI_OSC_UCX_ATTACH_MAX of these records from the target's state area.
 *
 * NOTE(review): both sides of the hunk (-75,11 +68,13) are rendered here
 * without +/- markers. rkey_buffer is only consumed by the
 * ucp_ep_rkey_unpack() path and mem_addr only by the wpmem path, so they are
 * presumably the removed and the added field respectively -- confirm.
 */
typedef struct ompi_osc_dynamic_win_info {
|
||||
uint64_t base; /* presumably the start address of the attached region -- TODO confirm */
|
||||
size_t size; /* presumably the length of the attached region in bytes -- TODO confirm */
|
||||
char rkey_buffer[OMPI_OSC_UCX_RKEY_BUF_MAX]; /* packed rkey, input to ucp_ep_rkey_unpack() in the ep-based code path */
|
||||
char mem_addr[OMPI_OSC_UCX_MEM_ADDR_MAX_LEN]; /* opaque memory-address blob copied into mem->mem_addrs in the wpmem code path */
|
||||
} ompi_osc_dynamic_win_info_t;
|
||||
|
||||
/*
 * Element type of module->local_dynamic_win_info[], indexed by the position
 * of an attached region of a dynamic window.
 *
 * NOTE(review): both sides of the hunk (-75,11 +68,13) are rendered here
 * without +/- markers. `memh` looks like the removed field and `mem`,
 * `my_mem_addr`, `my_mem_addr_size` like the added ones -- confirm.
 */
typedef struct ompi_osc_local_dynamic_win_info {
|
||||
ucp_mem_h memh; /* raw UCX memory handle (apparently the pre-refactoring field) */
|
||||
opal_common_ucx_wpmem_t *mem; /* wpmem object; get_dynamic_win_info() fills its mem_addrs / mem_displs per target */
|
||||
char *my_mem_addr; /* presumably this process's own address blob for the region -- TODO confirm */
|
||||
int my_mem_addr_size; /* presumably the byte length of my_mem_addr -- TODO confirm */
|
||||
int refcnt; /* reference count; its update rules are not visible in this diff */
|
||||
} ompi_osc_local_dynamic_win_info_t;
|
||||
|
||||
@ -97,12 +92,10 @@ typedef struct ompi_osc_ucx_state {
|
||||
typedef struct ompi_osc_ucx_module {
|
||||
ompi_osc_base_module_t super;
|
||||
struct ompi_communicator_t *comm;
|
||||
ucp_mem_h memh; /* remote accessible memory */
|
||||
int flavor;
|
||||
size_t size;
|
||||
ucp_mem_h state_memh;
|
||||
ompi_osc_ucx_win_info_t *win_info_array;
|
||||
ompi_osc_ucx_win_info_t *state_info_array;
|
||||
uint64_t *addrs;
|
||||
uint64_t *state_addrs;
|
||||
int disp_unit; /* if disp_unit >= 0, then everyone has the same
|
||||
* disp unit size; if disp_unit == -1, then we
|
||||
* need to look at disp_units */
|
||||
@ -117,11 +110,12 @@ typedef struct ompi_osc_ucx_module {
|
||||
opal_list_t pending_posts;
|
||||
int lock_count;
|
||||
int post_count;
|
||||
int global_ops_num;
|
||||
int *per_target_ops_nums;
|
||||
uint64_t req_result;
|
||||
int *start_grp_ranks;
|
||||
bool lock_all_is_nocheck;
|
||||
opal_common_ucx_ctx_t *ctx;
|
||||
opal_common_ucx_wpmem_t *mem;
|
||||
opal_common_ucx_wpmem_t *state_mem;
|
||||
} ompi_osc_ucx_module_t;
|
||||
|
||||
typedef enum locktype {
|
||||
@ -216,7 +210,4 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_
|
||||
int min_index, int max_index,
|
||||
uint64_t base, size_t len, int *insert);
|
||||
|
||||
void req_completion(void *request, ucs_status_t status);
|
||||
void internal_req_init(void *request);
|
||||
|
||||
#endif /* OMPI_OSC_UCX_H */
|
||||
|
@ -60,7 +60,7 @@ static inline void ompi_osc_ucx_handle_incoming_post(ompi_osc_ucx_module_t *modu
|
||||
|
||||
int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
int ret;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
if (module->epoch_type.access != NONE_EPOCH &&
|
||||
module->epoch_type.access != FENCE_EPOCH) {
|
||||
@ -74,16 +74,12 @@ int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) {
|
||||
}
|
||||
|
||||
if (!(assert & MPI_MODE_NOPRECEDE)) {
|
||||
ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
module->global_ops_num = 0;
|
||||
memset(module->per_target_ops_nums, 0,
|
||||
sizeof(int) * ompi_comm_size(module->comm));
|
||||
|
||||
return module->comm->c_coll->coll_barrier(module->comm,
|
||||
module->comm->c_coll->coll_barrier_module);
|
||||
}
|
||||
@ -147,7 +143,7 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t
|
||||
|
||||
ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[i]), ranks_in_win_grp, size);
|
||||
}
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
opal_common_ucx_wpool_progress(mca_osc_ucx_component.wpool);
|
||||
}
|
||||
|
||||
module->post_count = 0;
|
||||
@ -163,7 +159,6 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t
|
||||
|
||||
int ompi_osc_ucx_complete(struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucs_status_t status;
|
||||
int i, size;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
@ -173,29 +168,26 @@ int ompi_osc_ucx_complete(struct ompi_win_t *win) {
|
||||
|
||||
module->epoch_type.access = NONE_EPOCH;
|
||||
|
||||
ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
module->global_ops_num = 0;
|
||||
memset(module->per_target_ops_nums, 0,
|
||||
sizeof(int) * ompi_comm_size(module->comm));
|
||||
|
||||
size = ompi_group_size(module->start_group);
|
||||
for (i = 0; i < size; i++) {
|
||||
uint64_t remote_addr = (module->state_info_array)[module->start_grp_ranks[i]].addr + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; /* write to state.complete_count on remote side */
|
||||
ucp_rkey_h rkey = (module->state_info_array)[module->start_grp_ranks[i]].rkey;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, module->start_grp_ranks[i]);
|
||||
uint64_t remote_addr = module->state_addrs[module->start_grp_ranks[i]] + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; // write to state.complete_count on remote side
|
||||
|
||||
status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, 1,
|
||||
sizeof(uint64_t), remote_addr, rkey);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_atomic_post failed: %d", status);
|
||||
ret = opal_common_ucx_wpmem_post(module->mem, UCP_ATOMIC_POST_OP_ADD,
|
||||
1, module->start_grp_ranks[i], sizeof(uint64_t),
|
||||
remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_post failed: %d", ret);
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_ep_flush failed: %d", ret);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP,
|
||||
module->start_grp_ranks[i]);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
@ -243,25 +235,29 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t
|
||||
}
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
uint64_t remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_INDEX_OFFSET; /* write to state.post_index on remote side */
|
||||
ucp_rkey_h rkey = (module->state_info_array)[ranks_in_win_grp[i]].rkey;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, ranks_in_win_grp[i]);
|
||||
uint64_t remote_addr = module->state_addrs[ranks_in_win_grp[i]] + OSC_UCX_STATE_POST_INDEX_OFFSET; // write to state.post_index on remote side
|
||||
uint64_t curr_idx = 0, result = 0;
|
||||
|
||||
/* do fop first to get an post index */
|
||||
opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_FADD, 1,
|
||||
&result, sizeof(result),
|
||||
remote_addr, rkey, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_fetch(module->mem, UCP_ATOMIC_FETCH_OP_FADD,
|
||||
1, ranks_in_win_grp[i], &result,
|
||||
sizeof(result), remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
curr_idx = result & (OMPI_OSC_UCX_POST_PEER_MAX - 1);
|
||||
|
||||
remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_STATE_OFFSET + sizeof(uint64_t) * curr_idx;
|
||||
remote_addr = module->state_addrs[ranks_in_win_grp[i]] + OSC_UCX_STATE_POST_STATE_OFFSET + sizeof(uint64_t) * curr_idx;
|
||||
|
||||
/* do cas to send post message */
|
||||
do {
|
||||
opal_common_ucx_atomic_cswap(ep, 0, (uint64_t)myrank + 1, &result,
|
||||
sizeof(result), remote_addr, rkey,
|
||||
mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_cmpswp(module->mem, 0, result,
|
||||
myrank + 1, &result, sizeof(result),
|
||||
remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if (result == 0)
|
||||
break;
|
||||
@ -302,7 +298,7 @@ int ompi_osc_ucx_wait(struct ompi_win_t *win) {
|
||||
|
||||
while (module->state.complete_count != (uint64_t)size) {
|
||||
/* not sure if this is required */
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
opal_common_ucx_wpool_progress(mca_osc_ucx_component.wpool);
|
||||
}
|
||||
|
||||
module->state.complete_count = 0;
|
||||
|
@ -66,23 +66,6 @@ static inline int check_sync_state(ompi_osc_ucx_module_t *module, int target,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * incr_and_check_ops_num: account for one more outstanding operation to
 * `target` and, once the module-wide counter reaches OSC_UCX_OPS_THRESHOLD,
 * flush the endpoint `ep` and drop that target's operations from the counters.
 *
 * module: window module whose global_ops_num / per_target_ops_nums are updated.
 * target: index into per_target_ops_nums.
 * ep:     endpoint passed to opal_common_ucx_ep_flush().
 *
 * Returns OMPI_SUCCESS, or the status of opal_common_ucx_ep_flush() if the
 * flush fails (in which case the counters keep their incremented values).
 *
 * NOTE(review): the hunk header (-66,23 +66,6) and the callers further down
 * (replaced by plain `return ret;`) indicate this helper is on the removed
 * side of the diff -- confirm.
 */
static inline int incr_and_check_ops_num(ompi_osc_ucx_module_t *module, int target,
|
||||
ucp_ep_h ep) {
|
||||
int status;
|
||||
|
||||
module->global_ops_num++;
|
||||
module->per_target_ops_nums[target]++;
|
||||
/* the threshold is checked against the global count, but only this target's endpoint is flushed */
if (module->global_ops_num >= OSC_UCX_OPS_THRESHOLD) {
|
||||
status = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
if (status != OMPI_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
/* after the flush, remove this target's operations from both counters */
module->global_ops_num -= module->per_target_ops_nums[target];
|
||||
module->per_target_ops_nums[target] = 0;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int create_iov_list(const void *addr, int count, ompi_datatype_t *datatype,
|
||||
ucx_iovec_t **ucx_iov, uint32_t *ucx_iov_count) {
|
||||
int ret = OMPI_SUCCESS;
|
||||
@ -137,13 +120,13 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module,
|
||||
const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
bool is_origin_contig, ptrdiff_t origin_lb,
|
||||
int target, ucp_ep_h ep, uint64_t remote_addr, ucp_rkey_h rkey,
|
||||
int target, uint64_t remote_addr,
|
||||
int target_count, struct ompi_datatype_t *target_dt,
|
||||
bool is_target_contig, ptrdiff_t target_lb, bool is_get) {
|
||||
ucx_iovec_t *origin_ucx_iov = NULL, *target_ucx_iov = NULL;
|
||||
uint32_t origin_ucx_iov_count = 0, target_ucx_iov_count = 0;
|
||||
uint32_t origin_ucx_iov_idx = 0, target_ucx_iov_idx = 0;
|
||||
ucs_status_t status;
|
||||
int status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
if (!is_origin_contig) {
|
||||
@ -164,29 +147,21 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module,
|
||||
|
||||
if (!is_origin_contig && !is_target_contig) {
|
||||
size_t curr_len = 0;
|
||||
opal_common_ucx_op_t op;
|
||||
while (origin_ucx_iov_idx < origin_ucx_iov_count) {
|
||||
curr_len = MIN(origin_ucx_iov[origin_ucx_iov_idx].len,
|
||||
target_ucx_iov[target_ucx_iov_idx].len);
|
||||
|
||||
if (!is_get) {
|
||||
status = ucp_put_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len,
|
||||
remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (is_get) {
|
||||
op = OPAL_COMMON_UCX_GET;
|
||||
} else {
|
||||
status = ucp_get_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len,
|
||||
remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d",status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
op = OPAL_COMMON_UCX_PUT;
|
||||
}
|
||||
|
||||
ret = incr_and_check_ops_num(module, target, ep);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
status = opal_common_ucx_wpmem_putget(module->mem, op, target,
|
||||
origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len,
|
||||
remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr));
|
||||
if (OPAL_SUCCESS != status) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
origin_ucx_iov[origin_ucx_iov_idx].addr = (void *)((intptr_t)origin_ucx_iov[origin_ucx_iov_idx].addr + curr_len);
|
||||
@ -207,28 +182,20 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module,
|
||||
|
||||
} else if (!is_origin_contig) {
|
||||
size_t prev_len = 0;
|
||||
opal_common_ucx_op_t op;
|
||||
while (origin_ucx_iov_idx < origin_ucx_iov_count) {
|
||||
if (!is_get) {
|
||||
status = ucp_put_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr,
|
||||
origin_ucx_iov[origin_ucx_iov_idx].len,
|
||||
remote_addr + target_lb + prev_len, rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (is_get) {
|
||||
op = OPAL_COMMON_UCX_GET;
|
||||
} else {
|
||||
status = ucp_get_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr,
|
||||
origin_ucx_iov[origin_ucx_iov_idx].len,
|
||||
remote_addr + target_lb + prev_len, rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
op = OPAL_COMMON_UCX_PUT;
|
||||
}
|
||||
|
||||
ret = incr_and_check_ops_num(module, target, ep);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
status = opal_common_ucx_wpmem_putget(module->mem, op, target,
|
||||
origin_ucx_iov[origin_ucx_iov_idx].addr,
|
||||
origin_ucx_iov[origin_ucx_iov_idx].len,
|
||||
remote_addr + target_lb + prev_len);
|
||||
if (OPAL_SUCCESS != status) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
prev_len += origin_ucx_iov[origin_ucx_iov_idx].len;
|
||||
@ -236,28 +203,21 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module,
|
||||
}
|
||||
} else {
|
||||
size_t prev_len = 0;
|
||||
opal_common_ucx_op_t op;
|
||||
while (target_ucx_iov_idx < target_ucx_iov_count) {
|
||||
if (!is_get) {
|
||||
status = ucp_put_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb + prev_len),
|
||||
target_ucx_iov[target_ucx_iov_idx].len,
|
||||
remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (is_get) {
|
||||
op = OPAL_COMMON_UCX_GET;
|
||||
} else {
|
||||
status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb + prev_len),
|
||||
target_ucx_iov[target_ucx_iov_idx].len,
|
||||
remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
op = OPAL_COMMON_UCX_PUT;
|
||||
}
|
||||
|
||||
ret = incr_and_check_ops_num(module, target, ep);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
status = opal_common_ucx_wpmem_putget(module->mem, op, target,
|
||||
(void *)((intptr_t)origin_addr + origin_lb + prev_len),
|
||||
target_ucx_iov[target_ucx_iov_idx].len,
|
||||
remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr));
|
||||
if (OPAL_SUCCESS != status) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
prev_len += target_ucx_iov[target_ucx_iov_idx].len;
|
||||
@ -275,68 +235,63 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * start_atomicity: loop on a remote compare-and-swap of the lock word at
 * (target's state base address + OSC_UCX_STATE_ACC_LOCK_OFFSET), trying to
 * change it from TARGET_LOCK_UNLOCKED to TARGET_LOCK_EXCLUSIVE, until
 * result_value reads TARGET_LOCK_UNLOCKED. Presumably result_value receives
 * the previous remote value, so that condition means the lock was acquired
 * -- confirm against the cswap helper.
 *
 * Returns OMPI_ERROR if the compare-and-swap call reports a failure.
 *
 * NOTE(review): this is a diff rendering without +/- markers, so both sides
 * of the hunk appear below (two signatures, two remote_addr declarations, two
 * return statements). The lines using `ep`, `rkey`, `status` and
 * opal_common_ucx_atomic_cswap() appear to be the removed side; the lines
 * using module->state_addrs, module->state_mem and
 * opal_common_ucx_wpmem_cmpswp() appear to be the added side -- confirm
 * against the commit.
 */
static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) {
|
||||
static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) {
|
||||
/* initialised to a value other than the unlocked marker so the loop body runs at least once (assumes TARGET_LOCK_UNLOCKED != (uint64_t)-1 -- TODO confirm) */
uint64_t result_value = -1;
|
||||
ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET;
|
||||
ucs_status_t status;
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
while (result_value != TARGET_LOCK_UNLOCKED) {
|
||||
/* endpoint/rkey based compare-and-swap (apparently the pre-refactoring call) */
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey,
|
||||
mca_osc_ucx_component.ucp_worker);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status);
|
||||
/* wpmem based compare-and-swap on the state memory (apparently the post-refactoring call) */
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
|
||||
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
target, &result_value, sizeof(result_value),
|
||||
remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * end_atomicity: release the lock word at (target's state base address +
 * OSC_UCX_STATE_ACC_LOCK_OFFSET) by atomically swapping in
 * TARGET_LOCK_UNLOCKED, and assert that the fetched value was
 * TARGET_LOCK_EXCLUSIVE.
 *
 * Returns a non-success code if the atomic swap call fails: the ep-based
 * variant returns the helper's status unchanged, the wpmem-based variant logs
 * and returns OMPI_ERROR.
 *
 * NOTE(review): this is a diff rendering without +/- markers, so both sides
 * of the hunk appear below. The lines using `ep`, `rkey` and
 * opal_common_ucx_atomic_fetch() appear to be the removed side; the lines
 * using module->state_addrs, module->state_mem and
 * opal_common_ucx_wpmem_fetch() appear to be the added side. The closing
 * brace of the first `if` is not shown separately in this rendering --
 * confirm against the commit.
 */
static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) {
|
||||
static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t result_value = 0;
|
||||
ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET;
|
||||
int ret;
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
/* endpoint/rkey based atomic swap (apparently the pre-refactoring call) */
ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey, mca_osc_ucx_component.ucp_worker);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
/* wpmem based atomic swap on the state memory (apparently the post-refactoring call) */
ret = opal_common_ucx_wpmem_fetch(module->state_mem,
|
||||
UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED,
|
||||
target, &result_value, sizeof(result_value),
|
||||
remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fetch failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* sanity check only: the value swapped out is expected to be the exclusive-lock marker; compiled out when NDEBUG is defined */
assert(result_value == TARGET_LOCK_EXCLUSIVE);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int get_dynamic_win_info(uint64_t remote_addr, ompi_osc_ucx_module_t *module,
|
||||
ucp_ep_h ep, int target) {
|
||||
ucp_rkey_h state_rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_state_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_DYNAMIC_WIN_CNT_OFFSET;
|
||||
int target) {
|
||||
uint64_t remote_state_addr = (module->state_addrs)[target] + OSC_UCX_STATE_DYNAMIC_WIN_CNT_OFFSET;
|
||||
size_t len = sizeof(uint64_t) + sizeof(ompi_osc_dynamic_win_info_t) * OMPI_OSC_UCX_ATTACH_MAX;
|
||||
char *temp_buf = malloc(len);
|
||||
ompi_osc_dynamic_win_info_t *temp_dynamic_wins;
|
||||
uint64_t win_count;
|
||||
int contain, insert = -1;
|
||||
ucs_status_t status;
|
||||
int ret;
|
||||
|
||||
if ((module->win_info_array[target]).rkey_init == true) {
|
||||
ucp_rkey_destroy((module->win_info_array[target]).rkey);
|
||||
(module->win_info_array[target]).rkey_init = false;
|
||||
}
|
||||
|
||||
status = ucp_get_nbi(ep, (void *)temp_buf, len, remote_state_addr, state_rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status);
|
||||
ret = opal_common_ucx_wpmem_putget(module->state_mem, OPAL_COMMON_UCX_GET, target,
|
||||
(void *)((intptr_t)temp_buf),
|
||||
len, remote_state_addr);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->state_mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -349,30 +304,29 @@ static inline int get_dynamic_win_info(uint64_t remote_addr, ompi_osc_ucx_module
|
||||
remote_addr, 1, &insert);
|
||||
assert(contain >= 0 && (uint64_t)contain < win_count);
|
||||
|
||||
status = ucp_ep_rkey_unpack(ep, temp_dynamic_wins[contain].rkey_buffer,
|
||||
&((module->win_info_array[target]).rkey));
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
if (module->local_dynamic_win_info[contain].mem->mem_addrs == NULL) {
|
||||
module->local_dynamic_win_info[contain].mem->mem_addrs = calloc(ompi_comm_size(module->comm),
|
||||
OMPI_OSC_UCX_MEM_ADDR_MAX_LEN);
|
||||
module->local_dynamic_win_info[contain].mem->mem_displs =calloc(ompi_comm_size(module->comm),
|
||||
sizeof(int));
|
||||
}
|
||||
|
||||
(module->win_info_array[target]).rkey_init = true;
|
||||
memcpy(module->local_dynamic_win_info[contain].mem->mem_addrs + target * OMPI_OSC_UCX_MEM_ADDR_MAX_LEN,
|
||||
temp_dynamic_wins[contain].mem_addr, OMPI_OSC_UCX_MEM_ADDR_MAX_LEN);
|
||||
module->local_dynamic_win_info[contain].mem->mem_displs[target] = target * OMPI_OSC_UCX_MEM_ADDR_MAX_LEN;
|
||||
|
||||
free(temp_buf);
|
||||
|
||||
return status;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
|
||||
int target, ptrdiff_t target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ucp_rkey_h rkey;
|
||||
uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
bool is_origin_contig = false, is_target_contig = false;
|
||||
ptrdiff_t origin_lb, origin_extent, target_lb, target_extent;
|
||||
ucs_status_t status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = check_sync_state(module, target, false);
|
||||
@ -381,20 +335,16 @@ int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_data
|
||||
}
|
||||
|
||||
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
status = get_dynamic_win_info(remote_addr, module, ep, target);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = get_dynamic_win_info(remote_addr, module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_VALID_RKEY(module, target, target_count);
|
||||
|
||||
if (!target_count) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
rkey = (module->win_info_array[target]).rkey;
|
||||
|
||||
ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent);
|
||||
ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent);
|
||||
|
||||
@ -408,16 +358,17 @@ int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_data
|
||||
ompi_datatype_type_size(origin_dt, &origin_len);
|
||||
origin_len *= origin_count;
|
||||
|
||||
status = ucp_put_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len,
|
||||
remote_addr + target_lb, rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status);
|
||||
ret = opal_common_ucx_wpmem_putget(module->mem, OPAL_COMMON_UCX_PUT, target,
|
||||
(void *)((intptr_t)origin_addr + origin_lb),
|
||||
origin_len, remote_addr + target_lb);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
return incr_and_check_ops_num(module, target, ep);
|
||||
return ret;
|
||||
} else {
|
||||
return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig,
|
||||
origin_lb, target, ep, remote_addr, rkey, target_count, target_dt,
|
||||
origin_lb, target, remote_addr, target_count, target_dt,
|
||||
is_target_contig, target_lb, false);
|
||||
}
|
||||
}
|
||||
@ -427,12 +378,9 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count,
|
||||
int target, ptrdiff_t target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ucp_rkey_h rkey;
|
||||
uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ptrdiff_t origin_lb, origin_extent, target_lb, target_extent;
|
||||
bool is_origin_contig = false, is_target_contig = false;
|
||||
ucs_status_t status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = check_sync_state(module, target, false);
|
||||
@ -441,19 +389,16 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count,
|
||||
}
|
||||
|
||||
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
status = get_dynamic_win_info(remote_addr, module, ep, target);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = get_dynamic_win_info(remote_addr, module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_VALID_RKEY(module, target, target_count);
|
||||
|
||||
if (!target_count) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
rkey = (module->win_info_array[target]).rkey;
|
||||
|
||||
ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent);
|
||||
ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent);
|
||||
@ -468,17 +413,18 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count,
|
||||
ompi_datatype_type_size(origin_dt, &origin_len);
|
||||
origin_len *= origin_count;
|
||||
|
||||
status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len,
|
||||
remote_addr + target_lb, rkey);
|
||||
if (status != UCS_OK && status != UCS_INPROGRESS) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status);
|
||||
ret = opal_common_ucx_wpmem_putget(module->mem, OPAL_COMMON_UCX_GET, target,
|
||||
(void *)((intptr_t)origin_addr + origin_lb),
|
||||
origin_len, remote_addr + target_lb);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
return incr_and_check_ops_num(module, target, ep);
|
||||
return ret;
|
||||
} else {
|
||||
return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig,
|
||||
origin_lb, target, ep, remote_addr, rkey, target_count, target_dt,
|
||||
origin_lb, target, remote_addr, target_count, target_dt,
|
||||
is_target_contig, target_lb, true);
|
||||
}
|
||||
}
|
||||
@ -489,7 +435,6 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *target_dt,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = check_sync_state(module, target, false);
|
||||
@ -501,7 +446,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = start_atomicity(module, ep, target);
|
||||
ret = start_atomicity(module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -541,7 +486,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -595,7 +540,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -603,9 +548,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count,
|
||||
free(temp_addr_holder);
|
||||
}
|
||||
|
||||
ret = end_atomicity(module, ep, target);
|
||||
|
||||
return ret;
|
||||
return end_atomicity(module, target);
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_addr,
|
||||
@ -613,47 +556,36 @@ int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_a
|
||||
int target, ptrdiff_t target_disp,
|
||||
struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ucp_rkey_h rkey;
|
||||
uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
size_t dt_bytes;
|
||||
ompi_osc_ucx_internal_request_t *req = NULL;
|
||||
int ret = OMPI_SUCCESS;
|
||||
ucs_status_t status;
|
||||
|
||||
ret = check_sync_state(module, target, false);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = start_atomicity(module, ep, target);
|
||||
ret = start_atomicity(module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
status = get_dynamic_win_info(remote_addr, module, ep, target);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = get_dynamic_win_info(remote_addr, module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
rkey = (module->win_info_array[target]).rkey;
|
||||
|
||||
ompi_datatype_type_size(dt, &dt_bytes);
|
||||
memcpy(result_addr, origin_addr, dt_bytes);
|
||||
req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_CSWAP, *(uint64_t *)compare_addr,
|
||||
result_addr, dt_bytes, remote_addr, rkey, req_completion);
|
||||
if (UCS_PTR_IS_PTR(req)) {
|
||||
ucp_request_release(req);
|
||||
}
|
||||
|
||||
ret = incr_and_check_ops_num(module, target, ep);
|
||||
ret = opal_common_ucx_wpmem_cmpswp(module->mem,*(uint64_t *)compare_addr,
|
||||
*(uint64_t *)origin_addr, target,
|
||||
result_addr, dt_bytes, remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return end_atomicity(module, ep, target);
|
||||
return end_atomicity(module, target);
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr,
|
||||
@ -670,29 +602,23 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr,
|
||||
|
||||
if (op == &ompi_mpi_op_no_op.op || op == &ompi_mpi_op_replace.op ||
|
||||
op == &ompi_mpi_op_sum.op) {
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ucp_rkey_h rkey;
|
||||
uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
uint64_t value = *(uint64_t *)origin_addr;
|
||||
ucp_atomic_fetch_op_t opcode;
|
||||
size_t dt_bytes;
|
||||
ompi_osc_ucx_internal_request_t *req = NULL;
|
||||
ucs_status_t status;
|
||||
|
||||
ret = start_atomicity(module, ep, target);
|
||||
ret = start_atomicity(module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
status = get_dynamic_win_info(remote_addr, module, ep, target);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = get_dynamic_win_info(remote_addr, module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
rkey = (module->win_info_array[target]).rkey;
|
||||
|
||||
ompi_datatype_type_size(dt, &dt_bytes);
|
||||
|
||||
if (op == &ompi_mpi_op_replace.op) {
|
||||
@ -704,18 +630,13 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr,
|
||||
}
|
||||
}
|
||||
|
||||
req = ucp_atomic_fetch_nb(ep, opcode, value, result_addr,
|
||||
dt_bytes, remote_addr, rkey, req_completion);
|
||||
if (UCS_PTR_IS_PTR(req)) {
|
||||
ucp_request_release(req);
|
||||
}
|
||||
|
||||
ret = incr_and_check_ops_num(module, target, ep);
|
||||
ret = opal_common_ucx_wpmem_fetch(module->mem, opcode, value, target,
|
||||
(void *)origin_addr, dt_bytes, remote_addr);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return end_atomicity(module, ep, target);
|
||||
return end_atomicity(module, target);
|
||||
} else {
|
||||
return ompi_osc_ucx_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt,
|
||||
target, target_disp, 1, dt, op, win);
|
||||
@ -730,7 +651,6 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
|
||||
int target_count, struct ompi_datatype_t *target_dt,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = check_sync_state(module, target, false);
|
||||
@ -738,7 +658,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = start_atomicity(module, ep, target);
|
||||
ret = start_atomicity(module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -786,7 +706,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -839,7 +759,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -848,9 +768,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count,
|
||||
}
|
||||
}
|
||||
|
||||
ret = end_atomicity(module, ep, target);
|
||||
|
||||
return ret;
|
||||
return end_atomicity(module, target);
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_rput(const void *origin_addr, int origin_count,
|
||||
@ -859,12 +777,8 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *target_dt,
|
||||
struct ompi_win_t *win, struct ompi_request_t **request) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET;
|
||||
ucp_rkey_h rkey;
|
||||
uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ompi_osc_ucx_request_t *ucx_req = NULL;
|
||||
ompi_osc_ucx_internal_request_t *internal_req = NULL;
|
||||
ucs_status_t status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = check_sync_state(module, target, true);
|
||||
@ -873,16 +787,12 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count,
|
||||
}
|
||||
|
||||
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
status = get_dynamic_win_info(remote_addr, module, ep, target);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = get_dynamic_win_info(remote_addr, module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_VALID_RKEY(module, target, target_count);
|
||||
|
||||
rkey = (module->win_info_array[target]).rkey;
|
||||
|
||||
OMPI_OSC_UCX_REQUEST_ALLOC(win, ucx_req);
|
||||
assert(NULL != ucx_req);
|
||||
|
||||
@ -892,26 +802,24 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status);
|
||||
ret = opal_common_ucx_wpmem_fence(module->mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fence failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
internal_req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_FADD, 0,
|
||||
&(module->req_result), sizeof(uint64_t),
|
||||
remote_addr, rkey, req_completion);
|
||||
|
||||
if (UCS_PTR_IS_PTR(internal_req)) {
|
||||
internal_req->external_req = ucx_req;
|
||||
mca_osc_ucx_component.num_incomplete_req_ops++;
|
||||
} else {
|
||||
ompi_request_complete(&ucx_req->super, true);
|
||||
mca_osc_ucx_component.num_incomplete_req_ops++;
|
||||
ret = opal_common_ucx_wpmem_fetch_nb(module->mem, UCP_ATOMIC_FETCH_OP_FADD,
|
||||
0, target, &(module->req_result),
|
||||
sizeof(uint64_t), remote_addr,
|
||||
req_completion, ucx_req);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
*request = &ucx_req->super;
|
||||
|
||||
return incr_and_check_ops_num(module, target, ep);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_rget(void *origin_addr, int origin_count,
|
||||
@ -920,12 +828,8 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
|
||||
struct ompi_request_t **request) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET;
|
||||
ucp_rkey_h rkey;
|
||||
uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target);
|
||||
ompi_osc_ucx_request_t *ucx_req = NULL;
|
||||
ompi_osc_ucx_internal_request_t *internal_req = NULL;
|
||||
ucs_status_t status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = check_sync_state(module, target, true);
|
||||
@ -934,16 +838,12 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count,
|
||||
}
|
||||
|
||||
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
status = get_dynamic_win_info(remote_addr, module, ep, target);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = get_dynamic_win_info(remote_addr, module, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK_VALID_RKEY(module, target, target_count);
|
||||
|
||||
rkey = (module->win_info_array[target]).rkey;
|
||||
|
||||
OMPI_OSC_UCX_REQUEST_ALLOC(win, ucx_req);
|
||||
assert(NULL != ucx_req);
|
||||
|
||||
@ -953,26 +853,24 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count,
|
||||
return ret;
|
||||
}
|
||||
|
||||
status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status);
|
||||
ret = opal_common_ucx_wpmem_fence(module->mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fence failed: %d", ret);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
internal_req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_FADD, 0,
|
||||
&(module->req_result), sizeof(uint64_t),
|
||||
remote_addr, rkey, req_completion);
|
||||
|
||||
if (UCS_PTR_IS_PTR(internal_req)) {
|
||||
internal_req->external_req = ucx_req;
|
||||
mca_osc_ucx_component.num_incomplete_req_ops++;
|
||||
} else {
|
||||
ompi_request_complete(&ucx_req->super, true);
|
||||
mca_osc_ucx_component.num_incomplete_req_ops++;
|
||||
ret = opal_common_ucx_wpmem_fetch_nb(module->mem, UCP_ATOMIC_FETCH_OP_FADD,
|
||||
0, target, &(module->req_result),
|
||||
sizeof(uint64_t), remote_addr,
|
||||
req_completion, ucx_req);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
*request = &ucx_req->super;
|
||||
|
||||
return incr_and_check_ops_num(module, target, ep);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count,
|
||||
|
@ -56,8 +56,7 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = {
|
||||
.osc_select = component_select,
|
||||
.osc_finalize = component_finalize,
|
||||
},
|
||||
.ucp_context = NULL,
|
||||
.ucp_worker = NULL,
|
||||
.wpool = NULL,
|
||||
.env_initialized = false,
|
||||
.num_incomplete_req_ops = 0,
|
||||
.num_modules = 0
|
||||
@ -129,37 +128,22 @@ static int component_register(void) {
|
||||
}
|
||||
|
||||
static int progress_callback(void) {
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
if (mca_osc_ucx_component.wpool != NULL) {
|
||||
opal_common_ucx_wpool_progress(mca_osc_ucx_component.wpool);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
|
||||
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
|
||||
|
||||
mca_osc_ucx_component.wpool = opal_common_ucx_wpool_allocate();
|
||||
opal_common_ucx_mca_register();
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int component_finalize(void) {
|
||||
int i;
|
||||
for (i = 0; i < ompi_proc_world_size(); i++) {
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(&(ompi_mpi_comm_world.comm), i);
|
||||
if (ep != NULL) {
|
||||
ucp_ep_destroy(ep);
|
||||
}
|
||||
}
|
||||
|
||||
if (mca_osc_ucx_component.ucp_worker != NULL) {
|
||||
ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);
|
||||
}
|
||||
|
||||
assert(mca_osc_ucx_component.num_incomplete_req_ops == 0);
|
||||
if (mca_osc_ucx_component.env_initialized == true) {
|
||||
OBJ_DESTRUCT(&mca_osc_ucx_component.requests);
|
||||
ucp_cleanup(mca_osc_ucx_component.ucp_context);
|
||||
mca_osc_ucx_component.env_initialized = false;
|
||||
}
|
||||
opal_common_ucx_mca_deregister();
|
||||
opal_common_ucx_wpool_free(mca_osc_ucx_component.wpool);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -169,9 +153,11 @@ static int component_query(struct ompi_win_t *win, void **base, size_t size, int
|
||||
return mca_osc_ucx_component.priority;
|
||||
}
|
||||
|
||||
static inline int allgather_len_and_info(void *my_info, int my_info_len, char **recv_info,
|
||||
int *disps, struct ompi_communicator_t *comm) {
|
||||
static int exchange_len_info(void *my_info, size_t my_info_len, char **recv_info_ptr,
|
||||
int **disps_ptr, void *metadata)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
struct ompi_communicator_t *comm = (struct ompi_communicator_t *)metadata;
|
||||
int comm_size = ompi_comm_size(comm);
|
||||
int lens[comm_size];
|
||||
int total_len, i;
|
||||
@ -184,15 +170,15 @@ static inline int allgather_len_and_info(void *my_info, int my_info_len, char **
|
||||
}
|
||||
|
||||
total_len = 0;
|
||||
(*disps_ptr) = (int *)calloc(comm_size, sizeof(int));
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
disps[i] = total_len;
|
||||
(*disps_ptr)[i] = total_len;
|
||||
total_len += lens[i];
|
||||
}
|
||||
|
||||
(*recv_info) = (char *)malloc(total_len);
|
||||
|
||||
(*recv_info_ptr) = (char *)calloc(total_len, sizeof(char));
|
||||
ret = comm->c_coll->coll_allgatherv(my_info, my_info_len, MPI_BYTE,
|
||||
(void *)(*recv_info), lens, disps, MPI_BYTE,
|
||||
(void *)(*recv_info_ptr), lens, (*disps_ptr), MPI_BYTE,
|
||||
comm, comm->c_coll->coll_allgatherv_module);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
@ -201,60 +187,6 @@ static inline int allgather_len_and_info(void *my_info, int my_info_len, char **
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int mem_map(void **base, size_t size, ucp_mem_h *memh_ptr,
|
||||
ompi_osc_ucx_module_t *module, int flavor) {
|
||||
ucp_mem_map_params_t mem_params;
|
||||
ucp_mem_attr_t mem_attrs;
|
||||
ucs_status_t status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
if (!(flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE)
|
||||
|| size == 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
memset(&mem_params, 0, sizeof(ucp_mem_map_params_t));
|
||||
mem_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS |
|
||||
UCP_MEM_MAP_PARAM_FIELD_LENGTH |
|
||||
UCP_MEM_MAP_PARAM_FIELD_FLAGS;
|
||||
mem_params.length = size;
|
||||
if (flavor == MPI_WIN_FLAVOR_ALLOCATE) {
|
||||
mem_params.address = NULL;
|
||||
mem_params.flags = UCP_MEM_MAP_ALLOCATE;
|
||||
} else {
|
||||
mem_params.address = (*base);
|
||||
}
|
||||
|
||||
/* memory map */
|
||||
|
||||
status = ucp_mem_map(mca_osc_ucx_component.ucp_context, &mem_params, memh_ptr);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_mem_map failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
|
||||
mem_attrs.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH;
|
||||
status = ucp_mem_query((*memh_ptr), &mem_attrs);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_mem_query failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
|
||||
assert(mem_attrs.length >= size);
|
||||
if (flavor == MPI_WIN_FLAVOR_CREATE) {
|
||||
assert(mem_attrs.address == (*base));
|
||||
} else {
|
||||
(*base) = mem_attrs.address;
|
||||
}
|
||||
|
||||
return ret;
|
||||
error:
|
||||
ucp_mem_unmap(mca_osc_ucx_component.ucp_context, (*memh_ptr));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void ompi_osc_ucx_unregister_progress()
|
||||
{
|
||||
int ret;
|
||||
@ -276,23 +208,16 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
char *name = NULL;
|
||||
long values[2];
|
||||
int ret = OMPI_SUCCESS;
|
||||
ucs_status_t status;
|
||||
//ucs_status_t status;
|
||||
int i, comm_size = ompi_comm_size(comm);
|
||||
int is_eps_ready;
|
||||
bool eps_created = false, env_initialized = false;
|
||||
ucp_address_t *my_addr = NULL;
|
||||
size_t my_addr_len;
|
||||
char *recv_buf = NULL;
|
||||
void *rkey_buffer = NULL, *state_rkey_buffer = NULL;
|
||||
size_t rkey_buffer_size, state_rkey_buffer_size;
|
||||
bool env_initialized = false;
|
||||
void *state_base = NULL;
|
||||
void * my_info = NULL;
|
||||
size_t my_info_len;
|
||||
int disps[comm_size];
|
||||
int rkey_sizes[comm_size];
|
||||
opal_common_ucx_mem_type_t mem_type;
|
||||
uint64_t zero = 0;
|
||||
size_t info_offset;
|
||||
uint64_t size_u64;
|
||||
char *my_mem_addr;
|
||||
int my_mem_addr_size;
|
||||
void * my_info = NULL;
|
||||
char *recv_buf = NULL;
|
||||
|
||||
/* the osc/sm component is the exclusive provider for support for
|
||||
* shared memory windows */
|
||||
@ -301,16 +226,6 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
}
|
||||
|
||||
if (mca_osc_ucx_component.env_initialized == false) {
|
||||
ucp_config_t *config = NULL;
|
||||
ucp_params_t context_params;
|
||||
ucp_worker_params_t worker_params;
|
||||
ucp_worker_attr_t worker_attr;
|
||||
|
||||
status = ucp_config_read("MPI", NULL, &config);
|
||||
if (UCS_OK != status) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_config_read failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&mca_osc_ucx_component.requests, opal_free_list_t);
|
||||
ret = opal_free_list_init (&mca_osc_ucx_component.requests,
|
||||
@ -323,57 +238,14 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* initialize UCP context */
|
||||
|
||||
memset(&context_params, 0, sizeof(context_params));
|
||||
context_params.field_mask = UCP_PARAM_FIELD_FEATURES |
|
||||
UCP_PARAM_FIELD_MT_WORKERS_SHARED |
|
||||
UCP_PARAM_FIELD_ESTIMATED_NUM_EPS |
|
||||
UCP_PARAM_FIELD_REQUEST_INIT |
|
||||
UCP_PARAM_FIELD_REQUEST_SIZE;
|
||||
context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64;
|
||||
context_params.mt_workers_shared = 0;
|
||||
context_params.estimated_num_eps = ompi_proc_world_size();
|
||||
context_params.request_init = internal_req_init;
|
||||
context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t);
|
||||
|
||||
status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context);
|
||||
ucp_config_release(config);
|
||||
if (UCS_OK != status) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
ret = opal_common_ucx_wpool_init(mca_osc_ucx_component.wpool,
|
||||
ompi_proc_world_size(),
|
||||
mca_osc_ucx_component.enable_mpi_threads);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_wpool_init failed: %d", ret);
|
||||
goto error;
|
||||
}
|
||||
|
||||
assert(mca_osc_ucx_component.ucp_worker == NULL);
|
||||
memset(&worker_params, 0, sizeof(worker_params));
|
||||
worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE;
|
||||
worker_params.thread_mode = (mca_osc_ucx_component.enable_mpi_threads == true)
|
||||
? UCS_THREAD_MODE_MULTI : UCS_THREAD_MODE_SINGLE;
|
||||
status = ucp_worker_create(mca_osc_ucx_component.ucp_context, &worker_params,
|
||||
&(mca_osc_ucx_component.ucp_worker));
|
||||
if (UCS_OK != status) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_worker_create failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error_nomem;
|
||||
}
|
||||
|
||||
/* query UCP worker attributes */
|
||||
worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE;
|
||||
status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr);
|
||||
if (UCS_OK != status) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_worker_query failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error_nomem;
|
||||
}
|
||||
|
||||
if (mca_osc_ucx_component.enable_mpi_threads == true &&
|
||||
worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) {
|
||||
OSC_UCX_VERBOSE(1, "ucx does not support multithreading");
|
||||
ret = OMPI_ERROR;
|
||||
goto error_nomem;
|
||||
}
|
||||
|
||||
mca_osc_ucx_component.env_initialized = true;
|
||||
env_initialized = true;
|
||||
}
|
||||
@ -434,187 +306,76 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
}
|
||||
}
|
||||
|
||||
/* exchange endpoints if necessary */
|
||||
is_eps_ready = 1;
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
if (OSC_UCX_GET_EP(module->comm, i) == NULL) {
|
||||
is_eps_ready = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, &is_eps_ready, 1, MPI_INT,
|
||||
MPI_LAND,
|
||||
module->comm,
|
||||
module->comm->c_coll->coll_allreduce_module);
|
||||
ret = opal_common_ucx_wpctx_create(mca_osc_ucx_component.wpool, comm_size,
|
||||
&exchange_len_info, (void *)module->comm,
|
||||
&module->ctx);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (!is_eps_ready) {
|
||||
status = ucp_worker_get_address(mca_osc_ucx_component.ucp_worker,
|
||||
&my_addr, &my_addr_len);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_worker_get_address failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
if (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE) {
|
||||
switch (flavor) {
|
||||
case MPI_WIN_FLAVOR_ALLOCATE:
|
||||
mem_type = OPAL_COMMON_UCX_MEM_ALLOCATE_MAP;
|
||||
break;
|
||||
case MPI_WIN_FLAVOR_CREATE:
|
||||
mem_type = OPAL_COMMON_UCX_MEM_MAP;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = allgather_len_and_info(my_addr, (int)my_addr_len,
|
||||
&recv_buf, disps, module->comm);
|
||||
ret = opal_common_ucx_wpmem_create(module->ctx, base, size,
|
||||
mem_type, &exchange_len_info,
|
||||
(void *)module->comm,
|
||||
&my_mem_addr, &my_mem_addr_size,
|
||||
&module->mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
if (OSC_UCX_GET_EP(module->comm, i) == NULL) {
|
||||
ucp_ep_params_t ep_params;
|
||||
ucp_ep_h ep;
|
||||
memset(&ep_params, 0, sizeof(ucp_ep_params_t));
|
||||
ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
|
||||
ep_params.address = (ucp_address_t *)&(recv_buf[disps[i]]);
|
||||
status = ucp_ep_create(mca_osc_ucx_component.ucp_worker, &ep_params, &ep);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
|
||||
ompi_comm_peer_lookup(module->comm, i)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_UCX] = ep;
|
||||
}
|
||||
}
|
||||
|
||||
ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr);
|
||||
my_addr = NULL;
|
||||
free(recv_buf);
|
||||
recv_buf = NULL;
|
||||
|
||||
eps_created = true;
|
||||
}
|
||||
|
||||
ret = mem_map(base, size, &(module->memh), module, flavor);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
state_base = (void *)&(module->state);
|
||||
ret = mem_map(&state_base, sizeof(ompi_osc_ucx_state_t), &(module->state_memh),
|
||||
module, MPI_WIN_FLAVOR_CREATE);
|
||||
ret = opal_common_ucx_wpmem_create(module->ctx, &state_base,
|
||||
sizeof(ompi_osc_ucx_state_t),
|
||||
OPAL_COMMON_UCX_MEM_MAP, &exchange_len_info,
|
||||
(void *)module->comm,
|
||||
&my_mem_addr, &my_mem_addr_size,
|
||||
&module->state_mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
module->win_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t));
|
||||
if (module->win_info_array == NULL) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
|
||||
module->state_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t));
|
||||
if (module->state_info_array == NULL) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (size > 0 && (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE)) {
|
||||
status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->memh,
|
||||
&rkey_buffer, &rkey_buffer_size);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
rkey_buffer_size = 0;
|
||||
}
|
||||
|
||||
status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->state_memh,
|
||||
&state_rkey_buffer, &state_rkey_buffer_size);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
|
||||
size_u64 = (uint64_t)size;
|
||||
my_info_len = 3 * sizeof(uint64_t) + rkey_buffer_size + state_rkey_buffer_size;
|
||||
my_info = malloc(my_info_len);
|
||||
/* exchange window addrs */
|
||||
my_info = malloc(2 * sizeof(uint64_t));
|
||||
if (my_info == NULL) {
|
||||
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
|
||||
info_offset = 0;
|
||||
|
||||
if (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE) {
|
||||
memcpy_off(my_info, base, sizeof(uint64_t), info_offset);
|
||||
memcpy(my_info, base, sizeof(uint64_t));
|
||||
} else {
|
||||
memcpy_off(my_info, &zero, sizeof(uint64_t), info_offset);
|
||||
memcpy(my_info, &zero, sizeof(uint64_t));
|
||||
}
|
||||
memcpy_off(my_info, &state_base, sizeof(uint64_t), info_offset);
|
||||
memcpy_off(my_info, &size_u64, sizeof(uint64_t), info_offset);
|
||||
memcpy_off(my_info, rkey_buffer, rkey_buffer_size, info_offset);
|
||||
memcpy_off(my_info, state_rkey_buffer, state_rkey_buffer_size, info_offset);
|
||||
memcpy((char*)my_info + sizeof(uint64_t), &state_base, sizeof(uint64_t));
|
||||
|
||||
assert(my_info_len == info_offset);
|
||||
|
||||
ret = allgather_len_and_info(my_info, (int)my_info_len, &recv_buf, disps, module->comm);
|
||||
recv_buf = (char *)calloc(comm_size, 2 * sizeof(uint64_t));
|
||||
ret = comm->c_coll->coll_allgather((void *)my_info, 2 * sizeof(uint64_t),
|
||||
MPI_BYTE, recv_buf, 2 * sizeof(uint64_t),
|
||||
MPI_BYTE, comm, comm->c_coll->coll_allgather_module);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
ret = comm->c_coll->coll_allgather((void *)&rkey_buffer_size, 1, MPI_INT,
|
||||
rkey_sizes, 1, MPI_INT, comm,
|
||||
comm->c_coll->coll_allgather_module);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
module->addrs = calloc(comm_size, sizeof(uint64_t));
|
||||
module->state_addrs = calloc(comm_size, sizeof(uint64_t));
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i);
|
||||
uint64_t dest_size;
|
||||
assert(ep != NULL);
|
||||
|
||||
info_offset = disps[i];
|
||||
|
||||
memcpy(&(module->win_info_array[i]).addr, &recv_buf[info_offset], sizeof(uint64_t));
|
||||
info_offset += sizeof(uint64_t);
|
||||
memcpy(&(module->state_info_array[i]).addr, &recv_buf[info_offset], sizeof(uint64_t));
|
||||
info_offset += sizeof(uint64_t);
|
||||
memcpy(&dest_size, &recv_buf[info_offset], sizeof(uint64_t));
|
||||
info_offset += sizeof(uint64_t);
|
||||
|
||||
(module->win_info_array[i]).rkey_init = false;
|
||||
if (dest_size > 0 && (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE)) {
|
||||
status = ucp_ep_rkey_unpack(ep, &recv_buf[info_offset],
|
||||
&((module->win_info_array[i]).rkey));
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
info_offset += rkey_sizes[i];
|
||||
(module->win_info_array[i]).rkey_init = true;
|
||||
}
|
||||
|
||||
status = ucp_ep_rkey_unpack(ep, &recv_buf[info_offset],
|
||||
&((module->state_info_array[i]).rkey));
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
}
|
||||
(module->state_info_array[i]).rkey_init = true;
|
||||
memcpy(&(module->addrs[i]), recv_buf + i * 2 * sizeof(uint64_t), sizeof(uint64_t));
|
||||
memcpy(&(module->state_addrs[i]), recv_buf + i * 2 * sizeof(uint64_t) + sizeof(uint64_t), sizeof(uint64_t));
|
||||
}
|
||||
|
||||
free(my_info);
|
||||
free(recv_buf);
|
||||
|
||||
if (rkey_buffer_size != 0) {
|
||||
ucp_rkey_buffer_release(rkey_buffer);
|
||||
}
|
||||
ucp_rkey_buffer_release(state_rkey_buffer);
|
||||
|
||||
/* init window state */
|
||||
module->state.lock = TARGET_LOCK_UNLOCKED;
|
||||
module->state.post_index = 0;
|
||||
memset((void *)module->state.post_state, 0, sizeof(uint64_t) * OMPI_OSC_UCX_POST_PEER_MAX);
|
||||
@ -633,8 +394,6 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
module->post_group = NULL;
|
||||
OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t);
|
||||
OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
|
||||
module->global_ops_num = 0;
|
||||
module->per_target_ops_nums = calloc(comm_size, sizeof(int));
|
||||
module->start_grp_ranks = NULL;
|
||||
module->lock_all_is_nocheck = false;
|
||||
|
||||
@ -664,30 +423,8 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
return ret;
|
||||
|
||||
error:
|
||||
if (my_addr) ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr);
|
||||
if (recv_buf) free(recv_buf);
|
||||
if (my_info) free(my_info);
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
if ((module->win_info_array[i]).rkey != NULL) {
|
||||
ucp_rkey_destroy((module->win_info_array[i]).rkey);
|
||||
}
|
||||
if ((module->state_info_array[i]).rkey != NULL) {
|
||||
ucp_rkey_destroy((module->state_info_array[i]).rkey);
|
||||
}
|
||||
}
|
||||
if (rkey_buffer) ucp_rkey_buffer_release(rkey_buffer);
|
||||
if (state_rkey_buffer) ucp_rkey_buffer_release(state_rkey_buffer);
|
||||
if (module->win_info_array) free(module->win_info_array);
|
||||
if (module->state_info_array) free(module->state_info_array);
|
||||
if (module->disp_units) free(module->disp_units);
|
||||
if (module->comm) ompi_comm_free(&module->comm);
|
||||
if (module->per_target_ops_nums) free(module->per_target_ops_nums);
|
||||
if (eps_created) {
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i);
|
||||
ucp_ep_destroy(ep);
|
||||
}
|
||||
}
|
||||
if (module) {
|
||||
free(module);
|
||||
ompi_osc_ucx_unregister_progress();
|
||||
@ -695,9 +432,8 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
|
||||
|
||||
error_nomem:
|
||||
if (env_initialized == true) {
|
||||
opal_common_ucx_wpool_finalize(mca_osc_ucx_component.wpool);
|
||||
OBJ_DESTRUCT(&mca_osc_ucx_component.requests);
|
||||
ucp_worker_destroy(mca_osc_ucx_component.ucp_worker);
|
||||
ucp_cleanup(mca_osc_ucx_component.ucp_context);
|
||||
mca_osc_ucx_component.env_initialized = false;
|
||||
}
|
||||
return ret;
|
||||
@ -727,10 +463,7 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_
|
||||
int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
int insert_index = -1, contain_index;
|
||||
void *rkey_buffer;
|
||||
size_t rkey_buffer_size;
|
||||
int ret = OMPI_SUCCESS;
|
||||
ucs_status_t status;
|
||||
|
||||
if (module->state.dynamic_win_count >= OMPI_OSC_UCX_ATTACH_MAX) {
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
@ -757,8 +490,12 @@ int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) {
|
||||
insert_index = 0;
|
||||
}
|
||||
|
||||
ret = mem_map(&base, len, &(module->local_dynamic_win_info[insert_index].memh),
|
||||
module, MPI_WIN_FLAVOR_CREATE);
|
||||
ret = opal_common_ucx_wpmem_create(module->ctx, &base, len,
|
||||
OPAL_COMMON_UCX_MEM_MAP, &exchange_len_info,
|
||||
(void *)module->comm,
|
||||
&(module->local_dynamic_win_info[insert_index].my_mem_addr),
|
||||
&(module->local_dynamic_win_info[insert_index].my_mem_addr_size),
|
||||
&(module->local_dynamic_win_info[insert_index].mem));
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
@ -766,29 +503,20 @@ int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) {
|
||||
module->state.dynamic_wins[insert_index].base = (uint64_t)base;
|
||||
module->state.dynamic_wins[insert_index].size = len;
|
||||
|
||||
status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context,
|
||||
module->local_dynamic_win_info[insert_index].memh,
|
||||
&rkey_buffer, (size_t *)&rkey_buffer_size);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
assert(rkey_buffer_size <= OMPI_OSC_UCX_RKEY_BUF_MAX);
|
||||
memcpy((char *)(module->state.dynamic_wins[insert_index].rkey_buffer),
|
||||
(char *)rkey_buffer, rkey_buffer_size);
|
||||
memcpy((char *)(module->state.dynamic_wins[insert_index].mem_addr),
|
||||
(char *)module->local_dynamic_win_info[insert_index].my_mem_addr,
|
||||
module->local_dynamic_win_info[insert_index].my_mem_addr_size);
|
||||
|
||||
module->local_dynamic_win_info[insert_index].refcnt++;
|
||||
module->state.dynamic_win_count++;
|
||||
|
||||
ucp_rkey_buffer_release(rkey_buffer);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
int insert, contain;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
assert(module->state.dynamic_win_count > 0);
|
||||
|
||||
@ -804,8 +532,7 @@ int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) {
|
||||
|
||||
module->local_dynamic_win_info[contain].refcnt--;
|
||||
if (module->local_dynamic_win_info[contain].refcnt == 0) {
|
||||
ucp_mem_unmap(mca_osc_ucx_component.ucp_context,
|
||||
module->local_dynamic_win_info[contain].memh);
|
||||
ret = opal_common_ucx_wpmem_free(module->local_dynamic_win_info[contain].mem);
|
||||
memmove((void *)&(module->local_dynamic_win_info[contain]),
|
||||
(void *)&(module->local_dynamic_win_info[contain+1]),
|
||||
(OMPI_OSC_UCX_ATTACH_MAX - (contain + 1)) * sizeof(ompi_osc_local_dynamic_win_info_t));
|
||||
@ -816,49 +543,42 @@ int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) {
|
||||
module->state.dynamic_win_count--;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_free(struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
int i, ret;
|
||||
int ret;
|
||||
|
||||
assert(module->global_ops_num == 0);
|
||||
assert(module->lock_count == 0);
|
||||
assert(opal_list_is_empty(&module->pending_posts) == true);
|
||||
OBJ_DESTRUCT(&module->outstanding_locks);
|
||||
OBJ_DESTRUCT(&module->pending_posts);
|
||||
|
||||
while (module->state.lock != TARGET_LOCK_UNLOCKED) {
|
||||
/* not sure if this is required */
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", ret);
|
||||
}
|
||||
opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0);
|
||||
|
||||
ret = module->comm->c_coll->coll_barrier(module->comm,
|
||||
module->comm->c_coll->coll_barrier_module);
|
||||
|
||||
for (i = 0; i < ompi_comm_size(module->comm); i++) {
|
||||
if ((module->win_info_array[i]).rkey_init == true) {
|
||||
ucp_rkey_destroy((module->win_info_array[i]).rkey);
|
||||
(module->win_info_array[i]).rkey_init = false;
|
||||
}
|
||||
ucp_rkey_destroy((module->state_info_array[i]).rkey);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
free(module->win_info_array);
|
||||
free(module->state_info_array);
|
||||
|
||||
free(module->per_target_ops_nums);
|
||||
free(module->addrs);
|
||||
free(module->state_addrs);
|
||||
|
||||
if ((module->flavor == MPI_WIN_FLAVOR_ALLOCATE || module->flavor == MPI_WIN_FLAVOR_CREATE)
|
||||
&& module->size > 0) {
|
||||
ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->memh);
|
||||
ret = opal_common_ucx_wpmem_free(module->state_mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->state_memh);
|
||||
|
||||
ret = opal_common_ucx_wpmem_free(module->mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
opal_common_ucx_wpctx_release(module->ctx);
|
||||
|
||||
opal_common_ucx_wpool_finalize(mca_osc_ucx_component.wpool);
|
||||
|
||||
if (module->disp_units) free(module->disp_units);
|
||||
ompi_comm_free(&module->comm);
|
||||
|
@ -20,88 +20,73 @@ OBJ_CLASS_INSTANCE(ompi_osc_ucx_lock_t, opal_object_t, NULL, NULL);
|
||||
|
||||
static inline int start_shared(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t result_value = -1;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
ucs_status_t status;
|
||||
int ret;
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
while (true) {
|
||||
ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_FADD, 1,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD, 1,
|
||||
target, &result_value, sizeof(result_value),
|
||||
remote_addr);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
assert((int64_t)result_value >= 0);
|
||||
if (result_value >= TARGET_LOCK_EXCLUSIVE) {
|
||||
status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t),
|
||||
remote_addr, rkey);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_atomic_add64 failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
ret = opal_common_ucx_wpmem_post(module->state_mem,
|
||||
UCP_ATOMIC_POST_OP_ADD, (-1), target,
|
||||
sizeof(uint64_t), remote_addr);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int end_shared(ompi_osc_ucx_module_t *module, int target) {
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
ucs_status_t status;
|
||||
|
||||
status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t),
|
||||
remote_addr, rkey);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_atomic_post(OP_ADD) failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
return opal_common_ucx_wpmem_post(module->state_mem, UCP_ATOMIC_POST_OP_ADD,
|
||||
(-1), target, sizeof(uint64_t), remote_addr);
|
||||
}
|
||||
|
||||
static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t result_value = -1;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
ucs_status_t status;
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
while (result_value != TARGET_LOCK_UNLOCKED) {
|
||||
status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey,
|
||||
mca_osc_ucx_component.ucp_worker);
|
||||
if (status != UCS_OK) {
|
||||
return OMPI_ERROR;
|
||||
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem,
|
||||
TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE,
|
||||
target, &result_value, sizeof(result_value),
|
||||
remote_addr);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) {
|
||||
uint64_t result_value = 0;
|
||||
ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
ucp_rkey_h rkey = (module->state_info_array)[target].rkey;
|
||||
uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
int ret;
|
||||
uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED,
|
||||
&result_value, sizeof(result_value),
|
||||
remote_addr, rkey, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_fetch(module->state_mem,
|
||||
UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED,
|
||||
target, &result_value, sizeof(result_value),
|
||||
remote_addr);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
assert(result_value >= TARGET_LOCK_EXCLUSIVE);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_lock(int lock_type, int target, int assert, struct ompi_win_t *win) {
|
||||
@ -158,7 +143,6 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module;
|
||||
ompi_osc_ucx_lock_t *lock = NULL;
|
||||
int ret = OMPI_SUCCESS;
|
||||
ucp_ep_h ep;
|
||||
|
||||
if (module->epoch_type.access != PASSIVE_EPOCH) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
@ -172,15 +156,11 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) {
|
||||
opal_hash_table_remove_value_uint32(&module->outstanding_locks,
|
||||
(uint32_t)target);
|
||||
|
||||
ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
module->global_ops_num -= module->per_target_ops_nums[target];
|
||||
module->per_target_ops_nums[target] = 0;
|
||||
|
||||
if (lock->is_nocheck == false) {
|
||||
if (lock->type == LOCK_EXCLUSIVE) {
|
||||
ret = end_exclusive(module, target);
|
||||
@ -195,7 +175,6 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) {
|
||||
assert(module->lock_count >= 0);
|
||||
if (module->lock_count == 0) {
|
||||
module->epoch_type.access = NONE_EPOCH;
|
||||
assert(module->global_ops_num == 0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -244,14 +223,11 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) {
|
||||
|
||||
assert(module->lock_count == 0);
|
||||
|
||||
ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
module->global_ops_num = 0;
|
||||
memset(module->per_target_ops_nums, 0, sizeof(int) * comm_size);
|
||||
|
||||
if (!module->lock_all_is_nocheck) {
|
||||
int i;
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
@ -266,7 +242,7 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) {
|
||||
|
||||
int ompi_osc_ucx_sync(struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module;
|
||||
ucs_status_t status;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
if (module->epoch_type.access != PASSIVE_EPOCH &&
|
||||
module->epoch_type.access != PASSIVE_ALL_EPOCH) {
|
||||
@ -275,55 +251,45 @@ int ompi_osc_ucx_sync(struct ompi_win_t *win) {
|
||||
|
||||
opal_atomic_mb();
|
||||
|
||||
status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker);
|
||||
if (status != UCS_OK) {
|
||||
OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status);
|
||||
return OMPI_ERROR;
|
||||
ret = opal_common_ucx_wpmem_fence(module->mem);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fence failed: %d", ret);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_flush(int target, struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module;
|
||||
ucp_ep_h ep;
|
||||
int ret;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
if (module->epoch_type.access != PASSIVE_EPOCH &&
|
||||
module->epoch_type.access != PASSIVE_ALL_EPOCH) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
ep = OSC_UCX_GET_EP(module->comm, target);
|
||||
ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
module->global_ops_num -= module->per_target_ops_nums[target];
|
||||
module->per_target_ops_nums[target] = 0;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_ucx_flush_all(struct ompi_win_t *win) {
|
||||
ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module;
|
||||
int ret;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
if (module->epoch_type.access != PASSIVE_EPOCH &&
|
||||
module->epoch_type.access != PASSIVE_ALL_EPOCH) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker);
|
||||
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0);
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
module->global_ops_num = 0;
|
||||
memset(module->per_target_ops_nums, 0,
|
||||
sizeof(int) * ompi_comm_size(module->comm));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -45,20 +45,11 @@ static void request_construct(ompi_osc_ucx_request_t *request)
|
||||
request->super.req_cancel = request_cancel;
|
||||
}
|
||||
|
||||
void internal_req_init(void *request) {
|
||||
ompi_osc_ucx_internal_request_t *req = (ompi_osc_ucx_internal_request_t *)request;
|
||||
req->external_req = NULL;
|
||||
}
|
||||
|
||||
void req_completion(void *request, ucs_status_t status) {
|
||||
ompi_osc_ucx_internal_request_t *req = (ompi_osc_ucx_internal_request_t *)request;
|
||||
|
||||
if(req->external_req != NULL) {
|
||||
ompi_request_complete(&(req->external_req->super), true);
|
||||
ucp_request_release(req);
|
||||
mca_osc_ucx_component.num_incomplete_req_ops--;
|
||||
assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0);
|
||||
}
|
||||
void req_completion(void *request) {
|
||||
ompi_osc_ucx_request_t *req = (ompi_osc_ucx_request_t *)request;
|
||||
ompi_request_complete(&(req->super), true);
|
||||
mca_osc_ucx_component.num_incomplete_req_ops--;
|
||||
assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0);
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(ompi_osc_ucx_request_t, ompi_request_t,
|
||||
|
@ -22,19 +22,14 @@ typedef struct ompi_osc_ucx_request {
|
||||
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_ucx_request_t);
|
||||
|
||||
typedef struct ompi_osc_ucx_internal_request {
|
||||
ompi_osc_ucx_request_t *external_req;
|
||||
} ompi_osc_ucx_internal_request_t;
|
||||
|
||||
#define OMPI_OSC_UCX_REQUEST_ALLOC(win, req) \
|
||||
do { \
|
||||
opal_free_list_item_t *item; \
|
||||
do { \
|
||||
item = opal_free_list_get(&mca_osc_ucx_component.requests); \
|
||||
if (item == NULL) { \
|
||||
if (mca_osc_ucx_component.ucp_worker != NULL && \
|
||||
mca_osc_ucx_component.num_incomplete_req_ops > 0) { \
|
||||
ucp_worker_progress(mca_osc_ucx_component.ucp_worker); \
|
||||
if (mca_osc_ucx_component.num_incomplete_req_ops > 0) { \
|
||||
opal_common_ucx_wpool_progress(mca_osc_ucx_component.wpool); \
|
||||
} \
|
||||
} \
|
||||
} while (item == NULL); \
|
||||
@ -53,4 +48,6 @@ typedef struct ompi_osc_ucx_internal_request {
|
||||
(opal_free_list_item_t*) req); \
|
||||
} while (0)
|
||||
|
||||
void req_completion(void *request);
|
||||
|
||||
#endif /* OMPI_OSC_UCX_REQUEST_H */
|
||||
|
@ -12,13 +12,16 @@
|
||||
|
||||
# Header files
|
||||
|
||||
headers = \
|
||||
common_ucx.h
|
||||
headers = \
|
||||
common_ucx.h \
|
||||
common_ucx_wpool.h \
|
||||
common_ucx_wpool_int.h
|
||||
|
||||
# Source files
|
||||
|
||||
sources = \
|
||||
common_ucx.c
|
||||
common_ucx.c \
|
||||
common_ucx_wpool.c
|
||||
|
||||
# Help file
|
||||
|
||||
|
@ -36,6 +36,9 @@ BEGIN_C_DECLS
|
||||
# define MCA_COMMON_UCX_ASSERT(_x)
|
||||
#endif
|
||||
|
||||
#define MCA_COMMON_UCX_PER_TARGET_OPS_THRESHOLD 1000
|
||||
#define MCA_COMMON_UCX_GLOBAL_OPS_THRESHOLD 1000
|
||||
|
||||
#define _MCA_COMMON_UCX_QUOTE(_x) \
|
||||
# _x
|
||||
#define MCA_COMMON_UCX_QUOTE(_x) \
|
||||
@ -178,6 +181,17 @@ int opal_common_ucx_atomic_fetch(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode,
|
||||
return opal_common_ucx_wait_request(request, worker, "ucp_atomic_fetch_nb");
|
||||
}
|
||||
|
||||
static inline
|
||||
ucs_status_ptr_t opal_common_ucx_atomic_fetch_nb(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode,
|
||||
uint64_t value, void *result, size_t op_size,
|
||||
uint64_t remote_addr, ucp_rkey_h rkey,
|
||||
ucp_send_callback_t req_handler,
|
||||
ucp_worker_h worker)
|
||||
{
|
||||
return ucp_atomic_fetch_nb(ep, opcode, value, result, op_size,
|
||||
remote_addr, rkey, req_handler);
|
||||
}
|
||||
|
||||
static inline
|
||||
int opal_common_ucx_atomic_cswap(ucp_ep_h ep, uint64_t compare,
|
||||
uint64_t value, void *result, size_t op_size,
|
||||
|
1241
opal/mca/common/ucx/common_ucx_wpool.c
Обычный файл
1241
opal/mca/common/ucx/common_ucx_wpool.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
545
opal/mca/common/ucx/common_ucx_wpool.h
Обычный файл
545
opal/mca/common/ucx/common_ucx_wpool.h
Обычный файл
@ -0,0 +1,545 @@
|
||||
#ifndef COMMON_UCX_WPOOL_H
|
||||
#define COMMON_UCX_WPOOL_H
|
||||
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "common_ucx.h"
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <ucp/api/ucp.h>
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/include/opal/constants.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/threads/tsd.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Worker pool is a global object that is allocated per component or can be
 * shared between multiple compatible components.
 * The lifetime of this object is normally equal to the lifetime of a component[s].
 * It is expected to be initialized in MPI_Init and finalized in MPI_Finalize.
 */
|
||||
typedef struct {
|
||||
/* Ref counting & locking*/
|
||||
int refcnt;
|
||||
opal_recursive_mutex_t mutex;
|
||||
|
||||
/* UCX data */
|
||||
ucp_context_h ucp_ctx;
|
||||
ucp_worker_h dflt_worker;
|
||||
ucp_address_t *recv_waddr;
|
||||
size_t recv_waddr_len;
|
||||
|
||||
/* Thread-local key to allow each thread to have
 * local information associated with this wpool */
|
||||
opal_tsd_key_t tls_key;
|
||||
|
||||
/* Bookkeeping information */
|
||||
opal_list_t idle_workers;
|
||||
opal_list_t active_workers;
|
||||
|
||||
opal_list_t tls_list;
|
||||
} opal_common_ucx_wpool_t;
|
||||
|
||||
/* Worker Pool Context (wpctx) is an object that is comprised of a set of UCP
|
||||
* workers that are considered as one logical communication entity.
|
||||
* One UCP worker per "active" thread is used.
|
||||
* Thread is considered "active" if it performs communication operations on this
|
||||
* Wpool context.
|
||||
* A lifetime of this object is dynamic and determined by the application
|
||||
* (the object is created and destroyed with corresponding functions).
|
||||
* Context is bound to a particular Worker Pool object.
|
||||
*/
|
||||
typedef struct {
|
||||
opal_recursive_mutex_t mutex;
|
||||
opal_atomic_int32_t refcntr;
|
||||
|
||||
/* the reference to a Worker pool this context belongs to*/
|
||||
opal_common_ucx_wpool_t *wpool;
|
||||
/* A list of references to TLS context records
|
||||
* we need to keep track of them to have an ability to
|
||||
* let thread know that this context is no longer valid */
|
||||
opal_list_t tls_workers;
|
||||
volatile int released;
|
||||
|
||||
/* UCX addressing information */
|
||||
char *recv_worker_addrs;
|
||||
int *recv_worker_displs;
|
||||
size_t comm_size;
|
||||
} opal_common_ucx_ctx_t;
|
||||
|
||||
/* Worker Pool memory (wpmem) is an object that represents a remotely accessible
|
||||
* distributed memory.
|
||||
* It has dynamic lifetime (created and destroyed by corresponding functions).
|
||||
* It depends on particular Wpool context.
|
||||
* Currently OSC is using one context per MPI Window, though in future it will
|
||||
* be possible to have one context for multiple windows.
|
||||
*/
|
||||
typedef struct {
|
||||
/* reference context to which memory region belongs */
|
||||
opal_common_ucx_ctx_t *ctx;
|
||||
|
||||
/* object lifetime control */
|
||||
volatile int released;
|
||||
opal_atomic_int32_t refcntr;
|
||||
|
||||
/* UCX memory handler */
|
||||
ucp_mem_h memh;
|
||||
char *mem_addrs;
|
||||
int *mem_displs;
|
||||
|
||||
/* TLS item that allows each thread to
|
||||
* store endpoints and rkey arrays
|
||||
* for faster access */
|
||||
opal_tsd_key_t mem_tls_key;
|
||||
} opal_common_ucx_wpmem_t;
|
||||
|
||||
/* The structure that wraps UCP worker and holds the state that is required
|
||||
* for its use.
|
||||
* The structure is allocated along with UCP worker on demand and is being held
|
||||
* in the Worker Pool lists (either active or idle).
|
||||
* One wpmem is intended per shared memory segment (i.e. MPI Window).
|
||||
*/
|
||||
typedef struct opal_common_ucx_winfo {
|
||||
opal_recursive_mutex_t mutex;
|
||||
volatile int released;
|
||||
ucp_worker_h worker;
|
||||
ucp_ep_h *endpoints;
|
||||
size_t comm_size;
|
||||
short *inflight_ops;
|
||||
short global_inflight_ops;
|
||||
ucs_status_ptr_t inflight_req;
|
||||
} opal_common_ucx_winfo_t;
|
||||
|
||||
typedef struct {
|
||||
opal_common_ucx_winfo_t *winfo;
|
||||
ucp_rkey_h *rkeys;
|
||||
} opal_common_ucx_tlocal_fast_ptrs_t;
|
||||
|
||||
typedef void (*opal_common_ucx_user_req_handler_t)(void *request);
|
||||
|
||||
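/* Per-request state stored in a UCX request.
 * `winfo` identifies the worker record whose worker must be progressed to
 * complete the request (see opal_common_ucx_wait_request_mt); `ext_req` and
 * `ext_cb` carry an optional user request and its completion handler.
 * NOTE: the fast-path structure that gathers all pointers required to
 * perform an RMA operation, and whose pointer is held under wpmem's
 * mem_tls_key, is opal_common_ucx_tlocal_fast_ptrs_t, not this structure.
 */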
|
||||
typedef struct {
|
||||
void *ext_req;
|
||||
opal_common_ucx_user_req_handler_t ext_cb;
|
||||
opal_common_ucx_winfo_t *winfo;
|
||||
} opal_common_ucx_request_t;
|
||||
|
||||
typedef enum {
|
||||
OPAL_COMMON_UCX_PUT,
|
||||
OPAL_COMMON_UCX_GET
|
||||
} opal_common_ucx_op_t;
|
||||
|
||||
typedef enum {
|
||||
OPAL_COMMON_UCX_SCOPE_EP,
|
||||
OPAL_COMMON_UCX_SCOPE_WORKER
|
||||
} opal_common_ucx_flush_scope_t;
|
||||
|
||||
typedef enum {
|
||||
OPAL_COMMON_UCX_FLUSH_NB,
|
||||
OPAL_COMMON_UCX_FLUSH_B,
|
||||
OPAL_COMMON_UCX_FLUSH_NB_PREFERRED
|
||||
} opal_common_ucx_flush_type_t;
|
||||
|
||||
typedef enum {
|
||||
OPAL_COMMON_UCX_MEM_ALLOCATE_MAP,
|
||||
OPAL_COMMON_UCX_MEM_MAP
|
||||
} opal_common_ucx_mem_type_t;
|
||||
|
||||
typedef int (*opal_common_ucx_exchange_func_t)(void *my_info, size_t my_info_len,
|
||||
char **recv_info, int **disps,
|
||||
void *metadata);
|
||||
|
||||
|
||||
/* Manage Worker Pool (wpool) */
|
||||
OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void);
|
||||
OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool);
|
||||
OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool,
|
||||
int proc_world_size, bool enable_mt);
|
||||
OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool);
|
||||
OPAL_DECLSPEC void opal_common_ucx_wpool_progress(opal_common_ucx_wpool_t *wpool);
|
||||
|
||||
/* Manage Communication context */
|
||||
OPAL_DECLSPEC int opal_common_ucx_wpctx_create(opal_common_ucx_wpool_t *wpool, int comm_size,
|
||||
opal_common_ucx_exchange_func_t exchange_func,
|
||||
void *exchange_metadata,
|
||||
opal_common_ucx_ctx_t **ctx_ptr);
|
||||
OPAL_DECLSPEC void opal_common_ucx_wpctx_release(opal_common_ucx_ctx_t *ctx);
|
||||
|
||||
/* request init / completion */
|
||||
OPAL_DECLSPEC void opal_common_ucx_req_init(void *request);
|
||||
OPAL_DECLSPEC void opal_common_ucx_req_completion(void *request, ucs_status_t status);
|
||||
|
||||
/* Managing thread local storage */
|
||||
OPAL_DECLSPEC int opal_common_ucx_tlocal_fetch_spath(opal_common_ucx_wpmem_t *mem, int target);
|
||||
static inline int
|
||||
opal_common_ucx_tlocal_fetch(opal_common_ucx_wpmem_t *mem, int target,
|
||||
ucp_ep_h *_ep, ucp_rkey_h *_rkey,
|
||||
opal_common_ucx_winfo_t **_winfo)
|
||||
{
|
||||
opal_common_ucx_tlocal_fast_ptrs_t *fp = NULL;
|
||||
int expr;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
/* First check the fast-path */
|
||||
rc = opal_tsd_getspecific(mem->mem_tls_key, (void**)&fp);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
expr = fp && (NULL != fp->winfo) && (fp->winfo->endpoints[target]) &&
|
||||
(NULL != fp->rkeys[target]);
|
||||
if (OPAL_UNLIKELY(!expr)) {
|
||||
rc = opal_common_ucx_tlocal_fetch_spath(mem, target);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
rc = opal_tsd_getspecific(mem->mem_tls_key, (void**)&fp);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
MCA_COMMON_UCX_ASSERT(fp && (NULL != fp->winfo) &&
|
||||
(fp->winfo->endpoints[target])
|
||||
&& (NULL != fp->rkeys[target]));
|
||||
|
||||
*_rkey = fp->rkeys[target];
|
||||
*_winfo = fp->winfo;
|
||||
*_ep = fp->winfo->endpoints[target];
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Manage & operations on the Memory registrations */
|
||||
OPAL_DECLSPEC int opal_common_ucx_wpmem_create(opal_common_ucx_ctx_t *ctx,
|
||||
void **mem_base, size_t mem_size,
|
||||
opal_common_ucx_mem_type_t mem_type,
|
||||
opal_common_ucx_exchange_func_t exchange_func,
|
||||
void *exchange_metadata,
|
||||
char **my_mem_addr,
|
||||
int *my_mem_addr_size,
|
||||
opal_common_ucx_wpmem_t **mem_ptr);
|
||||
OPAL_DECLSPEC int opal_common_ucx_wpmem_free(opal_common_ucx_wpmem_t *mem);
|
||||
|
||||
OPAL_DECLSPEC int opal_common_ucx_wpmem_flush(opal_common_ucx_wpmem_t *mem,
|
||||
opal_common_ucx_flush_scope_t scope,
|
||||
int target);
|
||||
OPAL_DECLSPEC int opal_common_ucx_wpmem_fence(opal_common_ucx_wpmem_t *mem);
|
||||
|
||||
OPAL_DECLSPEC int opal_common_ucx_winfo_flush(opal_common_ucx_winfo_t *winfo, int target,
|
||||
opal_common_ucx_flush_type_t type,
|
||||
opal_common_ucx_flush_scope_t scope,
|
||||
ucs_status_ptr_t *req_ptr);
|
||||
|
||||
static inline
|
||||
int opal_common_ucx_wait_request_mt(ucs_status_ptr_t request, const char *msg)
|
||||
{
|
||||
ucs_status_t status;
|
||||
int ctr = 0, ret = 0;
|
||||
opal_common_ucx_winfo_t *winfo;
|
||||
|
||||
/* check for request completed or failed */
|
||||
if (OPAL_LIKELY(UCS_OK == request)) {
|
||||
return OPAL_SUCCESS;
|
||||
} else if (OPAL_UNLIKELY(UCS_PTR_IS_ERR(request))) {
|
||||
MCA_COMMON_UCX_VERBOSE(1, "%s failed: %d, %s", msg ? msg : __func__,
|
||||
UCS_PTR_STATUS(request),
|
||||
ucs_status_string(UCS_PTR_STATUS(request)));
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
winfo = ((opal_common_ucx_request_t *)request)->winfo;
|
||||
assert(winfo != NULL);
|
||||
|
||||
do {
|
||||
ctr = opal_common_ucx.progress_iterations;
|
||||
opal_mutex_lock(&winfo->mutex);
|
||||
do {
|
||||
ret = ucp_worker_progress(winfo->worker);
|
||||
status = opal_common_ucx_request_status(request);
|
||||
if (status != UCS_INPROGRESS) {
|
||||
ucp_request_free(request);
|
||||
if (OPAL_UNLIKELY(UCS_OK != status)) {
|
||||
MCA_COMMON_UCX_VERBOSE(1, "%s failed: %d, %s",
|
||||
msg ? msg : __func__,
|
||||
UCS_PTR_STATUS(request),
|
||||
ucs_status_string(UCS_PTR_STATUS(request)));
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
ctr--;
|
||||
} while (ctr > 0 && ret > 0 && status == UCS_INPROGRESS);
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
opal_progress();
|
||||
} while (status == UCS_INPROGRESS);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int _periodical_flush_nb(opal_common_ucx_wpmem_t *mem,
|
||||
opal_common_ucx_winfo_t *winfo,
|
||||
int target) {
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
winfo->inflight_ops[target]++;
|
||||
winfo->global_inflight_ops++;
|
||||
|
||||
if (OPAL_UNLIKELY(winfo->inflight_ops[target] >= MCA_COMMON_UCX_PER_TARGET_OPS_THRESHOLD) ||
|
||||
OPAL_UNLIKELY(winfo->global_inflight_ops >= MCA_COMMON_UCX_GLOBAL_OPS_THRESHOLD)) {
|
||||
opal_common_ucx_flush_scope_t scope;
|
||||
|
||||
if (winfo->inflight_req != UCS_OK) {
|
||||
rc = opal_common_ucx_wait_request_mt(winfo->inflight_req,
|
||||
"opal_common_ucx_flush_nb");
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_wait_request failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
winfo->inflight_req = UCS_OK;
|
||||
}
|
||||
|
||||
if (winfo->global_inflight_ops >= MCA_COMMON_UCX_GLOBAL_OPS_THRESHOLD) {
|
||||
scope = OPAL_COMMON_UCX_SCOPE_WORKER;
|
||||
winfo->global_inflight_ops = 0;
|
||||
memset(winfo->inflight_ops, 0, winfo->comm_size * sizeof(short));
|
||||
} else {
|
||||
scope = OPAL_COMMON_UCX_SCOPE_EP;
|
||||
winfo->global_inflight_ops -= winfo->inflight_ops[target];
|
||||
winfo->inflight_ops[target] = 0;
|
||||
}
|
||||
|
||||
rc = opal_common_ucx_winfo_flush(winfo, target, OPAL_COMMON_UCX_FLUSH_NB_PREFERRED,
|
||||
scope, &winfo->inflight_req);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_flush failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
} else if (OPAL_UNLIKELY(winfo->inflight_req != UCS_OK)) {
|
||||
int ret;
|
||||
do {
|
||||
ret = ucp_worker_progress(winfo->worker);
|
||||
} while (ret);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline int
|
||||
opal_common_ucx_wpmem_putget(opal_common_ucx_wpmem_t *mem, opal_common_ucx_op_t op,
|
||||
int target, void *buffer, size_t len,
|
||||
uint64_t rem_addr)
|
||||
{
|
||||
ucp_ep_h ep;
|
||||
ucp_rkey_h rkey;
|
||||
ucs_status_t status;
|
||||
opal_common_ucx_winfo_t *winfo;
|
||||
int rc = OPAL_SUCCESS;
|
||||
char *called_func = "";
|
||||
|
||||
rc = opal_common_ucx_tlocal_fetch(mem, target, &ep, &rkey, &winfo);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Perform the operation */
|
||||
opal_mutex_lock(&winfo->mutex);
|
||||
switch(op){
|
||||
case OPAL_COMMON_UCX_PUT:
|
||||
status = ucp_put_nbi(ep, buffer,len, rem_addr, rkey);
|
||||
called_func = "ucp_put_nbi";
|
||||
break;
|
||||
case OPAL_COMMON_UCX_GET:
|
||||
status = ucp_get_nbi(ep, buffer,len, rem_addr, rkey);
|
||||
called_func = "ucp_get_nbi";
|
||||
break;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(status != UCS_OK && status != UCS_INPROGRESS)) {
|
||||
MCA_COMMON_UCX_ERROR("%s failed: %d", called_func, status);
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
|
||||
rc = _periodical_flush_nb(mem, winfo, target);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "_incr_and_check_inflight_ops failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static inline int
|
||||
opal_common_ucx_wpmem_cmpswp(opal_common_ucx_wpmem_t *mem, uint64_t compare,
|
||||
uint64_t value, int target, void *buffer, size_t len,
|
||||
uint64_t rem_addr)
|
||||
{
|
||||
ucp_ep_h ep;
|
||||
ucp_rkey_h rkey;
|
||||
opal_common_ucx_winfo_t *winfo = NULL;
|
||||
ucs_status_t status;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
rc = opal_common_ucx_tlocal_fetch(mem, target, &ep, &rkey, &winfo);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
MCA_COMMON_UCX_ERROR("opal_common_ucx_tlocal_fetch failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Perform the operation */
|
||||
opal_mutex_lock(&winfo->mutex);
|
||||
status = opal_common_ucx_atomic_cswap(ep, compare, value,
|
||||
buffer, len,
|
||||
rem_addr, rkey,
|
||||
winfo->worker);
|
||||
if (OPAL_UNLIKELY(status != UCS_OK)) {
|
||||
MCA_COMMON_UCX_ERROR("opal_common_ucx_atomic_cswap failed: %d", status);
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
|
||||
rc = _periodical_flush_nb(mem, winfo, target);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "_incr_and_check_inflight_ops failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline int
|
||||
opal_common_ucx_wpmem_post(opal_common_ucx_wpmem_t *mem, ucp_atomic_post_op_t opcode,
|
||||
uint64_t value, int target, size_t len, uint64_t rem_addr)
|
||||
{
|
||||
ucp_ep_h ep;
|
||||
ucp_rkey_h rkey;
|
||||
opal_common_ucx_winfo_t *winfo = NULL;
|
||||
ucs_status_t status;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
|
||||
rc =opal_common_ucx_tlocal_fetch(mem, target, &ep, &rkey, &winfo);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_ERROR("tlocal_fetch failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Perform the operation */
|
||||
opal_mutex_lock(&winfo->mutex);
|
||||
status = ucp_atomic_post(ep, opcode, value,
|
||||
len, rem_addr, rkey);
|
||||
if (OPAL_UNLIKELY(status != UCS_OK)) {
|
||||
MCA_COMMON_UCX_ERROR("ucp_atomic_post failed: %d", status);
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
|
||||
rc = _periodical_flush_nb(mem, winfo, target);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "_incr_and_check_inflight_ops failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline int
|
||||
opal_common_ucx_wpmem_fetch(opal_common_ucx_wpmem_t *mem,
|
||||
ucp_atomic_fetch_op_t opcode, uint64_t value,
|
||||
int target, void *buffer, size_t len,
|
||||
uint64_t rem_addr)
|
||||
{
|
||||
ucp_ep_h ep = NULL;
|
||||
ucp_rkey_h rkey = NULL;
|
||||
opal_common_ucx_winfo_t *winfo = NULL;
|
||||
ucs_status_t status;
|
||||
int rc = OPAL_SUCCESS;
|
||||
|
||||
rc = opal_common_ucx_tlocal_fetch(mem, target, &ep, &rkey, &winfo);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_ERROR("tlocal_fetch failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Perform the operation */
|
||||
opal_mutex_lock(&winfo->mutex);
|
||||
status = opal_common_ucx_atomic_fetch(ep, opcode, value,
|
||||
buffer, len,
|
||||
rem_addr, rkey,
|
||||
winfo->worker);
|
||||
if (OPAL_UNLIKELY(status != UCS_OK)) {
|
||||
MCA_COMMON_UCX_ERROR("ucp_atomic_cswap64 failed: %d", status);
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
|
||||
rc = _periodical_flush_nb(mem, winfo, target);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "_incr_and_check_inflight_ops failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static inline int
|
||||
opal_common_ucx_wpmem_fetch_nb(opal_common_ucx_wpmem_t *mem,
|
||||
ucp_atomic_fetch_op_t opcode,
|
||||
uint64_t value,
|
||||
int target, void *buffer, size_t len,
|
||||
uint64_t rem_addr,
|
||||
opal_common_ucx_user_req_handler_t user_req_cb,
|
||||
void *user_req_ptr)
|
||||
{
|
||||
ucp_ep_h ep = NULL;
|
||||
ucp_rkey_h rkey = NULL;
|
||||
opal_common_ucx_winfo_t *winfo = NULL;
|
||||
int rc = OPAL_SUCCESS;
|
||||
opal_common_ucx_request_t *req;
|
||||
|
||||
rc = opal_common_ucx_tlocal_fetch(mem, target, &ep, &rkey, &winfo);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_ERROR("tlocal_fetch failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
/* Perform the operation */
|
||||
opal_mutex_lock(&winfo->mutex);
|
||||
req = opal_common_ucx_atomic_fetch_nb(ep, opcode, value, buffer, len,
|
||||
rem_addr, rkey, opal_common_ucx_req_completion,
|
||||
winfo->worker);
|
||||
if (UCS_PTR_IS_PTR(req)) {
|
||||
req->ext_req = user_req_ptr;
|
||||
req->ext_cb = user_req_cb;
|
||||
req->winfo = winfo;
|
||||
} else {
|
||||
if (user_req_cb != NULL) {
|
||||
(*user_req_cb)(user_req_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
rc = _periodical_flush_nb(mem, winfo, target);
|
||||
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)){
|
||||
MCA_COMMON_UCX_VERBOSE(1, "_incr_and_check_inflight_ops failed: %d", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
opal_mutex_unlock(&winfo->mutex);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif // COMMON_UCX_WPOOL_H
|
112
opal/mca/common/ucx/common_ucx_wpool_int.h
Обычный файл
112
opal/mca/common/ucx/common_ucx_wpool_int.h
Обычный файл
@ -0,0 +1,112 @@
|
||||
#ifndef COMMON_UCX_WPOOL_INT_H
|
||||
#define COMMON_UCX_WPOOL_INT_H
|
||||
|
||||
#include "opal_config.h"
|
||||
#include "common_ucx.h"
|
||||
#include "common_ucx_wpool.h"
|
||||
|
||||
/* Context record: groups a pointer to a worker-pool context, a pointer to a
 * worker-info object and an atomic counter.
 * NOTE(review): the _tlocal prefix suggests one such record exists per
 * thread and per context - confirm against the implementation file. */
typedef struct {
|
||||
/* context this record refers to */
opal_common_ucx_ctx_t *gctx;
|
||||
/* worker-info object stored alongside the context */
opal_common_ucx_winfo_t *winfo;
|
||||
/* atomic 32-bit counter; presumably a reference count - confirm what takes/drops references */
opal_atomic_int32_t refcnt;
|
||||
} _tlocal_ctx_t;
|
||||
|
||||
/* Memory info: a worker-info pointer together with a pointer to remote key
 * handles. */
typedef struct {
|
||||
/* worker-info object (note: the field is named `worker` but has type opal_common_ucx_winfo_t *) */
opal_common_ucx_winfo_t *worker;
|
||||
/* pointer to ucp_rkey_h; presumably an array with one entry per target - TODO confirm */
ucp_rkey_h *rkeys;
|
||||
} _mem_info_t;
|
||||
|
||||
/* Memory record: links a worker-pool memory object to its _mem_info_t, to a
 * fast-pointers structure and to a context record.
 * NOTE(review): the _tlocal prefix suggests this is kept per thread -
 * confirm against the implementation file. */
typedef struct {
|
||||
/* worker-pool memory object this record refers to */
opal_common_ucx_wpmem_t *gmem;
|
||||
/* associated memory info (worker-info pointer and rkeys) */
_mem_info_t *mem;
|
||||
/* pointer to a opal_common_ucx_tlocal_fast_ptrs_t; presumably a lookup shortcut - confirm */
opal_common_ucx_tlocal_fast_ptrs_t *mem_tls_ptr;
|
||||
/* context record associated with this memory record */
_tlocal_ctx_t *ctx_rec;
|
||||
} _tlocal_mem_t;
|
||||
|
||||
/* List item carrying a pointer to a worker-info object. */
typedef struct {
|
||||
/* embedded list-item base, placed first in the struct */
opal_list_item_t super;
|
||||
/* the worker-info object carried by this item */
opal_common_ucx_winfo_t *ptr;
|
||||
} _winfo_list_item_t;
|
||||
OBJ_CLASS_DECLARATION(_winfo_list_item_t);
|
||||
|
||||
|
||||
/* List item for context records.
 * NOTE(review): despite the name, `ptr` is typed opal_common_ucx_winfo_t *,
 * which gives this struct the same members as _winfo_list_item_t - confirm
 * that this is intended. */
typedef struct {
|
||||
/* embedded list-item base, placed first in the struct */
opal_list_item_t super;
|
||||
/* payload pointer carried by this item */
opal_common_ucx_winfo_t *ptr;
|
||||
} _ctx_record_list_item_t;
|
||||
OBJ_CLASS_DECLARATION(_ctx_record_list_item_t);
|
||||
|
||||
/* List item carrying a pointer to a _tlocal_mem_t memory record. */
typedef struct {
|
||||
/* embedded list-item base, placed first in the struct */
opal_list_item_t super;
|
||||
/* the memory record carried by this item */
_tlocal_mem_t *ptr;
|
||||
} _mem_record_list_item_t;
|
||||
OBJ_CLASS_DECLARATION(_mem_record_list_item_t);
|
||||
|
||||
/* thread-local table */
|
||||
/* Table object holding two arrays of record pointers (context records and
 * memory records) together with their sizes and the owning worker pool. */
typedef struct {
|
||||
/* embedded list-item base, placed first in the struct */
opal_list_item_t super;
|
||||
/* worker pool this table is associated with */
opal_common_ucx_wpool_t *wpool;
|
||||
/* table of pointers to context records */
_tlocal_ctx_t **ctx_tbl;
|
||||
/* size of ctx_tbl; presumably the number of slots rather than bytes - confirm */
size_t ctx_tbl_size;
|
||||
/* table of pointers to memory records */
_tlocal_mem_t **mem_tbl;
|
||||
/* size of mem_tbl; presumably the number of slots rather than bytes - confirm */
size_t mem_tbl_size;
|
||||
} _tlocal_table_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(_tlocal_table_t);
|
||||
|
||||
static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append);
|
||||
static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append);
|
||||
static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool);
|
||||
static void _common_ucx_tls_cleanup(_tlocal_table_t *tls);
|
||||
static inline _tlocal_ctx_t *_tlocal_ctx_search(_tlocal_table_t *tls,
|
||||
opal_common_ucx_ctx_t *ctx);
|
||||
static int _tlocal_ctx_record_cleanup(_tlocal_ctx_t *ctx_rec);
|
||||
static _tlocal_ctx_t *_tlocal_add_ctx(_tlocal_table_t *tls,
|
||||
opal_common_ucx_ctx_t *ctx);
|
||||
static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target);
|
||||
static inline _tlocal_mem_t *_tlocal_search_mem(_tlocal_table_t *tls,
|
||||
opal_common_ucx_wpmem_t *gmem);
|
||||
static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls,
|
||||
opal_common_ucx_wpmem_t *mem);
|
||||
static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int target);
|
||||
// TODO: Return the error from it
|
||||
static void _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec);
|
||||
static void _tlocal_cleanup(void *arg);
|
||||
|
||||
/* Sorted declarations */
|
||||
|
||||
|
||||
/* Internal Worker Information (winfo) management */
|
||||
static opal_common_ucx_winfo_t *_winfo_create(opal_common_ucx_wpool_t *wpool);
|
||||
static void _winfo_release(opal_common_ucx_winfo_t *winfo);
|
||||
static void _winfo_reset(opal_common_ucx_winfo_t *winfo);
|
||||
|
||||
/* Internal Worker Pool (wpool) management */
|
||||
static int _wpool_list_put(opal_common_ucx_wpool_t *wpool, opal_list_t *list,
|
||||
opal_common_ucx_winfo_t *winfo);
|
||||
static int _wpool_list_put(opal_common_ucx_wpool_t *wpool, opal_list_t *list,
|
||||
opal_common_ucx_winfo_t *winfo);
|
||||
static opal_common_ucx_winfo_t *_wpool_list_get(opal_common_ucx_wpool_t *wpool,
|
||||
opal_list_t *list);
|
||||
static opal_common_ucx_winfo_t *_wpool_get_idle(opal_common_ucx_wpool_t *wpool,
|
||||
size_t comm_size);
|
||||
static int _wpool_add_active(opal_common_ucx_wpool_t *wpool,
|
||||
opal_common_ucx_winfo_t *winfo);
|
||||
|
||||
/* Internal Worker Pool Context management */
|
||||
static void _common_ucx_wpctx_free(opal_common_ucx_ctx_t *ctx);
|
||||
static int _common_ucx_wpctx_append(opal_common_ucx_ctx_t *ctx,
|
||||
opal_common_ucx_winfo_t *winfo);
|
||||
static void _common_ucx_wpctx_remove(opal_common_ucx_ctx_t *ctx,
|
||||
opal_common_ucx_winfo_t *winfo);
|
||||
|
||||
/* Internal Worker Pool Memory management */
|
||||
static int _comm_ucx_wpmem_map(opal_common_ucx_wpool_t *wpool,
|
||||
void **base, size_t size, ucp_mem_h *memh_ptr,
|
||||
opal_common_ucx_mem_type_t mem_type);
|
||||
static void _common_ucx_wpmem_free(opal_common_ucx_wpmem_t *mem);
|
||||
static int _common_ucx_wpmem_signup(opal_common_ucx_wpmem_t *mem);
|
||||
static void _common_ucx_mem_signout(opal_common_ucx_wpmem_t *mem);
|
||||
|
||||
|
||||
#endif // COMMON_UCX_WPOOL_INT_H
|
Загрузка…
x
Ссылка в новой задаче
Block a user