diff --git a/RMA_TODO.txt b/RMA_TODO.txt new file mode 100644 index 0000000000..9f1b28e6a9 --- /dev/null +++ b/RMA_TODO.txt @@ -0,0 +1,5 @@ +- Support non-contiguous operations in Portals4 implementation +- Add memory barriers where needed in synchronization primitives of + Portals4 implementation + +- Re-implement rdma component diff --git a/ompi/attribute/attribute_predefined.c b/ompi/attribute/attribute_predefined.c index 0c9acae236..c2d3d631f1 100644 --- a/ompi/attribute/attribute_predefined.c +++ b/ompi/attribute/attribute_predefined.c @@ -127,6 +127,8 @@ int ompi_attr_create_predefined(void) OMPI_SUCCESS != (ret = create_win(MPI_WIN_BASE)) || OMPI_SUCCESS != (ret = create_win(MPI_WIN_SIZE)) || OMPI_SUCCESS != (ret = create_win(MPI_WIN_DISP_UNIT)) || + OMPI_SUCCESS != (ret = create_win(MPI_WIN_CREATE_FLAVOR)) || + OMPI_SUCCESS != (ret = create_win(MPI_WIN_MODEL)) || #if 0 /* JMS For when we implement IMPI */ OMPI_SUCCESS != (ret = create_comm(IMPI_CLIENT_SIZE, true)) || @@ -193,6 +195,8 @@ int ompi_attr_free_predefined(void) OMPI_SUCCESS != (ret = free_win(MPI_WIN_BASE)) || OMPI_SUCCESS != (ret = free_win(MPI_WIN_SIZE)) || OMPI_SUCCESS != (ret = free_win(MPI_WIN_DISP_UNIT)) || + OMPI_SUCCESS != (ret = free_win(MPI_WIN_CREATE_FLAVOR)) || + OMPI_SUCCESS != (ret = free_win(MPI_WIN_MODEL)) || #if 0 /* JMS For when we implement IMPI */ OMPI_SUCCESS != (ret = free_comm(IMPI_CLIENT_SIZE)) || diff --git a/ompi/debuggers/predefined_gap_test.c b/ompi/debuggers/predefined_gap_test.c index 727f2c2236..5ddf176ff2 100644 --- a/ompi/debuggers/predefined_gap_test.c +++ b/ompi/debuggers/predefined_gap_test.c @@ -128,11 +128,7 @@ int main(int argc, char **argv) { GAP_CHECK("w_f_to_c_index", test_win, w_f_to_c_index, w_keyhash, 1); GAP_CHECK("error_handler", test_win, error_handler, w_f_to_c_index, 1); GAP_CHECK("errhandler_type", test_win, errhandler_type, error_handler, 1); - GAP_CHECK("w_disp_unit", test_win, w_disp_unit, errhandler_type, 1); - GAP_CHECK("w_baseptr", test_win, w_baseptr, w_disp_unit, 1); - GAP_CHECK("w_size", test_win, w_size, w_baseptr, 1); - GAP_CHECK("w_mode", test_win, w_mode, w_size, 1); - GAP_CHECK("w_osc_module", test_win, w_osc_module, w_size, 1); + GAP_CHECK("w_osc_module", test_win, w_osc_module, errhandler_type, 1); /* Test Predefined info sizes */ printf("=============================================\n"); diff --git a/ompi/errhandler/errcode.c b/ompi/errhandler/errcode.c index 477bb32f88..a02b139377 100644 --- a/ompi/errhandler/errcode.c +++ b/ompi/errhandler/errcode.c @@ -107,6 +107,9 @@ ompi_mpi_errcode_t ompi_t_err_cvar_set_never; ompi_mpi_errcode_t ompi_t_err_pvar_no_startstop; ompi_mpi_errcode_t ompi_t_err_pvar_no_write; ompi_mpi_errcode_t ompi_t_err_pvar_no_atomic; +ompi_mpi_errcode_t ompi_err_rma_range; +ompi_mpi_errcode_t ompi_err_rma_attach; +ompi_mpi_errcode_t ompi_err_rma_flavor; static void ompi_mpi_errcode_construct(ompi_mpi_errcode_t* errcode); static void ompi_mpi_errcode_destruct(ompi_mpi_errcode_t* errcode); @@ -202,6 +205,9 @@ int ompi_mpi_errcode_init (void) CONSTRUCT_ERRCODE( ompi_t_err_pvar_no_startstop, MPI_T_ERR_PVAR_NO_STARTSTOP, "MPI_T_ERR_PVAR_NO_STARTSTOP: variable cannot be started or stopped" ); CONSTRUCT_ERRCODE( ompi_t_err_pvar_no_write, MPI_T_ERR_PVAR_NO_WRITE, "MPI_T_ERR_PVAR_NO_WRITE: variable cannot be written or reset" ); CONSTRUCT_ERRCODE( ompi_t_err_pvar_no_atomic, MPI_T_ERR_PVAR_NO_ATOMIC, "MPI_T_ERR_PVAR_NO_ATOMIC: variable cannot be read and written atomically" ); + CONSTRUCT_ERRCODE( ompi_err_rma_range, MPI_ERR_RMA_RANGE, 
"MPI_ERR_RMA_RANGE: invalid RMA address range" ); + CONSTRUCT_ERRCODE( ompi_err_rma_attach, MPI_ERR_RMA_ATTACH, "MPI_ERR_RMA_ATTACH: Could not attach RMA segment" ); + CONSTRUCT_ERRCODE( ompi_err_rma_flavor, MPI_ERR_RMA_FLAVOR, "MPI_ERR_RMA_FLAVOR: Invalid type of window" ); /* Per MPI-3 p353:27-32, MPI_LASTUSEDCODE must be >= MPI_ERR_LASTCODE. So just start it as == MPI_ERR_LASTCODE. */ @@ -292,6 +298,9 @@ int ompi_mpi_errcode_finalize(void) OBJ_DESTRUCT(&ompi_t_err_pvar_no_startstop); OBJ_DESTRUCT(&ompi_t_err_pvar_no_write); OBJ_DESTRUCT(&ompi_t_err_pvar_no_atomic); + OBJ_DESTRUCT(&ompi_err_rma_range); + OBJ_DESTRUCT(&ompi_err_rma_attach); + OBJ_DESTRUCT(&ompi_err_rma_flavor); OBJ_DESTRUCT(&ompi_mpi_errcodes); return OMPI_SUCCESS; diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index ef663eb34e..1029908459 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -13,7 +13,7 @@ * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2009-2012 Oak Rigde National Laboratory. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2012-2013 Los Alamos Nat Security, LLC. All rights reserved. + * Copyright (c) 2012-2014 Los Alamos Nat Security, LLC. All rights reserved. * Copyright (c) 2011-2013 INRIA. All rights reserved. * $COPYRIGHT$ * @@ -488,6 +488,13 @@ typedef int (MPI_Grequest_cancel_function)(void *, int); #define MPI_LOCK_EXCLUSIVE 1 #define MPI_LOCK_SHARED 2 +#define MPI_WIN_FLAVOR_CREATE 1 +#define MPI_WIN_FLAVOR_ALLOCATE 2 +#define MPI_WIN_FLAVOR_DYNAMIC 3 +#define MPI_WIN_FLAVOR_SHARED 4 + +#define MPI_WIN_UNIFIED 0 +#define MPI_WIN_SEPARATE 1 /* * Predefined attribute keyvals @@ -509,6 +516,8 @@ enum { MPI_WIN_BASE, MPI_WIN_SIZE, MPI_WIN_DISP_UNIT, + MPI_WIN_CREATE_FLAVOR, + MPI_WIN_MODEL, /* Even though these four are IMPI attributes, they need to be there for all MPI jobs */ @@ -590,10 +599,14 @@ enum { #define MPI_T_ERR_PVAR_NO_STARTSTOP 65 #define MPI_T_ERR_PVAR_NO_WRITE 66 #define MPI_T_ERR_PVAR_NO_ATOMIC 67 +#define MPI_ERR_RMA_RANGE 68 +#define MPI_ERR_RMA_ATTACH 69 +#define MPI_ERR_RMA_FLAVOR 70 + /* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined MPI_ERR_ code. So just set it equal to the last code -- - MPI_T_ERR_PVAR_NO_ATOMIC, in this case. */ -#define MPI_ERR_LASTCODE MPI_T_ERR_PVAR_NO_ATOMIC + MPI_ERR_RMA_FLAVOR, in this case. 
*/ +#define MPI_ERR_LASTCODE MPI_ERR_RMA_FLAVOR #define MPI_ERR_SYSRESOURCE -2 @@ -888,6 +901,7 @@ OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_bxor; OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_maxloc; OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_minloc; OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_replace; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_no_op; OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_datatype_null; @@ -1019,6 +1033,7 @@ OMPI_DECLSPEC extern MPI_Fint *MPI_F_STATUSES_IGNORE; #define MPI_MAXLOC OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_maxloc) #define MPI_MINLOC OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_minloc) #define MPI_REPLACE OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_replace) +#define MPI_NO_OP OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_no_op) /* C datatypes */ #define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_datatype_null) @@ -1298,6 +1313,9 @@ OMPI_DECLSPEC int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], OMPI_DECLSPEC int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm); OMPI_DECLSPEC int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm); OMPI_DECLSPEC int MPI_Comm_test_inter(MPI_Comm comm, int *flag); +OMPI_DECLSPEC int MPI_Compare_and_swap(void *origin_addr, void *compare_addr, + void *result_addr, MPI_Datatype datatype, int target_rank, + MPI_Aint target_disp, MPI_Win win); OMPI_DECLSPEC int MPI_Dims_create(int nnodes, int ndims, int dims[]); OMPI_DECLSPEC MPI_Fint MPI_Errhandler_c2f(MPI_Errhandler errhandler); OMPI_DECLSPEC int MPI_Errhandler_create(MPI_Handler_function *function, @@ -1313,6 +1331,8 @@ OMPI_DECLSPEC int MPI_Error_class(int errorcode, int *errorclass); OMPI_DECLSPEC int MPI_Error_string(int errorcode, char *string, int *resultlen); OMPI_DECLSPEC int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +OMPI_DECLSPEC int MPI_Fetch_and_op(void *origin_addr, void *result_addr, MPI_Datatype datatype, + int target_rank, MPI_Aint target_disp, MPI_Op op, MPI_Win win); OMPI_DECLSPEC int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); #if OMPI_PROVIDE_MPI_FILE_INTERFACE @@ -1428,6 +1448,10 @@ OMPI_DECLSPEC int MPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win); OMPI_DECLSPEC int MPI_Get_library_version(char *version, int *resultlen); OMPI_DECLSPEC int MPI_Get_processor_name(char *name, int *resultlen); OMPI_DECLSPEC int MPI_Get_version(int *version, int *subversion); @@ -1575,6 +1599,9 @@ OMPI_DECLSPEC int MPI_Put(const void *origin_addr, int origin_count, MPI_Dataty int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win); OMPI_DECLSPEC int MPI_Query_thread(int *provided); +OMPI_DECLSPEC int MPI_Raccumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype 
target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request); OMPI_DECLSPEC int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); OMPI_DECLSPEC int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, @@ -1603,6 +1630,17 @@ OMPI_DECLSPEC MPI_Request MPI_Request_f2c(MPI_Fint request); OMPI_DECLSPEC int MPI_Request_free(MPI_Request *request); OMPI_DECLSPEC int MPI_Request_get_status(MPI_Request request, int *flag, MPI_Status *status); +OMPI_DECLSPEC int MPI_Rget(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, + MPI_Win win, MPI_Request *request); +OMPI_DECLSPEC int MPI_Rget_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, + MPI_Win win, MPI_Request *request); +OMPI_DECLSPEC int MPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request); OMPI_DECLSPEC int MPI_Rsend(const void *ibuf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); OMPI_DECLSPEC int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, @@ -1768,39 +1806,55 @@ OMPI_DECLSPEC int MPI_Wait(MPI_Request *request, MPI_Status *status); OMPI_DECLSPEC int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]); +OMPI_DECLSPEC int MPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win); +OMPI_DECLSPEC int MPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win); +OMPI_DECLSPEC int MPI_Win_attach(MPI_Win win, void *base, MPI_Aint size); OMPI_DECLSPEC MPI_Fint MPI_Win_c2f(MPI_Win win); OMPI_DECLSPEC int MPI_Win_call_errhandler(MPI_Win win, int errorcode); OMPI_DECLSPEC int MPI_Win_complete(MPI_Win win); OMPI_DECLSPEC int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, MPI_Win *win); +OMPI_DECLSPEC int MPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win); OMPI_DECLSPEC int MPI_Win_create_errhandler(MPI_Win_errhandler_function *function, MPI_Errhandler *errhandler); OMPI_DECLSPEC int MPI_Win_create_keyval(MPI_Win_copy_attr_function *win_copy_attr_fn, MPI_Win_delete_attr_function *win_delete_attr_fn, int *win_keyval, void *extra_state); OMPI_DECLSPEC int MPI_Win_delete_attr(MPI_Win win, int win_keyval); +OMPI_DECLSPEC int MPI_Win_detach(MPI_Win win, void *base); OMPI_DECLSPEC MPI_Win MPI_Win_f2c(MPI_Fint win); OMPI_DECLSPEC int MPI_Win_fence(int assert, MPI_Win win); +OMPI_DECLSPEC int MPI_Win_flush(int rank, MPI_Win win); +OMPI_DECLSPEC int MPI_Win_flush_all(MPI_Win win); +OMPI_DECLSPEC int MPI_Win_flush_local(int rank, MPI_Win win); +OMPI_DECLSPEC int MPI_Win_flush_local_all(MPI_Win win); OMPI_DECLSPEC int MPI_Win_free(MPI_Win *win); OMPI_DECLSPEC int MPI_Win_free_keyval(int *win_keyval); OMPI_DECLSPEC int MPI_Win_get_attr(MPI_Win win, int win_keyval, void *attribute_val, int *flag); OMPI_DECLSPEC int MPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandler); OMPI_DECLSPEC int MPI_Win_get_group(MPI_Win win, MPI_Group *group);
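[Illustrative sketch, not part of the patch: one way the MPI-3 RMA routines declared in this header could be driven from application code. The function name, the communicator argument, and the counter layout are hypothetical; only the MPI calls themselves come from the declarations above.]

/* Hypothetical usage sketch: allocate a window, atomically bump a counter on
 * rank 0 under a passive-target (lock_all) epoch, flush, and tear down. */
#include <mpi.h>
#include <stdio.h>

static void rma_counter_sketch(MPI_Comm comm)
{
    int rank, *base, one = 1, old = 0;
    MPI_Win win;

    MPI_Comm_rank(comm, &rank);

    /* MPI_Win_allocate returns both the window and the local memory it exposes. */
    MPI_Win_allocate((MPI_Aint) sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &base, &win);
    *base = 0;
    MPI_Barrier(comm);                      /* ensure every counter is initialized before access */

    MPI_Win_lock_all(0, win);               /* passive-target access epoch covering all ranks */
    MPI_Fetch_and_op(&one, &old, MPI_INT, 0, 0, MPI_SUM, win); /* atomic add at rank 0 */
    MPI_Win_flush(0, win);                  /* complete the operation at the target */
    printf("rank %d fetched counter value %d\n", rank, old);
    MPI_Win_unlock_all(win);

    MPI_Win_free(&win);
}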
+OMPI_DECLSPEC int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used); OMPI_DECLSPEC int MPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); OMPI_DECLSPEC int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win); +OMPI_DECLSPEC int MPI_Win_lock_all(int assert, MPI_Win win); OMPI_DECLSPEC int MPI_Win_post(MPI_Group group, int assert, MPI_Win win); OMPI_DECLSPEC int MPI_Win_set_attr(MPI_Win win, int win_keyval, void *attribute_val); OMPI_DECLSPEC int MPI_Win_set_errhandler(MPI_Win win, MPI_Errhandler errhandler); +OMPI_DECLSPEC int MPI_Win_set_info(MPI_Win win, MPI_Info info); OMPI_DECLSPEC int MPI_Win_set_name(MPI_Win win, const char *win_name); +OMPI_DECLSPEC int MPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr); OMPI_DECLSPEC int MPI_Win_start(MPI_Group group, int assert, MPI_Win win); +OMPI_DECLSPEC int MPI_Win_sync(MPI_Win win); OMPI_DECLSPEC int MPI_Win_test(MPI_Win win, int *flag); OMPI_DECLSPEC int MPI_Win_unlock(int rank, MPI_Win win); +OMPI_DECLSPEC int MPI_Win_unlock_all(MPI_Win win); OMPI_DECLSPEC int MPI_Win_wait(MPI_Win win); OMPI_DECLSPEC double MPI_Wtick(void); OMPI_DECLSPEC double MPI_Wtime(void); - /* * Profiling MPI API */ @@ -1949,6 +2003,9 @@ OMPI_DECLSPEC int PMPI_Comm_spawn_multiple(int count, char *array_of_commands[] OMPI_DECLSPEC int PMPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm); OMPI_DECLSPEC int PMPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm); OMPI_DECLSPEC int PMPI_Comm_test_inter(MPI_Comm comm, int *flag); +OMPI_DECLSPEC int PMPI_Compare_and_swap(void *origin_addr, void *compare_addr, + void *result_addr, MPI_Datatype datatype, int target_rank, + MPI_Aint target_disp, MPI_Win win); OMPI_DECLSPEC int PMPI_Dims_create(int nnodes, int ndims, int dims[]); OMPI_DECLSPEC MPI_Fint PMPI_Errhandler_c2f(MPI_Errhandler errhandler); OMPI_DECLSPEC int PMPI_Errhandler_create(MPI_Handler_function *function, @@ -1964,6 +2021,8 @@ OMPI_DECLSPEC int PMPI_Error_class(int errorcode, int *errorclass); OMPI_DECLSPEC int PMPI_Error_string(int errorcode, char *string, int *resultlen); OMPI_DECLSPEC int PMPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Fetch_and_op(void *origin_addr, void *result_addr, MPI_Datatype datatype, + int target_rank, MPI_Aint target_disp, MPI_Op op, MPI_Win win); OMPI_DECLSPEC int PMPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request); #if OMPI_PROVIDE_MPI_FILE_INTERFACE @@ -2081,6 +2140,10 @@ OMPI_DECLSPEC int PMPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win); OMPI_DECLSPEC int PMPI_Get_library_version(char *version, int *resultlen); OMPI_DECLSPEC int PMPI_Get_processor_name(char *name, int *resultlen); OMPI_DECLSPEC int PMPI_Get_version(int *version, int *subversion); @@ -2228,6 +2291,9 @@ OMPI_DECLSPEC int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datat int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, 
MPI_Win win); OMPI_DECLSPEC int PMPI_Query_thread(int *provided); +OMPI_DECLSPEC int PMPI_Raccumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request); OMPI_DECLSPEC int PMPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); OMPI_DECLSPEC int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, @@ -2256,6 +2322,17 @@ OMPI_DECLSPEC MPI_Request PMPI_Request_f2c(MPI_Fint request); OMPI_DECLSPEC int PMPI_Request_free(MPI_Request *request); OMPI_DECLSPEC int PMPI_Request_get_status(MPI_Request request, int *flag, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Rget(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, + MPI_Win win, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Rget_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, + MPI_Win win, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request); OMPI_DECLSPEC int PMPI_Rsend(const void *ibuf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); OMPI_DECLSPEC int PMPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, @@ -2421,34 +2498,51 @@ OMPI_DECLSPEC int PMPI_Wait(MPI_Request *request, MPI_Status *status); OMPI_DECLSPEC int PMPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]); +OMPI_DECLSPEC int PMPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_attach(MPI_Win win, void *base, MPI_Aint size); OMPI_DECLSPEC MPI_Fint PMPI_Win_c2f(MPI_Win win); OMPI_DECLSPEC int PMPI_Win_call_errhandler(MPI_Win win, int errorcode); OMPI_DECLSPEC int PMPI_Win_complete(MPI_Win win); OMPI_DECLSPEC int PMPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win); OMPI_DECLSPEC int PMPI_Win_create_errhandler(MPI_Win_errhandler_function *function, MPI_Errhandler *errhandler); OMPI_DECLSPEC int PMPI_Win_create_keyval(MPI_Win_copy_attr_function *win_copy_attr_fn, MPI_Win_delete_attr_function *win_delete_attr_fn, int *win_keyval, void *extra_state); OMPI_DECLSPEC int PMPI_Win_delete_attr(MPI_Win win, int win_keyval); +OMPI_DECLSPEC int PMPI_Win_detach(MPI_Win win, void *base); OMPI_DECLSPEC MPI_Win PMPI_Win_f2c(MPI_Fint win); OMPI_DECLSPEC int PMPI_Win_fence(int assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush(int rank, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush_all(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush_local(int rank, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush_local_all(MPI_Win win); OMPI_DECLSPEC int PMPI_Win_free(MPI_Win *win); OMPI_DECLSPEC int
PMPI_Win_free_keyval(int *win_keyval); OMPI_DECLSPEC int PMPI_Win_get_attr(MPI_Win win, int win_keyval, void *attribute_val, int *flag); OMPI_DECLSPEC int PMPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandler); OMPI_DECLSPEC int PMPI_Win_get_group(MPI_Win win, MPI_Group *group); +OMPI_DECLSPEC int PMPI_Win_get_info(MPI_Win win, MPI_Info *info_used); OMPI_DECLSPEC int PMPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); OMPI_DECLSPEC int PMPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_lock_all(int assert, MPI_Win win); OMPI_DECLSPEC int PMPI_Win_post(MPI_Group group, int assert, MPI_Win win); OMPI_DECLSPEC int PMPI_Win_set_attr(MPI_Win win, int win_keyval, void *attribute_val); OMPI_DECLSPEC int PMPI_Win_set_errhandler(MPI_Win win, MPI_Errhandler errhandler); +OMPI_DECLSPEC int PMPI_Win_set_info(MPI_Win win, MPI_Info info); OMPI_DECLSPEC int PMPI_Win_set_name(MPI_Win win, const char *win_name); +OMPI_DECLSPEC int PMPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr); OMPI_DECLSPEC int PMPI_Win_start(MPI_Group group, int assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_sync(MPI_Win win); OMPI_DECLSPEC int PMPI_Win_test(MPI_Win win, int *flag); OMPI_DECLSPEC int PMPI_Win_unlock(int rank, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_unlock_all(MPI_Win win); OMPI_DECLSPEC int PMPI_Win_wait(MPI_Win win); OMPI_DECLSPEC double PMPI_Wtick(void); OMPI_DECLSPEC double PMPI_Wtime(void); diff --git a/ompi/include/mpif-values.pl b/ompi/include/mpif-values.pl index b87205f93e..166423dde7 100755 --- a/ompi/include/mpif-values.pl +++ b/ompi/include/mpif-values.pl @@ -320,7 +320,10 @@ $constants->{MPI_T_ERR_CVAR_SET_NEVER} = 64; $constants->{MPI_T_ERR_PVAR_NO_STARTSTOP} = 65; $constants->{MPI_T_ERR_PVAR_NO_WRITE} = 66; $constants->{MPI_T_ERR_PVAR_NO_ATOMIC} = 67; -$constants->{MPI_ERR_LASTCODE} = $constants->{MPI_T_ERR_PVAR_NO_ATOMIC}; +$constants->{MPI_ERR_RMA_RANGE} = 68; +$constants->{MPI_ERR_RMA_ATTACH} = 69; +$constants->{MPI_ERR_RMA_FLAVOR} = 70; +$constants->{MPI_ERR_LASTCODE} = $constants->{MPI_ERR_RMA_FLAVOR}; $constants->{MPI_ERR_SYSRESOURCE} = -2; diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 99c239dc67..078b42838b 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -227,6 +227,8 @@ enum { OMPI_OP_BASE_FORTRAN_MINLOC, /** Corresponds to Fortran MPI_REPLACE */ OMPI_OP_BASE_FORTRAN_REPLACE, + /** Corresponds to Fortran MPI_NO_OP */ + OMPI_OP_BASE_FORTRAN_NO_OP, /** Maximum value */ OMPI_OP_BASE_FORTRAN_OP_MAX diff --git a/ompi/mca/osc/base/base.h b/ompi/mca/osc/base/base.h index b6c7148cc2..680942e601 100644 --- a/ompi/mca/osc/base/base.h +++ b/ompi/mca/osc/base/base.h @@ -37,8 +37,13 @@ int ompi_osc_base_find_available(bool enable_progress_threads, bool enable_mpi_threads); int ompi_osc_base_select(ompi_win_t *win, + void **base, + size_t size, + int disp_unit, + ompi_communicator_t *comm, ompi_info_t *info, - ompi_communicator_t *comm); + int flavor, + int *model); int ompi_osc_base_finalize(void); diff --git a/ompi/mca/osc/base/osc_base_init.c b/ompi/mca/osc/base/osc_base_init.c index 8e02ac00c4..f322f74f4b 100644 --- a/ompi/mca/osc/base/osc_base_init.c +++ b/ompi/mca/osc/base/osc_base_init.c @@ -27,8 +27,13 @@ int ompi_osc_base_select(ompi_win_t *win, - ompi_info_t *info, - ompi_communicator_t *comm) + void **base, + size_t size, + int disp_unit, + ompi_communicator_t *comm, + ompi_info_t *info, + int flavor, + int *model) { opal_list_item_t *item; ompi_osc_base_component_t 
*best_component = NULL; @@ -45,7 +50,7 @@ ompi_osc_base_select(ompi_win_t *win, ompi_osc_base_component_t *component = (ompi_osc_base_component_t*) ((mca_base_component_list_item_t*) item)->cli_component; - priority = component->osc_query(win, info, comm); + priority = component->osc_query(win, base, size, disp_unit, comm, info, flavor); if (priority < 0) continue; if (priority > best_priority) { best_component = component; @@ -55,5 +60,5 @@ ompi_osc_base_select(ompi_win_t *win, if (NULL == best_component) return OMPI_ERR_NOT_SUPPORTED; - return best_component->osc_select(win, info, comm); + return best_component->osc_select(win, base, size, disp_unit, comm, info, flavor, model); } diff --git a/ompi/mca/osc/base/osc_base_obj_convert.c b/ompi/mca/osc/base/osc_base_obj_convert.c index 09fa9fe68d..cad8043176 100644 --- a/ompi/mca/osc/base/osc_base_obj_convert.c +++ b/ompi/mca/osc/base/osc_base_obj_convert.c @@ -239,3 +239,65 @@ ompi_osc_base_process_op(void *outbuf, return OMPI_SUCCESS; } + + + +int +ompi_osc_base_sndrcv_op(void *origin, + int32_t origin_count, + struct ompi_datatype_t *origin_dt, + void *target, + int32_t target_count, + struct ompi_datatype_t *target_dt, + ompi_op_t *op) +{ + if (ompi_datatype_is_predefined(origin_dt) && origin_dt == target_dt) { + ompi_op_reduce(op, origin, target, origin_count, origin_dt); + } else { + ompi_osc_base_convertor_t recv_convertor; + opal_convertor_t send_convertor; + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + int completed, length; + struct opal_convertor_master_t master = {NULL, 0, 0, 0, {0, }, NULL}; + + /* initialize send convertor */ + OBJ_CONSTRUCT(&send_convertor, opal_convertor_t); + opal_convertor_copy_and_prepare_for_send(ompi_proc_local()->proc_convertor, + &(origin_dt->super), origin_count, origin, 0, + &send_convertor); + + /* initialize recv convertor */ + OBJ_CONSTRUCT(&recv_convertor, ompi_osc_base_convertor_t); + recv_convertor.op = op; + recv_convertor.datatype = ompi_datatype_get_single_predefined_type_from_args(target_dt); + opal_convertor_copy_and_prepare_for_recv(ompi_proc_local()->proc_convertor, + &(target_dt->super), target_count, + target, 0, &recv_convertor.convertor); + + memcpy(&master, recv_convertor.convertor.master, sizeof(struct opal_convertor_master_t)); + master.next = recv_convertor.convertor.master; + master.pFunctions = (conversion_fct_t*) &ompi_osc_base_copy_functions; + recv_convertor.convertor.master = &master; + recv_convertor.convertor.fAdvance = opal_unpack_general; + + /* copy */ + iov.iov_len = length = 64 * 1024; + iov.iov_base = (IOVBASE_TYPE*)malloc( length * sizeof(char) ); + + completed = 0; + while(0 == completed) { + iov.iov_len = length; + iov_count = 1; + max_data = length; + completed |= opal_convertor_pack( &send_convertor, &iov, &iov_count, &max_data ); + completed |= opal_convertor_unpack( &recv_convertor.convertor, &iov, &iov_count, &max_data ); + } + free( iov.iov_base ); + OBJ_DESTRUCT( &send_convertor ); + OBJ_DESTRUCT( &recv_convertor ); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/base/osc_base_obj_convert.h b/ompi/mca/osc/base/osc_base_obj_convert.h index d5b587791e..7d36c33b32 100644 --- a/ompi/mca/osc/base/osc_base_obj_convert.h +++ b/ompi/mca/osc/base/osc_base_obj_convert.h @@ -117,4 +117,12 @@ OMPI_DECLSPEC int ompi_osc_base_process_op(void *outbuf, int count, ompi_op_t *op); +OMPI_DECLSPEC int ompi_osc_base_sndrcv_op(void *origin, + int32_t origin_count, + struct ompi_datatype_t *origin_dt, + void *target, + int32_t target_count, + struct 
ompi_datatype_t *target_dt, + ompi_op_t *op); + END_C_DECLS diff --git a/ompi/mca/osc/osc.h b/ompi/mca/osc/osc.h index eb6c0a56ec..dd116bf391 100644 --- a/ompi/mca/osc/osc.h +++ b/ompi/mca/osc/osc.h @@ -49,7 +49,7 @@ struct ompi_communicator_t; struct ompi_group_t; struct ompi_datatype_t; struct ompi_op_t; - +struct ompi_request_t; /* ******************************************************************** */ @@ -111,9 +111,12 @@ typedef int (*ompi_osc_base_component_finalize_fn_t)(void); * @retval >= 0 The priority of the component for this window */ typedef int (*ompi_osc_base_component_query_fn_t)(struct ompi_win_t *win, + void **base, + size_t size, + int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, - struct ompi_communicator_t *comm); - + int flavor); /** * OSC component select @@ -140,9 +143,13 @@ typedef int (*ompi_osc_base_component_query_fn_t)(struct ompi_win_t *win, * @retval OMPI_ERROR An unspecified error occurred */ typedef int (*ompi_osc_base_component_select_fn_t)(struct ompi_win_t *win, + void **base, + size_t size, + int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, - struct ompi_communicator_t *comm); - + int flavor, + int *model); /** * OSC component interface @@ -171,6 +178,11 @@ typedef ompi_osc_base_component_2_0_0_t ompi_osc_base_component_t; /* ******************************************************************** */ +typedef int (*ompi_osc_base_module_win_shared_query_fn_t)(struct ompi_win_t *win, int rank, + size_t *size, int *disp_unit, void *baseptr); + +typedef int (*ompi_osc_base_module_win_attach_fn_t)(struct ompi_win_t *win, void *base, size_t size); +typedef int (*ompi_osc_base_module_win_detach_fn_t)(struct ompi_win_t *win, void *base); /** * Free resources associated with win @@ -220,6 +232,80 @@ typedef int (*ompi_osc_base_module_accumulate_fn_t)(void *origin_addr, struct ompi_op_t *op, struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_compare_and_swap_fn_t)(void *origin_addr, + void *compare_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_fetch_and_op_fn_t)(void *origin_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_op_t *op, + struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_get_accumulate_fn_t)(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_rput_fn_t)(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +typedef int (*ompi_osc_base_module_rget_fn_t)(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + + +typedef int (*ompi_osc_base_module_raccumulate_fn_t)(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + 
struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +typedef int (*ompi_osc_base_module_rget_accumulate_fn_t)(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); typedef int (*ompi_osc_base_module_fence_fn_t)(int assert, struct ompi_win_t *win); @@ -249,10 +335,26 @@ typedef int (*ompi_osc_base_module_lock_fn_t)(int lock_type, int assert, struct ompi_win_t *win); - typedef int (*ompi_osc_base_module_unlock_fn_t)(int target, struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_lock_all_fn_t)(int assert, + struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_unlock_all_fn_t)(struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_sync_fn_t)(struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_flush_fn_t)(int target, + struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_flush_all_fn_t)(struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_flush_local_fn_t)(int target, + struct ompi_win_t *win); +typedef int (*ompi_osc_base_module_flush_local_all_fn_t)(struct ompi_win_t *win); + +typedef int (*ompi_osc_base_module_set_info_fn_t)(struct ompi_win_t *win, struct ompi_info_t *info); +typedef int (*ompi_osc_base_module_get_info_fn_t)(struct ompi_win_t *win, struct ompi_info_t **info_used); + + /* ******************************************************************** */ @@ -266,47 +368,58 @@ typedef int (*ompi_osc_base_module_unlock_fn_t)(int target, * free to create a structure that inherits this one for use as the * module structure. 
*/ -struct ompi_osc_base_module_1_0_0_t { - /** Free resources associated with the window */ +struct ompi_osc_base_module_3_0_0_t { + ompi_osc_base_module_win_shared_query_fn_t osc_win_shared_query; + + ompi_osc_base_module_win_attach_fn_t osc_win_attach; + ompi_osc_base_module_win_detach_fn_t osc_win_detach; ompi_osc_base_module_free_fn_t osc_free; - /** Implement MPI_PUT */ ompi_osc_base_module_put_fn_t osc_put; - /** Implement MPI_GET */ ompi_osc_base_module_get_fn_t osc_get; - /** Implement MPI_ACCUMULATE */ ompi_osc_base_module_accumulate_fn_t osc_accumulate; + ompi_osc_base_module_compare_and_swap_fn_t osc_compare_and_swap; + ompi_osc_base_module_fetch_and_op_fn_t osc_fetch_and_op; + ompi_osc_base_module_get_accumulate_fn_t osc_get_accumulate; + + ompi_osc_base_module_rput_fn_t osc_rput; + ompi_osc_base_module_rget_fn_t osc_rget; + ompi_osc_base_module_raccumulate_fn_t osc_raccumulate; + ompi_osc_base_module_rget_accumulate_fn_t osc_rget_accumulate; - /** Implement MPI_WIN_FENCE */ ompi_osc_base_module_fence_fn_t osc_fence; - /* Implement MPI_WIN_START */ ompi_osc_base_module_start_fn_t osc_start; - /* Implement MPI_WIN_COMPLETE */ ompi_osc_base_module_complete_fn_t osc_complete; - /* Implement MPI_WIN_POST */ ompi_osc_base_module_post_fn_t osc_post; - /* Implement MPI_WIN_WAIT */ ompi_osc_base_module_wait_fn_t osc_wait; - /* Implement MPI_WIN_TEST */ ompi_osc_base_module_test_fn_t osc_test; - /* Implement MPI_WIN_LOCK */ ompi_osc_base_module_lock_fn_t osc_lock; - /* Implement MPI_WIN_UNLOCK */ ompi_osc_base_module_unlock_fn_t osc_unlock; + ompi_osc_base_module_lock_all_fn_t osc_lock_all; + ompi_osc_base_module_unlock_all_fn_t osc_unlock_all; + + ompi_osc_base_module_sync_fn_t osc_sync; + ompi_osc_base_module_flush_fn_t osc_flush; + ompi_osc_base_module_flush_all_fn_t osc_flush_all; + ompi_osc_base_module_flush_local_fn_t osc_flush_local; + ompi_osc_base_module_flush_local_all_fn_t osc_flush_local_all; + + ompi_osc_base_module_set_info_fn_t osc_set_info; + ompi_osc_base_module_get_info_fn_t osc_get_info; }; -typedef struct ompi_osc_base_module_1_0_0_t ompi_osc_base_module_1_0_0_t; -typedef ompi_osc_base_module_1_0_0_t ompi_osc_base_module_t; +typedef struct ompi_osc_base_module_3_0_0_t ompi_osc_base_module_3_0_0_t; +typedef ompi_osc_base_module_3_0_0_t ompi_osc_base_module_t; /* ******************************************************************** */ /** Macro for use in components that are of type osc */ -#define OMPI_OSC_BASE_VERSION_2_0_0 \ +#define OMPI_OSC_BASE_VERSION_3_0_0 \ MCA_BASE_VERSION_2_0_0, \ - "osc", 2, 0, 0 + "osc", 3, 0, 0 /* ******************************************************************** */ diff --git a/ompi/mca/osc/portals4/Makefile.am b/ompi/mca/osc/portals4/Makefile.am new file mode 100644 index 0000000000..2631babf2f --- /dev/null +++ b/ompi/mca/osc/portals4/Makefile.am @@ -0,0 +1,43 @@ +# +# Copyright (c) 2011 Sandia National Laboratories. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = + +portals4_sources = \ + osc_portals4.h \ + osc_portals4_comm.c \ + osc_portals4_component.c \ + osc_portals4_active_target.c \ + osc_portals4_passive_target.c \ + osc_portals4_request.c + +AM_CPPFLAGS = $(osc_portals4_CPPFLAGS) + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_ompi_osc_portals4_DSO +component_noinst = +component_install = mca_osc_portals4.la +else +component_noinst = libmca_osc_portals4.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_osc_portals4_la_SOURCES = $(portals4_sources) +mca_osc_portals4_la_LIBADD = $(osc_portals4_LIBS) +mca_osc_portals4_la_LDFLAGS = -module -avoid-version $(osc_portals4_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_osc_portals4_la_SOURCES = $(portals4_sources) +libmca_osc_portals4_la_LIBADD = $(osc_portals4_LIBS) +libmca_osc_portals4_la_LDFLAGS = -module -avoid-version $(osc_portals4_LDFLAGS) diff --git a/ompi/mca/osc/portals4/configure.m4 b/ompi/mca/osc/portals4/configure.m4 new file mode 100644 index 0000000000..5e1b66ee79 --- /dev/null +++ b/ompi/mca/osc/portals4/configure.m4 @@ -0,0 +1,42 @@ +# -*- shell-script -*- +# +# Copyright (c) 2011 Sandia National Laboratories. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_osc_portals4_POST_CONFIG(will_build) +# ---------------------------------------- +# Only require the tag if we're actually going to be built +AC_DEFUN([MCA_ompi_osc_portals4_POST_CONFIG], [ + AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([PORTALS4])]) +])dnl + +# MCA_osc_portals4_CONFIG(action-if-can-compile, +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_ompi_osc_portals4_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/osc/portals4/Makefile]) + + OMPI_CHECK_PORTALS4([osc_portals4], + [osc_portals4_happy="yes"], + [osc_portals4_happy="no"]) + + AS_IF([test "$osc_portals4_happy" = "yes"], + [osc_portals4_WRAPPER_EXTRA_LDFLAGS="$osc_portals4_LDFLAGS" + osc_portals4_WRAPPER_EXTRA_LIBS="$osc_portals4_LIBS" + $1], + [$2]) + + # need to propagate CPPFLAGS to all of OMPI + AS_IF([test "$DIRECT_osc" = "portals4"], + [CPPFLAGS="$CPPFLAGS $osc_portals4_CPPFLAGS"]) + + # substitute in the things needed to build portals4 + AC_SUBST([osc_portals4_CPPFLAGS]) + AC_SUBST([osc_portals4_LDFLAGS]) + AC_SUBST([osc_portals4_LIBS]) +])dnl diff --git a/ompi/mca/osc/portals4/osc_portals4.h b/ompi/mca/osc/portals4/osc_portals4.h new file mode 100644 index 0000000000..884e234c7d --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4.h @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2011-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_PORTALS4_PORTALS4_H +#define OSC_PORTALS4_PORTALS4_H + +#include <portals4.h> +#include "ompi/class/ompi_free_list.h" +#include "ompi/group/group.h" +#include "ompi/communicator/communicator.h" + +#define OSC_PORTALS4_MB_DATA 0x0000000000000000ULL +#define OSC_PORTALS4_MB_CONTROL 0x1000000000000000ULL + +/* Component structure. There is one of these per process, per process lifetime. + * + * Currently, the Portals 4 one-sided implementation only uses a + * matching interface for all communication. There are plans for + * using a non-matching interface for a few windows (they each need + * their own PTE, which is a precious resource). In anticipation of + * that, we initialize the network interfaces and keep them in the + * component structure (for win create), but then also keep a handle + * copy in the window module, so that we can use the right structures + * once we add the non-matching support.
+ * + * The sizes are kept in the component structure because we can only + * find them during PtlNIInit, and it would be poor to do that for + * every window creation. Again, the window module has a copy of the + * max sizes, but tweaked to match the window configuration (ie, + * there's one atomic size, instead of an ordered and unordered size, + * since we know the ordering constraints during window creation). + */ +struct ompi_osc_portals4_component_t { + ompi_osc_base_component_t super; + + ptl_handle_ni_t matching_ni_h; + ptl_handle_eq_t matching_eq_h; + ptl_pt_index_t matching_pt_idx; + ptl_size_t matching_atomic_max; + ptl_size_t matching_fetch_atomic_max; + ptl_size_t matching_atomic_ordered_size; + + ompi_free_list_t requests; /* request free list for the r* communication variants */ +}; +typedef struct ompi_osc_portals4_component_t ompi_osc_portals4_component_t; +OMPI_DECLSPEC extern ompi_osc_portals4_component_t mca_osc_portals4_component; + +/* Data about me exposed to remote peers. Used in generalized active + target and passive target synchronization. */ +struct ompi_osc_portals4_node_state_t { + volatile int32_t post_count; + volatile int32_t complete_count; + volatile uint64_t lock; +}; +typedef struct ompi_osc_portals4_node_state_t ompi_osc_portals4_node_state_t; + +#define LOCK_ILLEGAL (0x4000000000000000ULL) +#define LOCK_UNLOCKED (0x0000000000000000ULL) +#define LOCK_EXCLUSIVE (0x0000000100000000ULL) + +/* Module structure. There is one of these per window */ +struct ompi_osc_portals4_module_t { + ompi_osc_base_module_t super; + void *free_after; /* if non-null, this pointer should be free()ed when window destroyed */ + struct ompi_communicator_t *comm; /* communicator which backs this window (unique to this window) */ + int disp_unit; /* if -1, have to look at disp_units */ + int *disp_units; /* array (possibly NULL!) of displacement units, per peer */ + ptl_handle_ni_t ni_h; /* network interface used by this window */ + ptl_pt_index_t pt_idx; /* portal table index used by this window (this will be same across window) */ + ptl_handle_ct_t ct_h; /* Counting event handle used for completion in this window */ +#if OMPI_PORTALS4_MAX_MD_SIZE < OMPI_PORTALS4_MAX_VA_SIZE + ptl_handle_md_t *md_h; /* memory descriptor describing all of memory used by this window */ + ptl_handle_md_t *req_md_h; /* memory descriptor with event completion used by this window */ +#else + ptl_handle_md_t md_h[1]; /* memory descriptor describing all of memory used by this window */ + ptl_handle_md_t req_md_h[1]; /* memory descriptor with event completion used by this window */ +#endif + ptl_handle_me_t data_me_h; /* data match list entry (MB are CID | OSC_PORTALS4_MB_DATA) */ + ptl_handle_me_t control_me_h; /* match list entry for control data (node_state_t). Match bits are (CID | OSC_PORTALS4_MB_CONTROL). */ + int64_t opcount; + ptl_match_bits_t match_bits; /* match bits for module. Same as cid for comm in most cases. */ + + ptl_size_t atomic_max; /* max size of atomic messages. Will guarantee ordering IF ordering requested */ + ptl_size_t fetch_atomic_max; /* max size of fetchatomic messages. Will guarantee ordering IF ordering requested */ + + /* variable containing specified value. 
Needed for atomic + increments so they can be non-blocking */ + int32_t zero; + int32_t one; + + ompi_group_t *start_group; + ompi_group_t *post_group; + opal_list_t outstanding_locks; + + /* things that are remotely accessible */ + ompi_osc_portals4_node_state_t state; +}; +typedef struct ompi_osc_portals4_module_t ompi_osc_portals4_module_t; + + +static inline size_t +get_displacement(ompi_osc_portals4_module_t *module, + int target) +{ + if (-1 == module->disp_unit) { + return module->disp_units[target]; + } else { + return module->disp_unit; + } +} + + +/* + * See note in ompi/mtl/portals4/mtl_portals4.h for how we deal with + * platforms that don't allow us to create an MD that covers all of + * memory. + */ +static inline void +ompi_osc_portals4_get_md(const void *ptr, const ptl_handle_md_t *array, + ptl_handle_md_t *md_h, void **base_ptr) +{ +#if OMPI_PORTALS4_MAX_MD_SIZE < OMPI_PORTALS4_MAX_VA_SIZE + int mask = (1ULL << (OMPI_PORTALS4_MAX_VA_SIZE - OMPI_PORTALS4_MAX_MD_SIZE + 1)) - 1; + int which = (((uintptr_t) ptr) >> (OMPI_PORTALS4_MAX_MD_SIZE - 1)) & mask; + *md_h = array[which]; + *base_ptr = (void*) (which * (1ULL << (OMPI_PORTALS4_MAX_MD_SIZE - 1))); +#else + *md_h = array[0]; + *base_ptr = 0; +#endif +} + + +static inline int +ompi_osc_portals4_get_num_mds(void) +{ +#if OMPI_PORTALS4_MAX_MD_SIZE < OMPI_PORTALS4_MAX_VA_SIZE + return (1 << (OMPI_PORTALS4_MAX_VA_SIZE - OMPI_PORTALS4_MAX_MD_SIZE + 1)); +#else + return 1; +#endif +} + + + +int ompi_osc_portals4_attach(struct ompi_win_t *win, void *base, size_t len); +int ompi_osc_portals4_detach(struct ompi_win_t *win, void *base); + +int ompi_osc_portals4_free(struct ompi_win_t *win); + +int ompi_osc_portals4_put(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_portals4_get(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_portals4_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_portals4_compare_and_swap(void *origin_addr, + void *compare_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_win_t *win); + +int ompi_osc_portals4_fetch_and_op(void *origin_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_portals4_get_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_portals4_rput(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_portals4_rget(void *origin_addr, + int origin_count, + struct
ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_portals4_raccumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_portals4_rget_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_portals4_fence(int assert, struct ompi_win_t *win); + +int ompi_osc_portals4_start(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win); + +int ompi_osc_portals4_complete(struct ompi_win_t *win); + +int ompi_osc_portals4_post(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win); + +int ompi_osc_portals4_wait(struct ompi_win_t *win); + +int ompi_osc_portals4_test(struct ompi_win_t *win, + int *flag); + +int ompi_osc_portals4_lock(int lock_type, + int target, + int assert, + struct ompi_win_t *win); + +int ompi_osc_portals4_unlock(int target, + struct ompi_win_t *win); + + +int ompi_osc_portals4_lock_all(int assert, + struct ompi_win_t *win); + +int ompi_osc_portals4_unlock_all(struct ompi_win_t *win); + +int ompi_osc_portals4_sync(struct ompi_win_t *win); + +int ompi_osc_portals4_flush(int target, + struct ompi_win_t *win); +int ompi_osc_portals4_flush_all(struct ompi_win_t *win); +int ompi_osc_portals4_flush_local(int target, + struct ompi_win_t *win); +int ompi_osc_portals4_flush_local_all(struct ompi_win_t *win); + +int ompi_osc_portals4_set_info(struct ompi_win_t *win, struct ompi_info_t *info); +int ompi_osc_portals4_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used); + +static inline int +ompi_osc_portals4_complete_all(ompi_osc_portals4_module_t *module) +{ + int ret; + ptl_ct_event_t event; + + ret = PtlCTWait(module->ct_h, module->opcount, &event); + if (PTL_OK != ret || 0 != event.failure) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: flush_all ct failure: ret=%d, failure=%d\n", + __FILE__, __LINE__, ret, (int) event.failure); + event.success = event.failure = 0; + PtlCTSet(module->ct_h, event); + module->opcount = 0; + } + assert(event.success == (size_t) module->opcount); + + PtlAtomicSync(); + + return ret; +} + +static inline ptl_process_t +ompi_osc_portals4_get_peer(ompi_osc_portals4_module_t *module, int rank) +{ + ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, rank); + return *((ptl_process_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]); +} + +static inline ptl_process_t +ompi_osc_portals4_get_peer_group(struct ompi_group_t *group, int rank) +{ + ompi_proc_t *proc = ompi_group_get_proc_ptr(group, rank); + return *((ptl_process_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]); +} + +#endif diff --git a/ompi/mca/osc/portals4/osc_portals4_active_target.c b/ompi/mca/osc/portals4/osc_portals4_active_target.c new file mode 100644 index 0000000000..19c7939441 --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4_active_target.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2011 Sandia 
National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_portals4.h" + +#include "ompi/mca/mtl/portals4/mtl_portals4_endpoint.h" + + +int +ompi_osc_portals4_fence(int assert, struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + int comm_ret, ret; + + comm_ret = ompi_osc_portals4_complete_all(module); + + ret = module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + return (OMPI_SUCCESS == comm_ret) ? ret : comm_ret; +} + + +int +ompi_osc_portals4_start(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + if (0 == (assert & MPI_MODE_NOCHECK)) { + int size; + + OBJ_RETAIN(group); + module->start_group = group; + size = ompi_group_size(module->start_group); + + while (module->state.post_count != size) opal_progress(); + } else { + module->start_group = NULL; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_complete(struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + int ret, i, size; + ptl_handle_md_t md_h; + void *base; + + ret = ompi_osc_portals4_complete_all(module); + if (ret != OMPI_SUCCESS) return ret; + + if (NULL != module->start_group) { + module->state.post_count = 0; + PtlAtomicSync(); + + ompi_osc_portals4_get_md(&module->one, module->md_h, &md_h, &base); + + size = ompi_group_size(module->start_group); + for (i = 0 ; i < size ; ++i) { + + ret = PtlAtomic(md_h, + (ptl_size_t) ((char*) &module->one - (char*) base), + sizeof(module->one), + PTL_ACK_REQ, + ompi_osc_portals4_get_peer_group(module->start_group, i), + module->pt_idx, + module->match_bits | OSC_PORTALS4_MB_CONTROL, + offsetof(ompi_osc_portals4_node_state_t, complete_count), + NULL, + 0, + PTL_SUM, + PTL_INT32_T); + if (ret != OMPI_SUCCESS) return ret; + OPAL_THREAD_ADD64(&module->opcount, 1); + } + + ret = ompi_osc_portals4_complete_all(module); + if (ret != OMPI_SUCCESS) return ret; + + OBJ_RELEASE(module->start_group); + module->start_group = NULL; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_post(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + int ret, i, size; + ptl_handle_md_t md_h; + void *base; + + if (0 == (assert & MPI_MODE_NOCHECK)) { + OBJ_RETAIN(group); + module->post_group = group; + + module->state.complete_count = 0; + PtlAtomicSync(); + + ompi_osc_portals4_get_md(&module->one, module->md_h, &md_h, &base); + + size = ompi_group_size(module->post_group); + for (i = 0 ; i < size ; ++i) { + ret = PtlAtomic(md_h, + (ptl_size_t) ((char*) &module->one - (char*) base), + sizeof(module->one), + PTL_ACK_REQ, + ompi_osc_portals4_get_peer_group(module->post_group, i), + module->pt_idx, + module->match_bits | OSC_PORTALS4_MB_CONTROL, + offsetof(ompi_osc_portals4_node_state_t, post_count), + NULL, + 0, + PTL_SUM, + PTL_INT32_T); + if (ret != OMPI_SUCCESS) return ret; + OPAL_THREAD_ADD64(&module->opcount, 1); + } + } else { + module->post_group = NULL; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_wait(struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t 
*module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + if (NULL != module->post_group) { + int size = ompi_group_size(module->post_group); + + while (module->state.complete_count != size) opal_progress(); + + OBJ_RELEASE(module->post_group); + module->post_group = NULL; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_test(struct ompi_win_t *win, + int *flag) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + if (NULL != module->post_group) { + int size = ompi_group_size(module->post_group); + + if (module->state.complete_count == size) { + OBJ_RELEASE(module->post_group); + module->post_group = NULL; + *flag = 1; + } + } else { + *flag = 0; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/portals4/osc_portals4_comm.c b/ompi/mca/osc/portals4/osc_portals4_comm.c new file mode 100644 index 0000000000..02be172a2a --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4_comm.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2011-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_portals4.h" +#include "osc_portals4_request.h" + +#include "ompi/mca/mtl/portals4/mtl_portals4_endpoint.h" + + +static int +ompi_osc_portals4_get_op(struct ompi_op_t *op, ptl_op_t *ptl_op) +{ + if (MPI_MAX == op) { + *ptl_op = PTL_MAX; + } else if (MPI_MIN == op) { + *ptl_op = PTL_MIN; + } else if (MPI_SUM == op) { + *ptl_op = PTL_SUM; + } else if (MPI_PROD == op) { + *ptl_op = PTL_PROD; + } else if (MPI_LAND == op) { + *ptl_op = PTL_LAND; + } else if (MPI_BAND == op) { + *ptl_op = PTL_BAND; + } else if (MPI_LOR == op) { + *ptl_op = PTL_LOR; + } else if (MPI_BOR == op) { + *ptl_op = PTL_BOR; + } else if (MPI_LXOR == op) { + *ptl_op = PTL_LXOR; + } else if (MPI_BXOR == op) { + *ptl_op = PTL_BXOR; + } else { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + + +static int +get_sized_type(bool sign, size_t size, ptl_datatype_t *ptl_dt) +{ + if (sign) { + switch (size) { + case 1: + *ptl_dt = PTL_INT8_T; + break; + case 2: + *ptl_dt = PTL_INT16_T; + break; + case 4: + *ptl_dt = PTL_INT32_T; + break; + case 8: + *ptl_dt = PTL_INT64_T; + break; + default: + return OMPI_ERROR; + } + } else { + switch (size) { + case 1: + *ptl_dt = PTL_UINT8_T; + break; + case 2: + *ptl_dt = PTL_UINT16_T; + break; + case 4: + *ptl_dt = PTL_UINT32_T; + break; + case 8: + *ptl_dt = PTL_UINT64_T; + break; + default: + return OMPI_ERROR; + } + } + + return OMPI_SUCCESS; +} + + +static int +ompi_osc_portals4_get_dt(struct ompi_datatype_t *dt, ptl_datatype_t *ptl_dt) +{ + ompi_datatype_t *base_dt = ompi_datatype_get_single_predefined_type_from_args(dt); + + if (MPI_BYTE == base_dt) { + *ptl_dt = PTL_INT8_T; + } else if (MPI_CHAR == base_dt) { + *ptl_dt = PTL_INT8_T; + } else if (MPI_SHORT == base_dt) { + return get_sized_type(true, sizeof(short), ptl_dt); + } else if (MPI_INT == base_dt) { + return get_sized_type(true, sizeof(int), ptl_dt); + } else if (MPI_LONG == base_dt) { + return get_sized_type(true, sizeof(long), ptl_dt); + } else if (MPI_FLOAT == base_dt) { + *ptl_dt = PTL_FLOAT; + } else if (MPI_DOUBLE == base_dt) { + *ptl_dt = PTL_DOUBLE; + } else if (MPI_LONG_DOUBLE == base_dt) { + *ptl_dt = PTL_LONG_DOUBLE; + } else if (MPI_UNSIGNED_CHAR == base_dt) { + *ptl_dt = PTL_UINT8_T; + } else if (MPI_SIGNED_CHAR == base_dt) { + 
*ptl_dt = PTL_INT8_T; + } else if (MPI_UNSIGNED_SHORT == base_dt) { + return get_sized_type(false, sizeof(short), ptl_dt); + } else if (MPI_UNSIGNED_LONG == base_dt) { + return get_sized_type(false, sizeof(long), ptl_dt); + } else if (MPI_UNSIGNED == base_dt) { + return get_sized_type(false, sizeof(int), ptl_dt); +#if OPAL_HAVE_LONG_LONG + } else if (MPI_LONG_LONG_INT == base_dt) { + return get_sized_type(true, sizeof(long long int), ptl_dt); + } else if (MPI_LONG_LONG == base_dt) { + return get_sized_type(true, sizeof(long long), ptl_dt); +#endif + } else if (MPI_INT8_T == base_dt) { + *ptl_dt = PTL_INT8_T; + } else if (MPI_UINT8_T == base_dt) { + *ptl_dt = PTL_UINT8_T; + } else if (MPI_INT16_T == base_dt) { + *ptl_dt = PTL_INT16_T; + } else if (MPI_UINT16_T == base_dt) { + *ptl_dt = PTL_UINT16_T; + } else if (MPI_INT32_T == base_dt) { + *ptl_dt = PTL_INT32_T; + } else if (MPI_UINT32_T == base_dt) { + *ptl_dt = PTL_UINT32_T; + } else if (MPI_INT64_T == base_dt) { + *ptl_dt = PTL_INT64_T; + } else if (MPI_UINT64_T == base_dt) { + *ptl_dt = PTL_UINT64_T; +#if HAVE_FLOAT__COMPLEX + } else if (MPI_C_COMPLEX == base_dt) { + *ptl_dt = PTL_FLOAT_COMPLEX; + } else if (MPI_C_FLOAT_COMPLEX == base_dt) { + *ptl_dt = PTL_FLOAT_COMPLEX; +#endif +#if HAVE_DOUBLE__COMPLEX + } else if (MPI_C_DOUBLE_COMPLEX == base_dt) { + *ptl_dt = PTL_DOUBLE_COMPLEX; +#endif +#if HAVE_LONG_DOUBLE__COMPLEX + } else if (MPI_C_LONG_DOUBLE_COMPLEX == base_dt) { + *ptl_dt = PTL_LONG_DOUBLE_COMPLEX; +#endif + } else if (MPI_AINT == base_dt) { + if (sizeof(MPI_Aint) == 2) { + *ptl_dt = PTL_UINT16_T; + } else if (sizeof(MPI_Aint) == 4) { + *ptl_dt = PTL_UINT32_T; + } else if (sizeof(MPI_Aint) == 8) { + *ptl_dt = PTL_UINT64_T; + } + } else { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_rput(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_portals4_request_t *request; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length; + size_t offset; + ptl_handle_md_t md_h; + void *md_base; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rput: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Rput: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + opal_atomic_add_64(&module->opcount, 1); + request->ops_expected = 1; + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + length *= origin_count; + ompi_osc_portals4_get_md(origin_addr, module->req_md_h, &md_h, &md_base); + ret = PtlPut(md_h, + (ptl_size_t) ((char*) 
origin_addr - (char*) md_base), + length, + PTL_ACK_REQ, + peer, + module->pt_idx, + module->match_bits, + offset, + request, + 0); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_rget(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_portals4_request_t *request; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length; + size_t offset; + ptl_handle_md_t md_h; + void *md_base; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Rget: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + opal_atomic_add_64(&module->opcount, 1); + request->ops_expected = 1; + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + length *= origin_count; + ompi_osc_portals4_get_md(origin_addr, module->req_md_h, &md_h, &md_base); + ret = PtlGet(md_h, + (ptl_size_t) ((char*) origin_addr - (char*) md_base), + length, + peer, + module->pt_idx, + module->match_bits, + offset, + request); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_raccumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_portals4_request_t *request; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length, sent; + size_t offset; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dt; + ptl_handle_md_t md_h; + void *md_base; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "raccumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) 
{ + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Raccumulate: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + ptl_size_t md_offset; + + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + length *= origin_count; + sent = 0; + + ompi_osc_portals4_get_md(origin_addr, module->req_md_h, &md_h, &md_base); + md_offset = ((char*) origin_addr - (char*) md_base); + + do { + size_t msg_length = MIN(module->atomic_max, length - sent); + opal_atomic_add_64(&module->opcount, 1); + request->ops_expected++; + + if (MPI_REPLACE == op) { + ret = PtlPut(md_h, + md_offset + sent, + msg_length, + PTL_ACK_REQ, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + request, + 0); + } else { + ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_osc_portals4_get_op(op, &ptl_op); + if (OMPI_SUCCESS != ret) return ret; + + ret = PtlAtomic(md_h, + offset + sent, + msg_length, + PTL_ACK_REQ, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + request, + 0, + ptl_op, + ptl_dt); + } + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + sent += msg_length; + } while (sent < length); + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_rget_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_dt, + int target, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_portals4_request_t *request; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length, sent; + size_t offset; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dt; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget_accumulate: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, (unsigned long) result_addr, + result_count, result_dt->name, + target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(result_dt, result_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Rget_accumulate: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + sent = 0; + + if (MPI_REPLACE == op) { + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + length *= origin_count; + + ompi_osc_portals4_get_md(result_addr, 
module->req_md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); + + ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + + do { + size_t msg_length = MIN(module->fetch_atomic_max, length - sent); + + opal_atomic_add_64(&module->opcount, 1); + request->ops_expected++; + + ret = PtlSwap(result_md_h, + result_md_offset + sent, + origin_md_h, + origin_md_offset + sent, + msg_length, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + request, + 0, + NULL, + PTL_SWAP, + ptl_dt); + sent += msg_length; + } while (sent < length); + } else if (MPI_NO_OP == op) { + ptl_handle_md_t md_h; + void *md_base; + ptl_size_t md_offset; + + ret = ompi_datatype_type_size(target_dt, &length); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + length *= target_count; + + ompi_osc_portals4_get_md(result_addr, module->req_md_h, &md_h, &md_base); + md_offset = ((char*) result_addr - (char*) md_base); + + do { + size_t msg_length = MIN(module->fetch_atomic_max, length - sent); + + opal_atomic_add_64(&module->opcount, 1); + request->ops_expected++; + + ret = PtlGet(md_h, + md_offset + sent, + msg_length, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + request); + sent += msg_length; + } while (sent < length); + } else { + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + length *= origin_count; + + ompi_osc_portals4_get_md(result_addr, module->req_md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); + + ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_osc_portals4_get_op(op, &ptl_op); + if (OMPI_SUCCESS != ret) return ret; + + do { + size_t msg_length = MIN(module->fetch_atomic_max, length - sent); + + opal_atomic_add_64(&module->opcount, 1); + request->ops_expected++; + + ret = PtlFetchAtomic(result_md_h, + result_md_offset + sent, + origin_md_h, + origin_md_offset + sent, + msg_length, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + request, + 0, + ptl_op, + ptl_dt); + sent += msg_length; + } while (sent < length); + } + if (OMPI_SUCCESS != ret) { + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + return ret; + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_put(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length; + size_t offset; + ptl_handle_md_t md_h; + void *md_base; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "put: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + 
offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Put: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + opal_atomic_add_64(&module->opcount, 1); + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + return ret; + } + length *= origin_count; + ompi_osc_portals4_get_md(origin_addr, module->md_h, &md_h, &md_base); + ret = PtlPut(md_h, + (ptl_size_t) ((char*) origin_addr - (char*) md_base), + length, + PTL_ACK_REQ, + peer, + module->pt_idx, + module->match_bits, + offset, + NULL, + 0); + if (OMPI_SUCCESS != ret) { + return ret; + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_get(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length; + size_t offset; + ptl_handle_md_t md_h; + void *md_base; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "get: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Get: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + opal_atomic_add_64(&module->opcount, 1); + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + return ret; + } + length *= origin_count; + ompi_osc_portals4_get_md(origin_addr, module->md_h, &md_h, &md_base); + ret = PtlGet(md_h, + (ptl_size_t) ((char*) origin_addr - (char*) md_base), + length, + peer, + module->pt_idx, + module->match_bits, + offset, + NULL); + if (OMPI_SUCCESS != ret) { + return ret; + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length, sent; + size_t offset; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dt; + ptl_handle_md_t md_h; + void *md_base; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, 
target_count)) { + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Accumulate: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + ptl_size_t md_offset; + + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + return ret; + } + length *= origin_count; + sent = 0; + + ompi_osc_portals4_get_md(origin_addr, module->md_h, &md_h, &md_base); + md_offset = ((char*) origin_addr - (char*) md_base); + + do { + size_t msg_length = MIN(module->atomic_max, length - sent); + opal_atomic_add_64(&module->opcount, 1); + + if (MPI_REPLACE == op) { + ret = PtlPut(md_h, + md_offset + sent, + msg_length, + PTL_ACK_REQ, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + NULL, + 0); + } else { + ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_osc_portals4_get_op(op, &ptl_op); + if (OMPI_SUCCESS != ret) return ret; + + ret = PtlAtomic(md_h, + md_offset + sent, + msg_length, + PTL_ACK_REQ, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + NULL, + 0, + ptl_op, + ptl_dt); + } + if (OMPI_SUCCESS != ret) { + return ret; + } + sent += msg_length; + } while (sent < length); + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_get_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_dt, + int target, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length, sent; + size_t offset; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dt; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "get_accumulate: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, (unsigned long) result_addr, + result_count, result_dt->name, + target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + offset = get_displacement(module, target) * target_disp; + + if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) || + !ompi_datatype_is_contiguous_memory_layout(result_dt, result_count) || + !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + opal_output(ompi_osc_base_framework.framework_output, + "MPI_Get_accumulate: transfer of non-contiguous memory is not currently supported.\n"); + return OMPI_ERR_NOT_SUPPORTED; + } else { + sent = 0; + if (MPI_REPLACE == op) { + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + return ret; + } + length *= origin_count; + + ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); + + do { + size_t msg_length = MIN(module->fetch_atomic_max, length - sent); + + opal_atomic_add_64(&module->opcount, 1); + + ret = PtlSwap(result_md_h, + result_md_offset + sent, + origin_md_h, + 
origin_md_offset + sent, + msg_length, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + NULL, + 0, + NULL, + ptl_op, + ptl_dt); + sent += msg_length; + } while (sent < length); + } else if (MPI_NO_OP == op) { + ptl_handle_md_t md_h; + void *md_base; + ptl_size_t md_offset; + + ret = ompi_datatype_type_size(target_dt, &length); + if (OMPI_SUCCESS != ret) { + return ret; + } + length *= target_count; + + ompi_osc_portals4_get_md(result_addr, module->md_h, &md_h, &md_base); + md_offset = ((char*) result_addr - (char*) md_base); + + do { + size_t msg_length = MIN(module->fetch_atomic_max, length - sent); + + opal_atomic_add_64(&module->opcount, 1); + + ret = PtlGet(md_h, + md_offset + sent, + msg_length, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + NULL); + sent += msg_length; + } while (sent < length); + } else { + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + ret = ompi_datatype_type_size(origin_dt, &length); + if (OMPI_SUCCESS != ret) { + return ret; + } + length *= origin_count; + + ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); + + ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_osc_portals4_get_op(op, &ptl_op); + if (OMPI_SUCCESS != ret) return ret; + + + do { + size_t msg_length = MIN(module->fetch_atomic_max, length - sent); + + opal_atomic_add_64(&module->opcount, 1); + + ret = PtlFetchAtomic(result_md_h, + result_md_offset + sent, + origin_md_h, + origin_md_offset + sent, + msg_length, + peer, + module->pt_idx, + module->match_bits, + offset + sent, + NULL, + 0, + ptl_op, + ptl_dt); + sent += msg_length; + } while (sent < length); + } + if (OMPI_SUCCESS != ret) { + return ret; + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_compare_and_swap(void *origin_addr, + void *compare_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length; + size_t offset; + ptl_datatype_t ptl_dt; + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "compare_and_swap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, 0x%lx", + (unsigned long) origin_addr, + (unsigned long) compare_addr, + (unsigned long) result_addr, + dt->name, target, (int) target_disp, + (unsigned long) win)); + + ret = ompi_osc_portals4_get_dt(dt, &ptl_dt); + if (OMPI_SUCCESS != ret) return ret; + + offset = get_displacement(module, target) * target_disp; + + ret = ompi_datatype_type_size(dt, &length); + if (OMPI_SUCCESS != ret) return ret; + + assert(length < module->fetch_atomic_max); + + ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); 
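+    /* compare-and-swap maps onto a single PtlSwap in PTL_CSWAP mode: compare_addr supplies the comparison operand, the previous target value is fetched into result_addr, and the assert above guarantees length fits in one fetch-atomic operation, so no chunking loop is needed */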
+ + opal_atomic_add_64(&module->opcount, 1); + + ret = PtlSwap(result_md_h, + result_md_offset, + origin_md_h, + origin_md_offset, + length, + peer, + module->pt_idx, + module->match_bits, + offset, + NULL, + 0, + compare_addr, + PTL_CSWAP, + ptl_dt); + if (OMPI_SUCCESS != ret) { + return ret; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_fetch_and_op(void *origin_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_op_t *op, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ptl_process_t peer = ompi_osc_portals4_get_peer(module, target); + size_t length; + size_t offset; + ptl_op_t ptl_op; + ptl_datatype_t ptl_dt; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "fetch_and_op: 0x%lx, 0x%lx, %s, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, + (unsigned long) result_addr, + dt->name, target, (int) target_disp, + op->o_name, + (unsigned long) win)); + + ret = ompi_osc_portals4_get_dt(dt, &ptl_dt); + if (OMPI_SUCCESS != ret) return ret; + + offset = get_displacement(module, target) * target_disp; + + ret = ompi_datatype_type_size(dt, &length); + if (OMPI_SUCCESS != ret) return ret; + + assert(length < module->fetch_atomic_max); + + opal_atomic_add_64(&module->opcount, 1); + + if (MPI_REPLACE == op) { + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); + + ret = PtlSwap(result_md_h, + result_md_offset, + origin_md_h, + origin_md_offset, + length, + peer, + module->pt_idx, + module->match_bits, + offset, + NULL, + 0, + NULL, + PTL_SWAP, + ptl_dt); + } else if (MPI_NO_OP == op) { + ptl_handle_md_t md_h; + void *md_base; + ptl_size_t md_offset; + + ompi_osc_portals4_get_md(result_addr, module->md_h, &md_h, &md_base); + md_offset = ((char*) result_addr - (char*) md_base); + + ret = PtlGet(md_h, + md_offset, + length, + peer, + module->pt_idx, + module->match_bits, + offset, + NULL); + } else { + ptl_handle_md_t result_md_h, origin_md_h; + void *result_md_base, *origin_md_base; + ptl_size_t result_md_offset, origin_md_offset; + + ret = ompi_osc_portals4_get_op(op, &ptl_op); + if (OMPI_SUCCESS != ret) return ret; + + ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base); + result_md_offset = ((char*) result_addr - (char*) result_md_base); + ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base); + origin_md_offset = ((char*) origin_addr - (char*) origin_md_base); + + ret = PtlFetchAtomic(result_md_h, + result_md_offset, + origin_md_h, + origin_md_offset, + length, + peer, + module->pt_idx, + module->match_bits, + offset, + NULL, + 0, + ptl_op, + ptl_dt); + } + if (OMPI_SUCCESS != ret) { + return ret; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/portals4/osc_portals4_component.c b/ompi/mca/osc/portals4/osc_portals4_component.c new file mode 100644 index 0000000000..0d0b03b163 --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4_component.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2011-2013 Sandia National Laboratories. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "ompi/request/request.h" +#include "ompi/class/ompi_free_list.h" + +#include "osc_portals4.h" +#include "osc_portals4_request.h" + +static int component_open(void); +static int component_register(void); +static int component_init(bool enable_progress_threads, bool enable_mpi_threads); +static int component_finalize(void); +static int component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor); +static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor, int *model); + + +ompi_osc_portals4_component_t mca_osc_portals4_component = { + { /* ompi_osc_base_component_t */ + { /* ompi_base_component_t */ + OMPI_OSC_BASE_VERSION_3_0_0, + "portals4", + OMPI_MAJOR_VERSION, /* MCA component major version */ + OMPI_MINOR_VERSION, /* MCA component minor version */ + OMPI_RELEASE_VERSION, /* MCA component release version */ + component_open, + NULL, + NULL, + component_register + }, + { /* mca_base_component_data */ + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + component_init, + component_query, + component_select, + component_finalize + } +}; + + +ompi_osc_portals4_module_t ompi_osc_portals4_module_template = { + { + NULL, /* shared_query */ + + ompi_osc_portals4_attach, + ompi_osc_portals4_detach, + ompi_osc_portals4_free, + + ompi_osc_portals4_put, + ompi_osc_portals4_get, + ompi_osc_portals4_accumulate, + ompi_osc_portals4_compare_and_swap, + ompi_osc_portals4_fetch_and_op, + ompi_osc_portals4_get_accumulate, + + ompi_osc_portals4_rput, + ompi_osc_portals4_rget, + ompi_osc_portals4_raccumulate, + ompi_osc_portals4_rget_accumulate, + + ompi_osc_portals4_fence, + + ompi_osc_portals4_start, + ompi_osc_portals4_complete, + ompi_osc_portals4_post, + ompi_osc_portals4_wait, + ompi_osc_portals4_test, + + ompi_osc_portals4_lock, + ompi_osc_portals4_unlock, + ompi_osc_portals4_lock_all, + ompi_osc_portals4_unlock_all, + + ompi_osc_portals4_sync, + ompi_osc_portals4_flush, + ompi_osc_portals4_flush_all, + ompi_osc_portals4_flush_local, + ompi_osc_portals4_flush_local_all, + + ompi_osc_portals4_set_info, + ompi_osc_portals4_get_info + } +}; + + +/* look up parameters for configuring this window. The code first + looks in the info structure passed by the user, then through mca + parameters. 
*/ +static bool +check_config_value_bool(char *key, ompi_info_t *info) +{ + char *value_string; + int value_len, ret, flag, param; + const bool *flag_value; + bool result; + + ret = ompi_info_get_valuelen(info, key, &value_len, &flag); + if (OMPI_SUCCESS != ret) goto info_not_found; + if (flag == 0) goto info_not_found; + value_len++; + + value_string = (char*)malloc(sizeof(char) * value_len + 1); /* Should malloc 1 char for NUL-termination */ + if (NULL == value_string) goto info_not_found; + + ret = ompi_info_get(info, key, value_len, value_string, &flag); + if (OMPI_SUCCESS != ret) { + free(value_string); + goto info_not_found; + } + assert(flag != 0); + ret = ompi_info_value_to_bool(value_string, &result); + free(value_string); + if (OMPI_SUCCESS != ret) goto info_not_found; + return result; + + info_not_found: + param = mca_base_var_find("ompi", "osc", "portals4", key); + if (0 > param) return false; + + ret = mca_base_var_get_value(param, &flag_value, NULL, NULL); + if (OMPI_SUCCESS != ret) return false; + + return flag_value[0]; +} + + +static bool +check_config_value_equal(char *key, ompi_info_t *info, char *value) +{ + char *value_string; + int value_len, ret, flag, param; + const bool *flag_value; + bool result = false; + + ret = ompi_info_get_valuelen(info, key, &value_len, &flag); + if (OMPI_SUCCESS != ret) goto info_not_found; + if (flag == 0) goto info_not_found; + value_len++; + + value_string = (char*)malloc(sizeof(char) * value_len + 1); /* Should malloc 1 char for NUL-termination */ + if (NULL == value_string) goto info_not_found; + + ret = ompi_info_get(info, key, value_len, value_string, &flag); + if (OMPI_SUCCESS != ret) { + free(value_string); + goto info_not_found; + } + assert(flag != 0); + if (0 == strcmp(value_string, value)) result = true; + free(value_string); + return result; + + info_not_found: + param = mca_base_var_find("ompi", "osc", "portals4", key); + if (0 > param) return false; + + ret = mca_base_var_get_value(param, &flag_value, NULL, NULL); + if (OMPI_SUCCESS != ret) return false; + + if (0 == strcmp(value_string, value)) result = true; + + return result; +} + + +static int +progress_callback(void) +{ + int ret, count = 0; + ptl_event_t ev; + ompi_osc_portals4_request_t *req; + int32_t ops; + + while (true) { + ret = PtlEQGet(mca_osc_portals4_component.matching_eq_h, &ev); + if (PTL_OK == ret) { + goto process; + } else if (PTL_EQ_DROPPED == ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlEQGet reported dropped event", + __FILE__, __LINE__); + goto process; + } else if (PTL_EQ_EMPTY == ret) { + return 0; + } else { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlEQGet failed: %d\n", + __FILE__, __LINE__, ret); + return 0; + } + +process: + if (ev.ni_fail_type != PTL_OK) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: event failure: %d %d", + __FILE__, __LINE__, ev.type, ev.ni_fail_type); + return 0; + } + + count++; + + if (NULL != ev.user_ptr) { + /* can't disable send events, but they don't count in ops */ + if (ev.type == PTL_EVENT_SEND) continue; + req = (ompi_osc_portals4_request_t*) ev.user_ptr; + opal_atomic_add_size_t(&req->super.req_status._ucount, ev.mlength); + ops = opal_atomic_add_32(&req->ops_committed, 1); + if (ops == req->ops_expected) { + OPAL_THREAD_LOCK(&ompi_request_lock); + ompi_request_complete(&req->super, true); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + } + } + } + + return count; +} + + +static int +component_open(void) +{ + 
return OMPI_SUCCESS; +} + + +static int +component_register(void) +{ + bool ompi_osc_portals4_no_locks = false; + (void) mca_base_component_var_register(&mca_osc_portals4_component.super.osc_version, + "no_locks", + "Enable optimizations available only if MPI_LOCK is " + "not used. " + "Info key of same name overrides this value.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_osc_portals4_no_locks); + + return OMPI_SUCCESS; +} + + +static int +component_init(bool enable_progress_threads, bool enable_mpi_threads) +{ + int ret; + ptl_ni_limits_t actual; + + ret = PtlInit(); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlInit failed: %d\n", + __FILE__, __LINE__, ret); + return OMPI_ERROR; + } + + ret = PtlNIInit(PTL_IFACE_DEFAULT, + PTL_NI_PHYSICAL | PTL_NI_MATCHING, + PTL_PID_ANY, + NULL, + &actual, + &mca_osc_portals4_component.matching_ni_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlNIInit failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } + + /* BWB: FIX ME: Need to make sure our ID matches with the MTL... */ + + mca_osc_portals4_component.matching_atomic_max = actual.max_atomic_size; + mca_osc_portals4_component.matching_fetch_atomic_max = actual.max_fetch_atomic_size; + mca_osc_portals4_component.matching_atomic_ordered_size = + MAX(actual.max_waw_ordered_size, actual.max_war_ordered_size); + + ret = PtlEQAlloc(mca_osc_portals4_component.matching_ni_h, + 4096, + &mca_osc_portals4_component.matching_eq_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlEQAlloc failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } + + ret = PtlPTAlloc(mca_osc_portals4_component.matching_ni_h, + 0, + mca_osc_portals4_component.matching_eq_h, + 4, + &mca_osc_portals4_component.matching_pt_idx); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlPTAlloc failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } + + OBJ_CONSTRUCT(&mca_osc_portals4_component.requests, ompi_free_list_t); + ret = ompi_free_list_init(&mca_osc_portals4_component.requests, + sizeof(ompi_osc_portals4_request_t), + OBJ_CLASS(ompi_osc_portals4_request_t), + 8, + 0, + 8, + NULL); + if (OMPI_SUCCESS != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: ompi_free_list_init failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } + + ret = opal_progress_register(progress_callback); + if (OMPI_SUCCESS != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: opal_progress_register failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } + + return OMPI_SUCCESS; +} + + +static int +component_finalize(void) +{ + PtlNIFini(mca_osc_portals4_component.matching_ni_h); + + return OMPI_SUCCESS; +} + + +static int +component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor) +{ + if (MPI_WIN_FLAVOR_SHARED == flavor) return -1; + + return 20; +} + + +static int +component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor, int *model) +{ + ompi_osc_portals4_module_t *module = NULL; + int ret = OMPI_ERROR; + int tmp; + ptl_md_t md; + ptl_me_t me; + char *name; + + if (MPI_WIN_FLAVOR_SHARED == flavor) 
return OMPI_ERR_NOT_SUPPORTED; + + /* create module structure */ + module = (ompi_osc_portals4_module_t*) + calloc(1, sizeof(ompi_osc_portals4_module_t)); + if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + /* fill in the function pointer part */ + memcpy(module, &ompi_osc_portals4_module_template, + sizeof(ompi_osc_base_module_t)); + + /* fill in our part */ + if (MPI_WIN_FLAVOR_ALLOCATE == flavor) { + module->free_after = *base = malloc(size); + if (NULL == *base) goto error; + } else { + module->free_after = NULL; + } + + ret = ompi_comm_dup(comm, &module->comm); + if (OMPI_SUCCESS != ret) goto error; + + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "portals4 component creating window with id %d", + ompi_comm_get_cid(module->comm)); + + asprintf(&name, "portals4 window %d", ompi_comm_get_cid(module->comm)); + ompi_win_set_name(win, name); + free(name); + + /* share everyone's displacement units. Only do an allgather if + strictly necessary, since it requires O(p) state. */ + tmp = disp_unit; + ret = module->comm->c_coll.coll_bcast(&tmp, 1, MPI_INT, 0, + module->comm, + module->comm->c_coll.coll_bcast_module); + if (OMPI_SUCCESS != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: MPI_Bcast failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + tmp = (tmp == disp_unit) ? 1 : 0; + ret = module->comm->c_coll.coll_allreduce(MPI_IN_PLACE, &tmp, 1, MPI_INT, MPI_LAND, + module->comm, module->comm->c_coll.coll_allreduce_module); + if (OMPI_SUCCESS != ret) goto error; + if (tmp == 1) { + module->disp_unit = disp_unit; + module->disp_units = NULL; + } else { + module->disp_unit = -1; + module->disp_units = malloc(sizeof(int) * ompi_comm_size(module->comm)); + ret = module->comm->c_coll.coll_allgather(&disp_unit, 1, MPI_INT, + module->disp_units, 1, MPI_INT, + module->comm, + module->comm->c_coll.coll_allgather_module); + if (OMPI_SUCCESS != ret) goto error; + } + + module->ni_h = mca_osc_portals4_component.matching_ni_h; + module->pt_idx = mca_osc_portals4_component.matching_pt_idx; + + ret = PtlCTAlloc(module->ni_h, &(module->ct_h)); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlCTAlloc failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + +#if OMPI_PORTALS4_MAX_MD_SIZE < OMPI_PORTALS4_MAX_VA_SIZE + { + int i; + int num_mds = ompi_mtl_portals4_get_num_mds(); + ptl_size_t size = 1ULL << OMPI_PORTALS4_MAX_MD_SIZE; + ptl_size_t offset_unit = (1ULL << OMPI_PORTALS4_MAX_MD_SIZE) / 2; + + module->md_h = malloc(sizeof(ptl_handle_md_t) * num_mds); + if (NULL == module->md_h) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + for (i = 0 ; i < num_mds ; ++i) { + module->md_h[i] = PTL_INVALID_HANDLE; + } + + module->req_md_h = malloc(sizeof(ptl_handle_md_t) * num_mds); + if (NULL == module->req_md_h) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + for (i = 0 ; i < num_mds ; ++i) { + module->req_md_h[i] = PTL_INVALID_HANDLE; + } + + for (i = 0 ; i < num_mds ; ++i) { + md.start = (char*) (offset_unit * i); + md.length = (i - 1 == num_mds) ? 
size / 2 : size; + + md.options = PTL_MD_EVENT_SUCCESS_DISABLE | PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK; + md.eq_handle = mca_osc_portals4_component.matching_eq_h; + md.ct_handle = module->ct_h; + ret = PtlMDBind(module->ni_h, &md, &module->md_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlMDBind failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + + md.options = PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK; + md.eq_handle = mca_osc_portals4_component.matching_eq_h; + md.ct_handle = module->ct_h; + ret = PtlMDBind(module->ni_h, &md, &module->req_md_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlMDBind failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + } +#else + md.start = 0; + md.length = PTL_SIZE_MAX; + md.options = PTL_MD_EVENT_SUCCESS_DISABLE | PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK; + md.eq_handle = mca_osc_portals4_component.matching_eq_h; + md.ct_handle = module->ct_h; + ret = PtlMDBind(module->ni_h, &md, &module->md_h[0]); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlMDBind failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + + md.start = 0; + md.length = PTL_SIZE_MAX; + md.options = PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK; + md.eq_handle = mca_osc_portals4_component.matching_eq_h; + md.ct_handle = module->ct_h; + ret = PtlMDBind(module->ni_h, &md, &module->req_md_h[0]); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlMDBind failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } +#endif + + if (MPI_WIN_FLAVOR_DYNAMIC == flavor) { + me.start = 0; + me.length = SIZE_MAX; + } else { + me.start = *base; + me.length = size; + } + me.ct_handle = PTL_CT_NONE; + me.uid = PTL_UID_ANY; + me.options = PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_NO_TRUNCATE | PTL_ME_EVENT_SUCCESS_DISABLE; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = module->comm->c_contextid; + me.ignore_bits = 0; + + ret = PtlMEAppend(module->ni_h, + module->pt_idx, + &me, + PTL_PRIORITY_LIST, + NULL, + &module->data_me_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlMEAppend failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + + me.start = &module->state; + me.length = sizeof(module->state); + me.ct_handle = PTL_CT_NONE; + me.uid = PTL_UID_ANY; + me.options = PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_NO_TRUNCATE | PTL_ME_EVENT_SUCCESS_DISABLE; + me.match_id.phys.nid = PTL_NID_ANY; + me.match_id.phys.pid = PTL_PID_ANY; + me.match_bits = module->comm->c_contextid | OSC_PORTALS4_MB_CONTROL; + me.ignore_bits = 0; + + ret = PtlMEAppend(module->ni_h, + module->pt_idx, + &me, + PTL_PRIORITY_LIST, + NULL, + &module->control_me_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: PtlMEAppend failed: %d\n", + __FILE__, __LINE__, ret); + goto error; + } + + module->opcount = 0; + module->match_bits = module->comm->c_contextid; + module->atomic_max = (check_config_value_equal("accumulate_ordering", info, "none")) ? + mca_osc_portals4_component.matching_atomic_max : + MIN(mca_osc_portals4_component.matching_atomic_max, + mca_osc_portals4_component.matching_atomic_ordered_size); + module->fetch_atomic_max = (check_config_value_equal("accumulate_ordering", info, "none")) ? 
+ mca_osc_portals4_component.matching_fetch_atomic_max : + MIN(mca_osc_portals4_component.matching_fetch_atomic_max, + mca_osc_portals4_component.matching_atomic_ordered_size); + + module->zero = 0; + module->one = 1; + module->start_group = NULL; + module->post_group = NULL; + + module->state.post_count = 0; + module->state.complete_count = 0; + if (check_config_value_bool("no_locks", info)) { + module->state.lock = LOCK_ILLEGAL; + } else { + module->state.lock = LOCK_UNLOCKED; + } + + OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t); + +#if OPAL_ASSEMBLY_ARCH == OMPI_AMD64 || OPAL_ASSEMBLY_ARCH == IA32 + *model = MPI_WIN_UNIFIED; +#else + *model = MPI_WIN_SEPARATE; +#endif + + win->w_osc_module = &module->super; + + PtlAtomicSync(); + + /* Make sure that everyone's ready to receive. */ + module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + + return OMPI_SUCCESS; + + error: + /* BWB: FIX ME: This is all wrong... */ + if (0 != module->ct_h) PtlCTFree(module->ct_h); + if (0 != module->data_me_h) PtlMEUnlink(module->data_me_h); +#if OMPI_PORTALS4_MAX_MD_SIZE < OMPI_PORTALS4_MAX_VA_SIZE + /* BWB: FIX ME */ +#else + if (0 != module->req_md_h) PtlMDRelease(module->req_md_h[0]); + if (0 != module->md_h) PtlMDRelease(module->md_h[0]); +#endif + if (NULL != module->comm) ompi_comm_free(&module->comm); + if (NULL != module) free(module); + + return ret; +} + + +int +ompi_osc_portals4_attach(struct ompi_win_t *win, void *base, size_t len) +{ + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_detach(struct ompi_win_t *win, void *base) +{ + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_free(struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + int ret = OMPI_SUCCESS; + + /* synchronize */ + module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + + /* cleanup */ + PtlMEUnlink(module->data_me_h); +#if OMPI_PORTALS4_MAX_MD_SIZE < OMPI_PORTALS4_MAX_VA_SIZE + /* BWB: FIX ME */ +#else + PtlMDRelease(module->md_h[0]); + PtlMDRelease(module->req_md_h[0]); +#endif + PtlCTFree(module->ct_h); + if (NULL != module->disp_units) free(module->disp_units); + ompi_comm_free(&module->comm); + if (NULL != module->free_after) free(module->free_after); + + if (!opal_list_is_empty(&module->outstanding_locks)) { + ret = MPI_ERR_RMA_SYNC; + } + OBJ_DESTRUCT(&module->outstanding_locks); + + free(module); + + return ret; +} + + +int +ompi_osc_portals4_set_info(struct ompi_win_t *win, struct ompi_info_t *info) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + /* enforce collectiveness... */ + return module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); +} + + +int +ompi_osc_portals4_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + ompi_info_t *info = OBJ_NEW(ompi_info_t); + if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + ompi_info_set(info, "no_locks", (module->state.lock == LOCK_ILLEGAL) ? 
"true" : "false"); + if (module->atomic_max < mca_osc_portals4_component.matching_atomic_max) { + ompi_info_set(info, "accumulate_ordering", "none"); + } else { + ompi_info_set(info, "accumulate_ordering", "rar,war,raw,waw"); + } + + *info_used = info; + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/portals4/osc_portals4_passive_target.c b/ompi/mca/osc/portals4/osc_portals4_passive_target.c new file mode 100644 index 0000000000..39d164e0c5 --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4_passive_target.c @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2011-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_portals4.h" + +#include "ompi/mca/mtl/portals4/mtl_portals4_endpoint.h" + +enum locktype_t { + lock_nocheck, + lock_exclusive, + lock_shared +}; + +struct ompi_osc_portals4_outstanding_lock_t { + opal_list_item_t super; + int target; + enum locktype_t lock_type; +}; +typedef struct ompi_osc_portals4_outstanding_lock_t ompi_osc_portals4_outstanding_lock_t; +OBJ_CLASS_INSTANCE(ompi_osc_portals4_outstanding_lock_t, opal_list_item_t, + NULL, NULL); + +static inline int +lk_cas64(ompi_osc_portals4_module_t *module, + int target, + int64_t write_val, + int64_t comp_val, + int64_t *result_val) +{ + int ret; + size_t offset = offsetof(ompi_osc_portals4_node_state_t, lock); + ptl_handle_md_t result_md_h, write_md_h; + void *result_base, *write_base; + + opal_atomic_add_64(&module->opcount, 1); + + ompi_osc_portals4_get_md(result_val, module->md_h, &result_md_h, &result_base); + ompi_osc_portals4_get_md(&write_val, module->md_h, &write_md_h, &write_base); + + ret = PtlSwap(result_md_h, + (char*) result_val - (char*) result_base, + write_md_h, + (char*) write_val - (char*) write_base, + sizeof(int64_t), + ompi_osc_portals4_get_peer(module, target), + module->pt_idx, + module->match_bits | OSC_PORTALS4_MB_CONTROL, + offset, + NULL, + 0, + &comp_val, + PTL_CSWAP, + PTL_INT64_T); + if (OMPI_SUCCESS != ret) { + return ret; + } + + ret = ompi_osc_portals4_complete_all(module); + return ret; +} + + +static inline int +lk_write64(ompi_osc_portals4_module_t *module, + int target, + int64_t write_val) +{ + int ret; + size_t offset = offsetof(ompi_osc_portals4_node_state_t, lock); + ptl_handle_md_t md_h; + void *base; + + opal_atomic_add_64(&module->opcount, 1); + + ompi_osc_portals4_get_md(&write_val, module->md_h, &md_h, &base); + + ret = PtlPut(md_h, + (char*) &write_val - (char*) base, + sizeof(int64_t), + PTL_ACK_REQ, + ompi_osc_portals4_get_peer(module, target), + module->pt_idx, + module->match_bits | OSC_PORTALS4_MB_CONTROL, + offset, + NULL, + 0); + if (OMPI_SUCCESS != ret) { + return ret; + } + + ret = ompi_osc_portals4_complete_all(module); + return ret; +} + + +static inline int +lk_add64(ompi_osc_portals4_module_t *module, + int target, + int64_t write_val, + int64_t *result_val) +{ + int ret; + size_t offset = offsetof(ompi_osc_portals4_node_state_t, lock); + ptl_handle_md_t result_md_h, write_md_h; + void *result_base, *write_base; + + opal_atomic_add_64(&module->opcount, 1); + + ompi_osc_portals4_get_md(result_val, module->md_h, &result_md_h, &result_base); + ompi_osc_portals4_get_md(&write_val, module->md_h, &write_md_h, &write_base); + + ret = PtlFetchAtomic(result_md_h, + (char*) &result_val - (char*) result_base, + write_md_h, + (char*) 
&write_val - (char*) write_base, + sizeof(int64_t), + ompi_osc_portals4_get_peer(module, target), + module->pt_idx, + module->match_bits | OSC_PORTALS4_MB_CONTROL, + offset, + NULL, + 0, + PTL_SUM, + PTL_INT64_T); + if (OMPI_SUCCESS != ret) { + return ret; + } + + ret = ompi_osc_portals4_complete_all(module); + return ret; +} + + +static inline int +start_exclusive(ompi_osc_portals4_module_t *module, + int target) +{ + int64_t result; + int ret; + + while (true) { + ret = lk_cas64(module, target, LOCK_EXCLUSIVE, 0, &result); + if (OMPI_SUCCESS != ret) return ret; + if (LOCK_ILLEGAL == (LOCK_ILLEGAL & result)) return MPI_ERR_RMA_SYNC; + if (0 == result) break; + } + + return OMPI_SUCCESS; +} + + +static inline int +end_exclusive(ompi_osc_portals4_module_t *module, + int target) +{ + int ret; + + ret = lk_write64(module, target, LOCK_UNLOCKED); + return ret; +} + + +static inline int +start_shared(ompi_osc_portals4_module_t *module, + int target) +{ + int64_t result; + int ret; + + while (true) { + ret = lk_add64(module, target, 1, &result); + if (OMPI_SUCCESS != ret) return ret; + + if (result > (int64_t)LOCK_EXCLUSIVE) { + if (LOCK_ILLEGAL == (LOCK_ILLEGAL & result)) return MPI_ERR_RMA_SYNC; + ret = lk_add64(module, target, -1, &result); + if (OMPI_SUCCESS != ret) return ret; + } else { + break; + } + } + + return OMPI_SUCCESS; +} + + +static inline int +end_shared(ompi_osc_portals4_module_t *module, + int target) +{ + int64_t result; + int ret; + + ret = lk_add64(module, target, -1, &result); + return ret; +} + + +int +ompi_osc_portals4_lock(int lock_type, + int target, + int assert, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ompi_osc_portals4_outstanding_lock_t* lock; + int ret; + + lock = OBJ_NEW(ompi_osc_portals4_outstanding_lock_t); + lock->target = target; + + if (0 == (assert & MPI_MODE_NOCHECK)) { + if (MPI_LOCK_EXCLUSIVE == lock_type) { + lock->lock_type = lock_exclusive; + ret = start_exclusive(module, target); + } else { + lock->lock_type = lock_shared; + ret = start_shared(module, target); + } + } else { + lock->lock_type = lock_nocheck; + ret = OMPI_SUCCESS; + } + + if (OMPI_SUCCESS == ret) { + opal_list_append(&module->outstanding_locks, &lock->super); + } else { + OBJ_RELEASE(lock); + } + + return ret; +} + + +int +ompi_osc_portals4_unlock(int target, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ompi_osc_portals4_outstanding_lock_t *lock = NULL, *item; + int ret; + + OPAL_LIST_FOREACH(item, &module->outstanding_locks, + ompi_osc_portals4_outstanding_lock_t) { + if (item->target == target) { + lock = item; + break; + } + } + if (NULL != item) { + opal_list_remove_item(&module->outstanding_locks, &lock->super); + } else { + return MPI_ERR_RMA_SYNC; + } + + ret = ompi_osc_portals4_complete_all(module); + if (ret != OMPI_SUCCESS) return ret; + + if (lock->lock_type == lock_exclusive) { + ret = end_exclusive(module, target); + } else if (lock->lock_type == lock_shared) { + ret = end_shared(module, target); + } else { + ret = OMPI_SUCCESS; + } + + OBJ_RELEASE(lock); + + return ret; +} + + +int +ompi_osc_portals4_lock_all(int assert, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ompi_osc_portals4_outstanding_lock_t* lock; + int ret = OMPI_SUCCESS; + + lock = OBJ_NEW(ompi_osc_portals4_outstanding_lock_t); + lock->target = -1; + + if (0 == (assert & 
MPI_MODE_NOCHECK)) { + int i, comm_size; + + lock->lock_type = lock_shared; + comm_size = ompi_comm_size(module->comm); + + for (i = 0 ; i < comm_size ; ++i) { + ret |= start_shared(module, i); + } + } else { + lock->lock_type = lock_nocheck; + ret = OMPI_SUCCESS; + } + + if (OMPI_SUCCESS == ret) { + opal_list_append(&module->outstanding_locks, &lock->super); + } else { + OBJ_RELEASE(lock); + } + + return ret; +} + + +int +ompi_osc_portals4_unlock_all(struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + ompi_osc_portals4_outstanding_lock_t *lock = NULL, *item; + int ret; + + OPAL_LIST_FOREACH(item, &module->outstanding_locks, + ompi_osc_portals4_outstanding_lock_t) { + if (item->target == -1) { + lock = item; + break; + } + } + if (NULL != item) { + opal_list_remove_item(&module->outstanding_locks, &lock->super); + } else { + return MPI_ERR_RMA_SYNC; + } + + ret = ompi_osc_portals4_complete_all(module); + if (ret != OMPI_SUCCESS) return ret; + + if (lock->lock_type == lock_shared) { + int i, comm_size; + + comm_size = ompi_comm_size(module->comm); + + for (i = 0 ; i < comm_size ; ++i) { + ret |= end_shared(module, i); + } + } + + OBJ_RELEASE(lock); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_sync(struct ompi_win_t *win) +{ + /* Not sure this is strictly necessary, but why not? */ + opal_atomic_mb(); + PtlAtomicSync(); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_portals4_flush(int target, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + return ompi_osc_portals4_complete_all(module); +} + + +int +ompi_osc_portals4_flush_all(struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + return ompi_osc_portals4_complete_all(module); +} + + +int +ompi_osc_portals4_flush_local(int target, + struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + return ompi_osc_portals4_complete_all(module); +} + + +int +ompi_osc_portals4_flush_local_all(struct ompi_win_t *win) +{ + ompi_osc_portals4_module_t *module = + (ompi_osc_portals4_module_t*) win->w_osc_module; + + return ompi_osc_portals4_complete_all(module); +} diff --git a/ompi/mca/osc/portals4/osc_portals4_request.c b/ompi/mca/osc/portals4/osc_portals4_request.c new file mode 100644 index 0000000000..a6ba1aeb15 --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4_request.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/request/request.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_portals4.h" +#include "osc_portals4_request.h" + +static int +request_cancel(struct ompi_request_t *request, int complete) +{ + return MPI_ERR_REQUEST; +} + +static int +request_free(struct ompi_request_t **ompi_req) +{ + ompi_osc_portals4_request_t *request = + (ompi_osc_portals4_request_t*) *ompi_req; + + if (true != request->super.req_complete) { + return MPI_ERR_REQUEST; + } + + OMPI_OSC_PORTALS4_REQUEST_RETURN(request); + + *ompi_req = MPI_REQUEST_NULL; + + return OMPI_SUCCESS; +} + +static +void +request_construct(ompi_osc_portals4_request_t *request) +{ + request->super.req_type = OMPI_REQUEST_WIN; + request->super.req_status._cancelled = 0; + request->super.req_free = request_free; + request->super.req_cancel = request_cancel; +} + +OBJ_CLASS_INSTANCE(ompi_osc_portals4_request_t, + ompi_request_t, + request_construct, + NULL); diff --git a/ompi/mca/osc/portals4/osc_portals4_request.h b/ompi/mca/osc/portals4/osc_portals4_request.h new file mode 100644 index 0000000000..c2a5c3b9fc --- /dev/null +++ b/ompi/mca/osc/portals4/osc_portals4_request.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_PORTALS4_REQUEST_H +#define OSC_PORTALS4_REQUEST_H + +#include "ompi/request/request.h" + +struct ompi_osc_portals4_request_t { + ompi_request_t super; + int32_t ops_expected; + volatile int32_t ops_committed; +}; +typedef struct ompi_osc_portals4_request_t ompi_osc_portals4_request_t; + +OBJ_CLASS_DECLARATION(ompi_osc_portals4_request_t); + +#define OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, req) \ + do { \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_WAIT_MT(&mca_osc_portals4_component.requests, \ + item); \ + req = (ompi_osc_portals4_request_t*) item; \ + OMPI_REQUEST_INIT(&req->super, false); \ + req->super.req_mpi_object.win = win; \ + req->super.req_complete = false; \ + req->super.req_state = OMPI_REQUEST_ACTIVE; \ + req->ops_expected = 0; \ + req->ops_committed = 0; \ + } while (0) + +#define OMPI_OSC_PORTALS4_REQUEST_RETURN(req) \ + do { \ + OMPI_REQUEST_FINI(&request->super); \ + OMPI_FREE_LIST_RETURN_MT(&mca_osc_portals4_component.requests, \ + (ompi_free_list_item_t*) req); \ + } while (0) + + +#endif diff --git a/ompi/mca/osc/pt2pt/Makefile.am b/ompi/mca/osc/pt2pt/Makefile.am deleted file mode 100644 index a65625e343..0000000000 --- a/ompi/mca/osc/pt2pt/Makefile.am +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University. -# All rights reserved. -# Copyright (c) 2004-2005 The Trustees of the University of Tennessee. -# All rights reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
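The ALLOC/RETURN macros in osc_portals4_request.h above wrap the component's ompi_free_list of request objects, and request_free() in osc_portals4_request.c refuses to release a request that has not completed. The fragment below is a hypothetical illustration of how a request-based operation could use them; example_rput_setup() and its nops parameter are invented for this sketch, and the actual Portals transfers plus the ops_expected/ops_committed completion accounting are elided.

#include "ompi_config.h"
#include "ompi/request/request.h"
#include "osc_portals4.h"
#include "osc_portals4_request.h"

/* Hypothetical helper (not part of this patch): allocate a portals4 OSC
 * request for an operation that will generate nops completion events. */
static int example_rput_setup(struct ompi_win_t *win, int nops,
                              struct ompi_request_t **request)
{
    ompi_osc_portals4_request_t *req;

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, req);     /* waits on the component free list */
    if (NULL == req) {
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    req->ops_expected = nops;   /* completion events still outstanding */
    /* ... issue the Portals put(s); each completion would bump ops_committed,
     * and the request is marked complete once it reaches ops_expected ... */

    *request = &req->super;     /* hand back as a plain ompi_request_t */
    return OMPI_SUCCESS;
}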
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -pt2pt_sources = \ - osc_pt2pt.h \ - osc_pt2pt.c \ - osc_pt2pt_buffer.h \ - osc_pt2pt_buffer.c \ - osc_pt2pt_comm.c \ - osc_pt2pt_component.c \ - osc_pt2pt_data_move.h \ - osc_pt2pt_data_move.c \ - osc_pt2pt_header.h \ - osc_pt2pt_longreq.h \ - osc_pt2pt_longreq.c \ - osc_pt2pt_replyreq.h \ - osc_pt2pt_replyreq.c \ - osc_pt2pt_sendreq.h \ - osc_pt2pt_sendreq.c \ - osc_pt2pt_sync.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_ompi_osc_pt2pt_DSO -component_noinst = -component_install = mca_osc_pt2pt.la -else -component_noinst = libmca_osc_pt2pt.la -component_install = -endif - -mcacomponentdir = $(ompilibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_osc_pt2pt_la_SOURCES = $(pt2pt_sources) -mca_osc_pt2pt_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_osc_pt2pt_la_SOURCES = $(pt2pt_sources) -libmca_osc_pt2pt_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt.c b/ompi/mca/osc/pt2pt/osc_pt2pt.c deleted file mode 100644 index 2d1ea1c0df..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_pt2pt.h" -#include "osc_pt2pt_sendreq.h" - -#include "ompi/mca/osc/base/base.h" -#include "opal/threads/mutex.h" -#include "ompi/win/win.h" -#include "ompi/communicator/communicator.h" -#include "ompi/request/request.h" -#include "mpi.h" - - -int -ompi_osc_pt2pt_module_free(ompi_win_t *win) -{ - int ret = OMPI_SUCCESS; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - - opal_output_verbose(1, ompi_osc_base_framework.framework_output, - "pt2pt component destroying window with id %d", - ompi_comm_get_cid(module->p2p_comm)); - - /* finish with a barrier */ - if (ompi_group_size(win->w_group) > 1) { - ret = module->p2p_comm->c_coll.coll_barrier(module->p2p_comm, - module->p2p_comm->c_coll.coll_barrier_module); - } - - win->w_osc_module = NULL; - - OBJ_DESTRUCT(&module->p2p_unlocks_pending); - OBJ_DESTRUCT(&module->p2p_locks_pending); - OBJ_DESTRUCT(&module->p2p_copy_pending_sendreqs); - OBJ_DESTRUCT(&module->p2p_pending_sendreqs); - OBJ_DESTRUCT(&module->p2p_acc_lock); - OBJ_DESTRUCT(&module->p2p_cond); - OBJ_DESTRUCT(&module->p2p_lock); - - if (NULL != module->p2p_sc_remote_ranks) { - free(module->p2p_sc_remote_ranks); - } - if (NULL != module->p2p_sc_remote_active_ranks) { - free(module->p2p_sc_remote_active_ranks); - } - if (NULL != module->p2p_fence_coll_counts) { - free(module->p2p_fence_coll_counts); - } - if (NULL != module->p2p_copy_num_pending_sendreqs) { - free(module->p2p_copy_num_pending_sendreqs); - } - if (NULL != module->p2p_num_pending_sendreqs) { - free(module->p2p_num_pending_sendreqs); - } - if (NULL != module->p2p_comm) ompi_comm_free(&module->p2p_comm); - -#if OPAL_ENABLE_DEBUG - memset(module, 0, sizeof(ompi_osc_base_module_t)); -#endif - if (NULL != module) free(module); - - return ret; -} diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt.h b/ompi/mca/osc/pt2pt/osc_pt2pt.h deleted file mode 100644 index 48e9455d80..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt.h +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2006 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_OSC_PT2PT_H -#define OMPI_OSC_PT2PT_H - -#include "ompi_config.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_free_list.h" -#include "opal/class/opal_hash_table.h" -#include "opal/threads/threads.h" - -#include "ompi/win/win.h" -#include "ompi/communicator/communicator.h" -#include "ompi/request/request.h" -#include "ompi/mca/osc/osc.h" - -BEGIN_C_DECLS - -#define CONTROL_MSG_TAG (-200) - -struct ompi_osc_pt2pt_component_t { - /** Extend the basic osc component interface */ - ompi_osc_base_component_t super; - - /** max size of eager message */ - unsigned long long p2p_c_eager_size; - - /** free list of ompi_osc_pt2pt_sendreq_t structures */ - opal_free_list_t p2p_c_sendreqs; - /** free list of ompi_osc_pt2pt_replyreq_t structures */ - opal_free_list_t p2p_c_replyreqs; - /** free list of ompi_osc_pt2pt_longreq_t structures */ - opal_free_list_t p2p_c_longreqs; - /** free list for eager / control meessages */ - opal_free_list_t p2p_c_buffers; -}; -typedef struct ompi_osc_pt2pt_component_t ompi_osc_pt2pt_component_t; - - -struct ompi_osc_pt2pt_module_t { - /** Extend the basic osc module interface */ - ompi_osc_base_module_t super; - - /** lock access to data structures in the current module */ - opal_mutex_t p2p_lock; - - /** condition variable for access to current module */ - opal_condition_t p2p_cond; - - /** lock for "atomic" window updates from reductions */ - opal_mutex_t p2p_acc_lock; - - /** pointer back to window */ - ompi_win_t *p2p_win; - - /** communicator created with this window */ - ompi_communicator_t *p2p_comm; - - /** list of ompi_osc_pt2pt_sendreq_t structures, and includes all - requests for this access epoch that have not already been - started. p2p_lock must be held when modifying this field. */ - opal_list_t p2p_pending_sendreqs; - - /** list of unsigned int counters for the number of requests to a - particular rank in p2p_comm for this access epoc. p2p_lock - must be held when modifying this field */ - unsigned int *p2p_num_pending_sendreqs; - - /** For MPI_Fence synchronization, the number of messages to send - in epoch. For Start/Complete, the number of updates for this - Complete. For lock, the number of - messages waiting for completion on on the origin side. Not - protected by p2p_lock - must use atomic counter operations. */ - int32_t p2p_num_pending_out; - - /** For MPI_Fence synchronization, the number of expected incoming - messages. For Post/Wait, the number of expected updates from - complete. For lock, the number of messages on the passive side - we are waiting for. Not protected by p2p_lock - must use - atomic counter operations. */ - int32_t p2p_num_pending_in; - - /** Number of "ping" messages from the remote post group we've - received */ - int32_t p2p_num_post_msgs; - - /** Number of "count" messages from the remote complete group - we've received */ - int32_t p2p_num_complete_msgs; - - /** cyclic counter for a unique tag for long messages. Not - protected by the p2p_lock - must use create_send_tag() to - create a send tag */ - volatile int32_t p2p_tag_counter; - - opal_list_t p2p_copy_pending_sendreqs; - unsigned int *p2p_copy_num_pending_sendreqs; - - /* ********************* FENCE data ************************ */ - /* an array of ints, each containing the value - 1. 
*/ - int *p2p_fence_coll_counts; - - /* ********************* PWSC data ************************ */ - struct ompi_group_t *p2p_pw_group; - struct ompi_group_t *p2p_sc_group; - bool *p2p_sc_remote_active_ranks; - int *p2p_sc_remote_ranks; - - /* ********************* LOCK data ************************ */ - int32_t p2p_lock_status; /* one of 0, MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED */ - int32_t p2p_shared_count; - opal_list_t p2p_locks_pending; - opal_list_t p2p_unlocks_pending; - int32_t p2p_lock_received_ack; -}; -typedef struct ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_t; -OMPI_MODULE_DECLSPEC extern ompi_osc_pt2pt_component_t mca_osc_pt2pt_component; - - -/** - * Helper macro for grabbing the module structure from a window instance - */ -#define P2P_MODULE(win) ((ompi_osc_pt2pt_module_t*) win->w_osc_module) - -/* - * Component functions - */ - -int ompi_osc_pt2pt_component_init(bool enable_progress_threads, - bool enable_mpi_threads); - -int ompi_osc_pt2pt_component_finalize(void); - -int ompi_osc_pt2pt_component_query(struct ompi_win_t *win, - struct ompi_info_t *info, - struct ompi_communicator_t *comm); - -int ompi_osc_pt2pt_component_select(struct ompi_win_t *win, - struct ompi_info_t *info, - struct ompi_communicator_t *comm); - -/* helper function that properly sets up request handling */ -int ompi_osc_pt2pt_component_irecv(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *data); - -int ompi_osc_pt2pt_component_isend(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int dest, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *data); - -/* - * Module interface function types - */ -int ompi_osc_pt2pt_module_free(struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_put(void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_accumulate(void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, - struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_get(void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_fence(int assert, struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_start(struct ompi_group_t *group, - int assert, - struct ompi_win_t *win); -int ompi_osc_pt2pt_module_complete(struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_post(struct ompi_group_t *group, - int assert, - struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_wait(struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_test(struct ompi_win_t *win, - int *flag); - -int ompi_osc_pt2pt_module_lock(int lock_type, - int target, - int assert, - struct ompi_win_t *win); - -int ompi_osc_pt2pt_module_unlock(int target, - struct ompi_win_t *win); - -/* - * passive side sync interface functions - */ -int ompi_osc_pt2pt_passive_lock(ompi_osc_pt2pt_module_t *module, - int32_t origin, - int32_t lock_type); - -int ompi_osc_pt2pt_passive_unlock(ompi_osc_pt2pt_module_t *module, - 
int32_t origin, - int32_t count); - -int ompi_osc_pt2pt_passive_unlock_complete(ompi_osc_pt2pt_module_t *module); - -END_C_DECLS - -#endif /* OMPI_OSC_PT2PT_H */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_buffer.c b/ompi/mca/osc/pt2pt/osc_pt2pt_buffer.c deleted file mode 100644 index 5b7d6cdffe..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_buffer.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "opal/class/opal_free_list.h" -#include "opal/types.h" - -#include "osc_pt2pt_buffer.h" - -static void ompi_osc_pt2pt_buffer_construct(ompi_osc_pt2pt_buffer_t *buf) -{ - /* adjust payload location to account for alignment issues */ - buf->payload = (void* )(((char*) buf) + - sizeof(ompi_osc_pt2pt_buffer_t) + - (sizeof(ompi_osc_pt2pt_buffer_t) % sizeof(ompi_ptr_t))); -} - - -static void ompi_osc_pt2pt_buffer_destruct(ompi_osc_pt2pt_buffer_t *buf) -{ - buf->payload = NULL; -} - - -OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_buffer_t, opal_free_list_item_t, - ompi_osc_pt2pt_buffer_construct, - ompi_osc_pt2pt_buffer_destruct); - diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_buffer.h b/ompi/mca/osc/pt2pt/osc_pt2pt_buffer.h deleted file mode 100644 index d091d54b97..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_buffer.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2006 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_OSC_PT2PT_BUFFER_H -#define OMPI_OSC_PT2PT_BUFFER_H - -#include "opal/class/opal_free_list.h" -#include "ompi/request/request.h" - -BEGIN_C_DECLS - -struct ompi_osc_pt2pt_buffer_t { - ompi_free_list_item_t super; - - ompi_request_t *request; - void *data; - void *payload; - size_t len; -}; -typedef struct ompi_osc_pt2pt_buffer_t ompi_osc_pt2pt_buffer_t; -OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_buffer_t); - -END_C_DECLS - -#endif diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c b/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c deleted file mode 100644 index 059dbab4f1..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_comm.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" -#include "mpi.h" - -#include - -#include "opal/class/opal_list.h" -#include "opal/threads/mutex.h" -#include "osc_pt2pt.h" -#include "osc_pt2pt_sendreq.h" -#include "osc_pt2pt_header.h" -#include "osc_pt2pt_data_move.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/op/op.h" -#include "ompi/win/win.h" -#include "ompi/memchecker.h" - -static int -enqueue_sendreq(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t *sendreq) -{ - OPAL_THREAD_LOCK(&(module->p2p_lock)); - opal_list_append(&(module->p2p_pending_sendreqs), - (opal_list_item_t*) sendreq); - module->p2p_num_pending_sendreqs[sendreq->req_target_rank]++; - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - return OMPI_SUCCESS; -} - - - -int -ompi_osc_pt2pt_module_accumulate(void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, ompi_win_t *win) -{ - int ret; - ompi_osc_pt2pt_sendreq_t *sendreq; - - if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) && - (!P2P_MODULE(win)->p2p_sc_remote_active_ranks[target])) { - return MPI_ERR_RMA_SYNC; - } - - if (OMPI_WIN_FENCE & ompi_win_get_mode(win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(win, OMPI_WIN_FENCE | OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - - /* shortcut 0 count case */ - if (0 == origin_count || 0 == target_count) { - return OMPI_SUCCESS; - } - - /* create sendreq */ - ret = ompi_osc_pt2pt_sendreq_alloc_init(OMPI_OSC_PT2PT_ACC, - origin_addr, - origin_count, - origin_dt, - target, - target_disp, - target_count, - target_dt, - P2P_MODULE(win), - &sendreq); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (OMPI_SUCCESS != ret) return ret; - - sendreq->req_op_id = op->o_f_to_c_index; - - /* enqueue sendreq */ - ret = enqueue_sendreq(P2P_MODULE(win), sendreq); - - return ret; -} - - -int -ompi_osc_pt2pt_module_get(void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - ompi_win_t *win) -{ - int ret; - ompi_osc_pt2pt_sendreq_t *sendreq; - - if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) && - (!P2P_MODULE(win)->p2p_sc_remote_active_ranks[target])) { - return MPI_ERR_RMA_SYNC; - } - - if (OMPI_WIN_FENCE & ompi_win_get_mode(win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(win, OMPI_WIN_FENCE | OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - - /* shortcut 0 count case */ - if (0 == origin_count || 0 == target_count) { - return OMPI_SUCCESS; - } - - /* create sendreq */ - ret = ompi_osc_pt2pt_sendreq_alloc_init(OMPI_OSC_PT2PT_GET, - origin_addr, - origin_count, - origin_dt, - target, - target_disp, - target_count, - target_dt, - P2P_MODULE(win), - &sendreq); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (OMPI_SUCCESS != ret) return ret; - - /* enqueue sendreq */ - ret = enqueue_sendreq(P2P_MODULE(win), sendreq); - - return ret; -} - - -int -ompi_osc_pt2pt_module_put(void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, ompi_win_t *win) -{ - int ret; - 
ompi_osc_pt2pt_sendreq_t *sendreq; - - if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) && - (!P2P_MODULE(win)->p2p_sc_remote_active_ranks[target])) { - return MPI_ERR_RMA_SYNC; - } - - if (OMPI_WIN_FENCE & ompi_win_get_mode(win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(win, OMPI_WIN_FENCE | OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - - /* shortcut 0 count case */ - if (0 == origin_count || 0 == target_count) { - return OMPI_SUCCESS; - } - - /* create sendreq */ - ret = ompi_osc_pt2pt_sendreq_alloc_init(OMPI_OSC_PT2PT_PUT, - origin_addr, - origin_count, - origin_dt, - target, - target_disp, - target_count, - target_dt, - P2P_MODULE(win), - &sendreq); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (OMPI_SUCCESS != ret) return ret; - - /* enqueue sendreq */ - ret = enqueue_sendreq(P2P_MODULE(win), sendreq); - - return ret; -} diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_component.c b/ompi/mca/osc/pt2pt/osc_pt2pt_component.c deleted file mode 100644 index 32b7e07a85..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_component.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2010 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2006-2008 University of Houston. All rights reserved. - * Copyright (c) 2010 Sandia National Laboratories. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include - -#include "osc_pt2pt.h" -#include "osc_pt2pt_sendreq.h" -#include "osc_pt2pt_replyreq.h" -#include "osc_pt2pt_header.h" -#include "osc_pt2pt_data_move.h" -#include "osc_pt2pt_buffer.h" - -#include "opal/threads/mutex.h" - -#include "ompi/info/info.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/osc/osc.h" -#include "ompi/mca/osc/base/base.h" -#include "ompi/mca/osc/base/osc_base_obj_convert.h" -#include "ompi/mca/pml/pml.h" - -static int component_register(void); -static int component_fragment_cb(ompi_request_t *request); - -ompi_osc_pt2pt_component_t mca_osc_pt2pt_component = { - { /* ompi_osc_base_component_t */ - { /* ompi_base_component_t */ - OMPI_OSC_BASE_VERSION_2_0_0, - "pt2pt", - OMPI_MAJOR_VERSION, /* MCA component major version */ - OMPI_MINOR_VERSION, /* MCA component minor version */ - OMPI_RELEASE_VERSION, /* MCA component release version */ - NULL, - NULL, - NULL, - component_register - }, - { /* mca_base_component_data */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - ompi_osc_pt2pt_component_init, - ompi_osc_pt2pt_component_query, - ompi_osc_pt2pt_component_select, - ompi_osc_pt2pt_component_finalize - } -}; - - -ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_template = { - { - ompi_osc_pt2pt_module_free, - - ompi_osc_pt2pt_module_put, - ompi_osc_pt2pt_module_get, - ompi_osc_pt2pt_module_accumulate, - - ompi_osc_pt2pt_module_fence, - - ompi_osc_pt2pt_module_start, - ompi_osc_pt2pt_module_complete, - ompi_osc_pt2pt_module_post, - ompi_osc_pt2pt_module_wait, - ompi_osc_pt2pt_module_test, - - ompi_osc_pt2pt_module_lock, - ompi_osc_pt2pt_module_unlock, - } -}; - - -static int -component_register(void) -{ - mca_osc_pt2pt_component.p2p_c_eager_size = 16 * 1024; - (void) mca_base_component_var_register(&mca_osc_pt2pt_component.super.osc_version, - "eager_limit", - "Max size of eagerly sent data", - MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_osc_pt2pt_component.p2p_c_eager_size); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_component_init(bool enable_progress_threads, - bool enable_mpi_threads) -{ - size_t aligned_size; - - OBJ_CONSTRUCT(&mca_osc_pt2pt_component.p2p_c_sendreqs, opal_free_list_t); - opal_free_list_init(&mca_osc_pt2pt_component.p2p_c_sendreqs, - sizeof(ompi_osc_pt2pt_sendreq_t), - OBJ_CLASS(ompi_osc_pt2pt_sendreq_t), - 1, -1, 1); - - OBJ_CONSTRUCT(&mca_osc_pt2pt_component.p2p_c_replyreqs, opal_free_list_t); - opal_free_list_init(&mca_osc_pt2pt_component.p2p_c_replyreqs, - sizeof(ompi_osc_pt2pt_replyreq_t), - OBJ_CLASS(ompi_osc_pt2pt_replyreq_t), - 1, -1, 1); - - OBJ_CONSTRUCT(&mca_osc_pt2pt_component.p2p_c_longreqs, opal_free_list_t); - opal_free_list_init(&mca_osc_pt2pt_component.p2p_c_longreqs, - sizeof(ompi_osc_pt2pt_longreq_t), - OBJ_CLASS(ompi_osc_pt2pt_longreq_t), - 1, -1, 1); - - /* adjust size to be multiple of ompi_ptr_t to avoid alignment issues*/ - aligned_size = sizeof(ompi_osc_pt2pt_buffer_t) + - (sizeof(ompi_osc_pt2pt_buffer_t) % sizeof(ompi_ptr_t)) + - mca_osc_pt2pt_component.p2p_c_eager_size; - OBJ_CONSTRUCT(&mca_osc_pt2pt_component.p2p_c_buffers, opal_free_list_t); - opal_free_list_init(&mca_osc_pt2pt_component.p2p_c_buffers, - aligned_size, - OBJ_CLASS(ompi_osc_pt2pt_buffer_t), - 1, -1, 1); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_component_finalize(void) -{ - OBJ_DESTRUCT(&mca_osc_pt2pt_component.p2p_c_buffers); - 
OBJ_DESTRUCT(&mca_osc_pt2pt_component.p2p_c_longreqs); - OBJ_DESTRUCT(&mca_osc_pt2pt_component.p2p_c_replyreqs); - OBJ_DESTRUCT(&mca_osc_pt2pt_component.p2p_c_sendreqs); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_component_query(ompi_win_t *win, - ompi_info_t *info, - ompi_communicator_t *comm) -{ - /* we can always run - return a low priority */ - return 5; -} - - -int -ompi_osc_pt2pt_component_select(ompi_win_t *win, - ompi_info_t *info, - ompi_communicator_t *comm) -{ - ompi_osc_pt2pt_module_t *module = NULL; - int ret, i; - ompi_osc_pt2pt_buffer_t *buffer = NULL; - opal_free_list_item_t *item = NULL; - char *tmp = NULL; - - /* create module structure */ - module = (ompi_osc_pt2pt_module_t*) - calloc(1, sizeof(ompi_osc_pt2pt_module_t)); - if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - - /* fill in the function pointer part */ - memcpy(module, &ompi_osc_pt2pt_module_template, - sizeof(ompi_osc_base_module_t)); - - /* initialize the p2p part */ - OBJ_CONSTRUCT(&(module->p2p_lock), opal_mutex_t); - OBJ_CONSTRUCT(&(module->p2p_cond), opal_condition_t); - OBJ_CONSTRUCT(&(module->p2p_acc_lock), opal_mutex_t); - OBJ_CONSTRUCT(&module->p2p_pending_sendreqs, opal_list_t); - OBJ_CONSTRUCT(&(module->p2p_copy_pending_sendreqs), opal_list_t); - OBJ_CONSTRUCT(&(module->p2p_locks_pending), opal_list_t); - OBJ_CONSTRUCT(&(module->p2p_unlocks_pending), opal_list_t); - - module->p2p_win = win; - - ret = ompi_comm_dup(comm, &(module->p2p_comm)); - if (ret != OMPI_SUCCESS) goto cleanup; - - opal_output_verbose(1, ompi_osc_base_framework.framework_output, - "pt2pt component creating window with id %d", - ompi_comm_get_cid(module->p2p_comm)); - - asprintf(&tmp, "%d", ompi_comm_get_cid(module->p2p_comm)); - ompi_win_set_name(win, tmp); - free(tmp); - - module->p2p_num_pending_sendreqs = (unsigned int*) - malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); - if (NULL == module->p2p_num_pending_sendreqs) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - memset(module->p2p_num_pending_sendreqs, 0, - sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); - - module->p2p_num_pending_out = 0; - module->p2p_num_pending_in = 0; - module->p2p_num_post_msgs = 0; - module->p2p_num_complete_msgs = 0; - module->p2p_tag_counter = 0; - - module->p2p_copy_num_pending_sendreqs = (unsigned int*) - malloc(sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); - if (NULL == module->p2p_copy_num_pending_sendreqs) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - memset(module->p2p_num_pending_sendreqs, 0, - sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); - - /* fence data */ - module->p2p_fence_coll_counts = (int*) - malloc(sizeof(int) * ompi_comm_size(module->p2p_comm)); - if (NULL == module->p2p_fence_coll_counts) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - for (i = 0 ; i < ompi_comm_size(module->p2p_comm) ; ++i) { - module->p2p_fence_coll_counts[i] = 1; - } - - /* pwsc data */ - module->p2p_pw_group = NULL; - module->p2p_sc_group = NULL; - module->p2p_sc_remote_active_ranks = (bool*) - malloc(sizeof(bool) * ompi_comm_size(module->p2p_comm)); - if (NULL == module->p2p_sc_remote_active_ranks) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - module->p2p_sc_remote_ranks = (int*) - malloc(sizeof(int) * ompi_comm_size(module->p2p_comm)); - if (NULL == module->p2p_sc_remote_ranks) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - /* lock data */ - module->p2p_lock_status = 0; - module->p2p_shared_count = 0; 
- module->p2p_lock_received_ack = 0; - - /* fill in window information */ - win->w_osc_module = (ompi_osc_base_module_t*) module; - - /* sync memory - make sure all initialization completed */ - opal_atomic_mb(); - - /* start up receive for protocol headers */ - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, - item, ret); - if (OMPI_SUCCESS != ret) goto cleanup; - buffer = (ompi_osc_pt2pt_buffer_t*) item; - buffer->data = (void*) module; - - ret = ompi_osc_pt2pt_component_irecv(buffer->payload, - mca_osc_pt2pt_component.p2p_c_eager_size, - MPI_BYTE, - MPI_ANY_SOURCE, - CONTROL_MSG_TAG, - module->p2p_comm, - &(buffer->request), - component_fragment_cb, - buffer); - if (OMPI_SUCCESS != ret) goto cleanup; - - return OMPI_SUCCESS; - - cleanup: - OBJ_DESTRUCT(&module->p2p_unlocks_pending); - OBJ_DESTRUCT(&module->p2p_locks_pending); - OBJ_DESTRUCT(&module->p2p_copy_pending_sendreqs); - OBJ_DESTRUCT(&module->p2p_pending_sendreqs); - OBJ_DESTRUCT(&module->p2p_acc_lock); - OBJ_DESTRUCT(&module->p2p_cond); - OBJ_DESTRUCT(&module->p2p_lock); - - if (NULL != buffer) { - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item); - } - if (NULL != module->p2p_sc_remote_ranks) { - free(module->p2p_sc_remote_ranks); - } - if (NULL != module->p2p_sc_remote_active_ranks) { - free(module->p2p_sc_remote_active_ranks); - } - if (NULL != module->p2p_fence_coll_counts) { - free(module->p2p_fence_coll_counts); - } - if (NULL != module->p2p_copy_num_pending_sendreqs) { - free(module->p2p_copy_num_pending_sendreqs); - } - if (NULL != module->p2p_num_pending_sendreqs) { - free(module->p2p_num_pending_sendreqs); - } - if (NULL != module->p2p_comm) ompi_comm_free(&module->p2p_comm); - -#if OPAL_ENABLE_DEBUG - memset(module, 0, sizeof(ompi_osc_base_module_t)); -#endif - if (NULL != module) free(module); - - return ret; -} - - -/* dispatch for callback on message completion */ -static int -component_fragment_cb(ompi_request_t *request) -{ - int ret; - ompi_osc_pt2pt_buffer_t *buffer; - ompi_osc_pt2pt_module_t *module; - - if (request->req_status._cancelled) { - opal_output_verbose(5, ompi_osc_base_framework.framework_output, - "pt2pt request was canceled"); - return OMPI_ERR_NOT_AVAILABLE; - } - - buffer = (ompi_osc_pt2pt_buffer_t*) request->req_complete_cb_data; - module = (ompi_osc_pt2pt_module_t*) buffer->data; - - assert(request->req_status._ucount >= (int) sizeof(ompi_osc_pt2pt_base_header_t)); - - /* handle message */ - switch (((ompi_osc_pt2pt_base_header_t*) buffer->payload)->hdr_type) { - case OMPI_OSC_PT2PT_HDR_PUT: - { - /* get our header and payload */ - ompi_osc_pt2pt_send_header_t *header = - (ompi_osc_pt2pt_send_header_t*) buffer->payload; - void *payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_SEND_HDR_NTOH(*header); - } -#endif - - if (!ompi_win_exposure_epoch(module->p2p_win)) { - if (OMPI_WIN_FENCE & ompi_win_get_mode(module->p2p_win)) { - ompi_win_set_mode(module->p2p_win, - OMPI_WIN_FENCE | - OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - } - - ret = ompi_osc_pt2pt_sendreq_recv_put(module, header, payload); - } - break; - - case OMPI_OSC_PT2PT_HDR_ACC: - { - /* get our header and payload */ - ompi_osc_pt2pt_send_header_t *header = - (ompi_osc_pt2pt_send_header_t*) buffer->payload; - void *payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & 
OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_SEND_HDR_NTOH(*header); - } -#endif - - if (!ompi_win_exposure_epoch(module->p2p_win)) { - if (OMPI_WIN_FENCE & ompi_win_get_mode(module->p2p_win)) { - ompi_win_set_mode(module->p2p_win, - OMPI_WIN_FENCE | - OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - } - - /* receive into temporary buffer */ - ret = ompi_osc_pt2pt_sendreq_recv_accum(module, header, payload); - } - break; - - case OMPI_OSC_PT2PT_HDR_GET: - { - /* get our header and payload */ - ompi_osc_pt2pt_send_header_t *header = - (ompi_osc_pt2pt_send_header_t*) buffer->payload; - void *payload = (void*) (header + 1); - ompi_datatype_t *datatype; - ompi_osc_pt2pt_replyreq_t *replyreq; - ompi_proc_t *proc; - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_SEND_HDR_NTOH(*header); - } -#endif - - if (!ompi_win_exposure_epoch(module->p2p_win)) { - if (OMPI_WIN_FENCE & ompi_win_get_mode(module->p2p_win)) { - ompi_win_set_mode(module->p2p_win, - OMPI_WIN_FENCE | - OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - } - - /* create or get a pointer to our datatype */ - proc = ompi_comm_peer_lookup( module->p2p_comm, header->hdr_origin ); - datatype = ompi_osc_base_datatype_create(proc, &payload); - - if (NULL == datatype) { - opal_output(ompi_osc_base_framework.framework_output, - "Error recreating datatype. Aborting."); - ompi_mpi_abort(module->p2p_comm, 1, false); - } - - /* create replyreq sendreq */ - ret = ompi_osc_pt2pt_replyreq_alloc_init(module, - header->hdr_origin, - header->hdr_origin_sendreq, - header->hdr_target_disp, - header->hdr_target_count, - datatype, - &replyreq); - - /* send replyreq */ - ompi_osc_pt2pt_replyreq_send(module, replyreq); - - /* sendreq does the right retain, so we can release safely */ - OBJ_RELEASE(datatype); - } - break; - - case OMPI_OSC_PT2PT_HDR_REPLY: - { - ompi_osc_pt2pt_reply_header_t *header = - (ompi_osc_pt2pt_reply_header_t*) buffer->payload; - void *payload = (void*) (header + 1); - ompi_osc_pt2pt_sendreq_t *sendreq; - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_REPLY_HDR_NTOH(*header); - } -#endif - - /* get original sendreq pointer */ - sendreq = (ompi_osc_pt2pt_sendreq_t*) header->hdr_origin_sendreq.pval; - module = sendreq->req_module; - - /* receive data */ - ompi_osc_pt2pt_replyreq_recv(module, sendreq, header, payload); - } - break; - - case OMPI_OSC_PT2PT_HDR_POST: - { - int32_t count; - OPAL_THREAD_LOCK(&module->p2p_lock); - count = (module->p2p_num_post_msgs -= 1); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - if (count == 0) opal_condition_broadcast(&module->p2p_cond); - } - break; - - case OMPI_OSC_PT2PT_HDR_COMPLETE: - { - ompi_osc_pt2pt_control_header_t *header = - (ompi_osc_pt2pt_control_header_t*) buffer->payload; - int32_t count; - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_CONTROL_HDR_NTOH(*header); - } -#endif - - /* we've heard from one more place, and have value reqs to - process */ - OPAL_THREAD_LOCK(&module->p2p_lock); - count = (module->p2p_num_complete_msgs -= 1); - count += (module->p2p_num_pending_in += header->hdr_value[0]); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - if (count == 0) opal_condition_broadcast(&module->p2p_cond); - } - break; - - case OMPI_OSC_PT2PT_HDR_LOCK_REQ: - { - 
ompi_osc_pt2pt_control_header_t *header = - (ompi_osc_pt2pt_control_header_t*) buffer->payload; - int32_t count; - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_CONTROL_HDR_NTOH(*header); - } -#endif - - if (header->hdr_value[1] > 0) { - ompi_osc_pt2pt_passive_lock(module, header->hdr_value[0], - header->hdr_value[1]); - } else { - OPAL_THREAD_LOCK(&module->p2p_lock); - count = (module->p2p_lock_received_ack += 1); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - if (count != 0) opal_condition_broadcast(&module->p2p_cond); - } - } - break; - - case OMPI_OSC_PT2PT_HDR_UNLOCK_REQ: - { - ompi_osc_pt2pt_control_header_t *header = - (ompi_osc_pt2pt_control_header_t*) buffer->payload; - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_CONTROL_HDR_NTOH(*header); - } -#endif - - ompi_osc_pt2pt_passive_unlock(module, header->hdr_value[0], - header->hdr_value[1]); - } - break; - - case OMPI_OSC_PT2PT_HDR_UNLOCK_REPLY: - { - int32_t count; - - OPAL_THREAD_LOCK(&module->p2p_lock); - count = (module->p2p_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - if (count == 0) opal_condition_broadcast(&module->p2p_cond); - } - break; - - default: - opal_output_verbose(5, ompi_osc_base_framework.framework_output, - "received one-sided packet for with unknown type"); - } - - ompi_request_free(&request); - ret = ompi_osc_pt2pt_component_irecv(buffer->payload, - mca_osc_pt2pt_component.p2p_c_eager_size, - MPI_BYTE, - MPI_ANY_SOURCE, - CONTROL_MSG_TAG, - module->p2p_comm, - &buffer->request, - component_fragment_cb, - buffer); - - return ret; -} - - -int -ompi_osc_pt2pt_component_irecv(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *cbdata) -{ - int ret; - bool missed_callback; - ompi_request_complete_fn_t tmp; - - ret = MCA_PML_CALL(irecv(buf, count, datatype, - src, tag, comm, request)); - if (OMPI_SUCCESS != ret) return ret; - - /* lock the giant request mutex to update the callback data so - that the PML can't mark the request as complete while we're - updating the callback data, which means we can - deterministically ensure the callback is only fired once and - that we didn't miss it. 
*/ - OPAL_THREAD_LOCK(&ompi_request_lock); - (*request)->req_complete_cb = callback; - (*request)->req_complete_cb_data = cbdata; - missed_callback = (*request)->req_complete; - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - if (missed_callback) { - tmp = (*request)->req_complete_cb; - (*request)->req_complete_cb = NULL; - tmp(*request); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_component_isend(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int dest, - int tag, - struct ompi_communicator_t *comm, - ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *cbdata) -{ - int ret; - bool missed_callback; - ompi_request_complete_fn_t tmp; - - ret = MCA_PML_CALL(isend(buf, count, datatype, - dest, tag, MCA_PML_BASE_SEND_STANDARD, comm, request)); - if (OMPI_SUCCESS != ret) return ret; - - /* lock the giant request mutex to update the callback data so - that the PML can't mark the request as complete while we're - updating the callback data, which means we can - deterministically ensure the callback is only fired once and - that we didn't miss it. */ - OPAL_THREAD_LOCK(&ompi_request_lock); - (*request)->req_complete_cb = callback; - (*request)->req_complete_cb_data = cbdata; - missed_callback = (*request)->req_complete; - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - if (missed_callback) { - tmp = (*request)->req_complete_cb; - (*request)->req_complete_cb = NULL; - tmp(*request); - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c deleted file mode 100644 index e4f1e9e0bf..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c +++ /dev/null @@ -1,1095 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2006 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_pt2pt.h" -#include "osc_pt2pt_sendreq.h" -#include "osc_pt2pt_header.h" -#include "osc_pt2pt_data_move.h" -#include "osc_pt2pt_buffer.h" - -#include "opal/util/arch.h" -#include "opal/util/output.h" -#include "opal/sys/atomic.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/op/op.h" -#include "ompi/mca/osc/base/base.h" -#include "ompi/mca/osc/base/osc_base_obj_convert.h" -#include "ompi/memchecker.h" - - -static inline int32_t -create_send_tag(ompi_osc_pt2pt_module_t *module) -{ -#if OPAL_ENABLE_MULTI_THREADS && OPAL_HAVE_ATOMIC_CMPSET_32 - int32_t newval, oldval; - do { - oldval = module->p2p_tag_counter; - newval = (oldval + 1) % mca_pml.pml_max_tag; - } while (0 == opal_atomic_cmpset_32(&module->p2p_tag_counter, oldval, newval)); - return newval; -#else - int32_t ret; - /* no compare and swap - have to lock the module */ - OPAL_THREAD_LOCK(&module->p2p_lock); - module->p2p_tag_counter = (module->p2p_tag_counter + 1) % mca_pml.pml_max_tag; - ret = module->p2p_tag_counter; - OPAL_THREAD_UNLOCK(&module->p2p_lock); - return ret; -#endif -} - - -static inline void -inmsg_mark_complete(ompi_osc_pt2pt_module_t *module) -{ - int32_t count; - bool need_unlock = false; - - OPAL_THREAD_LOCK(&module->p2p_lock); - count = (module->p2p_num_pending_in -= 1); - if ((0 != module->p2p_lock_status) && - (opal_list_get_size(&module->p2p_unlocks_pending) != 0)) { - need_unlock = true; - } - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - MEMCHECKER( - /* Here we need restore the initial states of memory. */ - opal_memchecker_base_mem_defined( module->p2p_win->w_baseptr, module->p2p_win->w_size); - ); - if (0 == count) { - if (need_unlock) ompi_osc_pt2pt_passive_unlock_complete(module); - opal_condition_broadcast(&module->p2p_cond); - } -} - - -/********************************************************************** - * - * Sending a sendreq to target - * - **********************************************************************/ -static int -ompi_osc_pt2pt_sendreq_send_long_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_longreq_t *longreq = - (ompi_osc_pt2pt_longreq_t*) request->req_complete_cb_data; - ompi_osc_pt2pt_sendreq_t *sendreq = longreq->req_basereq.req_sendreq; - int32_t count; - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d completed long sendreq to %d", - ompi_comm_rank(sendreq->req_module->p2p_comm), - sendreq->req_target_rank)); - - OPAL_THREAD_LOCK(&sendreq->req_module->p2p_lock); - count = (sendreq->req_module->p2p_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&sendreq->req_module->p2p_lock); - - ompi_osc_pt2pt_longreq_free(longreq); - ompi_osc_pt2pt_sendreq_free(sendreq); - - if (0 == count) opal_condition_broadcast(&sendreq->req_module->p2p_cond); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -static int -ompi_osc_pt2pt_sendreq_send_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_buffer_t *buffer = - (ompi_osc_pt2pt_buffer_t*) request->req_complete_cb_data; - ompi_osc_pt2pt_sendreq_t *sendreq = - (ompi_osc_pt2pt_sendreq_t*) buffer->data; - ompi_osc_pt2pt_send_header_t *header = - (ompi_osc_pt2pt_send_header_t*) buffer->payload; - int32_t count; - - /* have to look at header, and not the sendreq because in the case - of get, it's possible that the sendreq has been freed already - (if the remote side replies before we get our send completion - callback) and already allocated to another 
request. We don't - wait for this completion before exiting a synchronization point - in the case of get, as we really don't care when it completes - - only when the data arrives. */ - if (OMPI_OSC_PT2PT_HDR_GET != header->hdr_base.hdr_type) { -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_SEND_HDR_NTOH(*header); - } -#endif - - if (header->hdr_msg_length != 0) { - /* sendreq is done. Mark it as so and get out of here */ - OPAL_THREAD_LOCK(&sendreq->req_module->p2p_lock); - count = (sendreq->req_module->p2p_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&sendreq->req_module->p2p_lock); - ompi_osc_pt2pt_sendreq_free(sendreq); - if (0 == count) opal_condition_broadcast(&sendreq->req_module->p2p_cond); - } - } - - /* release the buffer */ - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, buffer); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -/* create the initial fragment, pack header, datatype, and payload (if - size fits) and send */ -int -ompi_osc_pt2pt_sendreq_send(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t *sendreq) -{ - int ret = OMPI_SUCCESS; - opal_free_list_item_t *item = NULL; - ompi_osc_pt2pt_send_header_t *header = NULL; - ompi_osc_pt2pt_buffer_t *buffer = NULL; - size_t written_data = 0; - size_t needed_len = sizeof(ompi_osc_pt2pt_send_header_t); - const void *packed_ddt; - size_t packed_ddt_len = ompi_datatype_pack_description_length(sendreq->req_target_datatype); - - /* we always need to send the ddt */ - needed_len += packed_ddt_len; - if (OMPI_OSC_PT2PT_GET != sendreq->req_type) { - needed_len += sendreq->req_origin_bytes_packed; - } - - /* verify at least enough space for header */ - if (mca_osc_pt2pt_component.p2p_c_eager_size - < sizeof(ompi_osc_pt2pt_send_header_t) + packed_ddt_len) { - ret = MPI_ERR_TRUNCATE; - goto cleanup; - } - - /* Get a buffer */ - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, - item, ret); - if (NULL == item) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - buffer = (ompi_osc_pt2pt_buffer_t*) item; - - /* setup buffer */ - buffer->data = sendreq; - - /* pack header */ - header = (ompi_osc_pt2pt_send_header_t*) buffer->payload; - written_data += sizeof(ompi_osc_pt2pt_send_header_t); - header->hdr_base.hdr_flags = 0; - header->hdr_origin = ompi_comm_rank(sendreq->req_module->p2p_comm); - header->hdr_origin_sendreq.pval = (void*) sendreq; - header->hdr_origin_tag = 0; - header->hdr_target_disp = sendreq->req_target_disp; - header->hdr_target_count = sendreq->req_target_count; - - switch (sendreq->req_type) { - case OMPI_OSC_PT2PT_PUT: - header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_PUT; -#if OPAL_ENABLE_MEM_DEBUG - header->hdr_target_op = 0; -#endif - break; - - case OMPI_OSC_PT2PT_ACC: - header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_ACC; - header->hdr_target_op = sendreq->req_op_id; - break; - - case OMPI_OSC_PT2PT_GET: - header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_GET; -#if OPAL_ENABLE_MEM_DEBUG - header->hdr_target_op = 0; -#endif - break; - } - - /* Set datatype id and / or pack datatype */ - ret = ompi_datatype_get_pack_description(sendreq->req_target_datatype, &packed_ddt); - if (OMPI_SUCCESS != ret) goto cleanup; - memcpy((unsigned char*) buffer->payload + written_data, - packed_ddt, packed_ddt_len); - written_data += packed_ddt_len; - - if (OMPI_OSC_PT2PT_GET != sendreq->req_type) { - /* if sending data and it fits, pack payload */ - if 
(mca_osc_pt2pt_component.p2p_c_eager_size >= - written_data + sendreq->req_origin_bytes_packed) { - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = sendreq->req_origin_bytes_packed; - - iov.iov_len = max_data; - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &sendreq->req_origin_convertor); - ); - ret = opal_convertor_pack(&sendreq->req_origin_convertor, &iov, &iov_count, - &max_data ); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (ret < 0) { - ret = OMPI_ERR_FATAL; - goto cleanup; - } - - assert(max_data == sendreq->req_origin_bytes_packed); - written_data += max_data; - - header->hdr_msg_length = sendreq->req_origin_bytes_packed; - } else { - header->hdr_msg_length = 0; - header->hdr_origin_tag = create_send_tag(module); - } - } else { - header->hdr_msg_length = 0; - } - - buffer->len = written_data; - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (sendreq->req_target_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; - OMPI_OSC_PT2PT_SEND_HDR_HTON(*header); - } -#endif - - /* send fragment */ - OPAL_OUTPUT_VERBOSE((51, ompi_osc_base_framework.framework_output, - "%d sending sendreq to %d", - ompi_comm_rank(sendreq->req_module->p2p_comm), - sendreq->req_target_rank)); - ret = ompi_osc_pt2pt_component_isend(buffer->payload, - buffer->len, - MPI_BYTE, - sendreq->req_target_rank, - CONTROL_MSG_TAG, - module->p2p_comm, - &buffer->request, - ompi_osc_pt2pt_sendreq_send_cb, - buffer); - - MEMCHECKER( - opal_memchecker_base_mem_defined(buffer->payload, buffer->len); - ); - if (OMPI_OSC_PT2PT_GET != sendreq->req_type && - header->hdr_msg_length == 0) { - ompi_osc_pt2pt_longreq_t *longreq; - ompi_osc_pt2pt_longreq_alloc(&longreq); - longreq->req_basereq.req_sendreq = sendreq; - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d starting long sendreq to %d (%d)", - ompi_comm_rank(sendreq->req_module->p2p_comm), - sendreq->req_target_rank, - header->hdr_origin_tag)); - - ret = ompi_osc_pt2pt_component_isend(sendreq->req_origin_convertor.pBaseBuf, - sendreq->req_origin_convertor.count, - sendreq->req_origin_datatype, - sendreq->req_target_rank, - header->hdr_origin_tag, - sendreq->req_module->p2p_comm, - &(longreq->req_pml_request), - ompi_osc_pt2pt_sendreq_send_long_cb, - longreq); - } - - goto done; - - cleanup: - if (item != NULL) { - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, - item); - } - - done: - return ret; -} - - -/********************************************************************** - * - * Sending a replyreq back to origin - * - **********************************************************************/ -static int -ompi_osc_pt2pt_replyreq_send_long_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_longreq_t *longreq = - (ompi_osc_pt2pt_longreq_t*) request->req_complete_cb_data; - ompi_osc_pt2pt_replyreq_t *replyreq = longreq->req_basereq.req_replyreq; - - inmsg_mark_complete(replyreq->rep_module); - - ompi_osc_pt2pt_longreq_free(longreq); - ompi_osc_pt2pt_replyreq_free(replyreq); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -static int -ompi_osc_pt2pt_replyreq_send_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_buffer_t *buffer = - (ompi_osc_pt2pt_buffer_t*) request->req_complete_cb_data; - 
ompi_osc_pt2pt_replyreq_t *replyreq = - (ompi_osc_pt2pt_replyreq_t*) buffer->data; - ompi_osc_pt2pt_reply_header_t *header = - (ompi_osc_pt2pt_reply_header_t*) buffer->payload; - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_PT2PT_HDR_FLAG_NBO) { - OMPI_OSC_PT2PT_REPLY_HDR_NTOH(*header); - } -#endif - - /* do we need to post a send? */ - if (header->hdr_msg_length != 0) { - /* sendreq is done. Mark it as so and get out of here */ - inmsg_mark_complete(replyreq->rep_module); - ompi_osc_pt2pt_replyreq_free(replyreq); - } - - /* release the descriptor and replyreq */ - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, buffer); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_replyreq_send(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_replyreq_t *replyreq) -{ - int ret = OMPI_SUCCESS; - opal_free_list_item_t *item; - ompi_osc_pt2pt_buffer_t *buffer = NULL; - ompi_osc_pt2pt_reply_header_t *header = NULL; - size_t written_data = 0; - - /* Get a buffer */ - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, - item, ret); - if (NULL == item) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - buffer = (ompi_osc_pt2pt_buffer_t*) item; - - /* verify at least enough space for header */ - if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_reply_header_t)) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - - /* setup buffer */ - buffer->data = replyreq; - - /* pack header */ - header = (ompi_osc_pt2pt_reply_header_t*) buffer->payload; - written_data += sizeof(ompi_osc_pt2pt_reply_header_t); - header->hdr_base.hdr_type = OMPI_OSC_PT2PT_HDR_REPLY; - header->hdr_base.hdr_flags = 0; - header->hdr_origin_sendreq = replyreq->rep_origin_sendreq; - header->hdr_target_tag = 0; - - /* if sending data fits, pack payload */ - if (mca_osc_pt2pt_component.p2p_c_eager_size >= - written_data + replyreq->rep_target_bytes_packed) { - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = replyreq->rep_target_bytes_packed; - - iov.iov_len = max_data; - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) buffer->payload + written_data); - /* - * Before copy to the target buffer, make the target part - * accessable. - */ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &replyreq->rep_target_convertor); - ); - ret = opal_convertor_pack(&replyreq->rep_target_convertor, &iov, &iov_count, - &max_data ); - /* Copy finished, make the target buffer unaccessable. 
*/ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &replyreq->rep_target_convertor); - ); - - if (ret < 0) { - ret = OMPI_ERR_FATAL; - goto cleanup; - } - - assert(max_data == replyreq->rep_target_bytes_packed); - written_data += max_data; - - header->hdr_msg_length = replyreq->rep_target_bytes_packed; - } else { - header->hdr_msg_length = 0; - header->hdr_target_tag = create_send_tag(module); - } - - buffer->len = written_data; - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (replyreq->rep_origin_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; - OMPI_OSC_PT2PT_REPLY_HDR_HTON(*header); - } -#endif - - /* send fragment */ - ret = ompi_osc_pt2pt_component_isend(buffer->payload, - buffer->len, - MPI_BYTE, - replyreq->rep_origin_rank, - CONTROL_MSG_TAG, - module->p2p_comm, - &buffer->request, - ompi_osc_pt2pt_replyreq_send_cb, - buffer); - - /* Need to be fixed. - * The payload is made undefined due to the isend call. - */ - MEMCHECKER( - opal_memchecker_base_mem_defined(buffer->payload, buffer->len); - ); - if (header->hdr_msg_length == 0) { - ompi_osc_pt2pt_longreq_t *longreq; - ompi_osc_pt2pt_longreq_alloc(&longreq); - longreq->req_basereq.req_replyreq = replyreq; - - ret = ompi_osc_pt2pt_component_isend(replyreq->rep_target_convertor.pBaseBuf, - replyreq->rep_target_convertor.count, - replyreq->rep_target_datatype, - replyreq->rep_origin_rank, - header->hdr_target_tag, - module->p2p_comm, - &(longreq->req_pml_request), - ompi_osc_pt2pt_replyreq_send_long_cb, - longreq); - } - goto done; - - cleanup: - if (item != NULL) { - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, - item); - } - - done: - return ret; -} - - -/********************************************************************** - * - * Receive a put on the target side - * - **********************************************************************/ -static int -ompi_osc_pt2pt_sendreq_recv_put_long_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_longreq_t *longreq = - (ompi_osc_pt2pt_longreq_t*) request->req_complete_cb_data; - - OBJ_RELEASE(longreq->req_datatype); - ompi_osc_pt2pt_longreq_free(longreq); - - inmsg_mark_complete(longreq->req_module); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_sendreq_recv_put(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_send_header_t *header, - void *inbuf) -{ - int ret = OMPI_SUCCESS; - void *target = (unsigned char*) module->p2p_win->w_baseptr + - ((unsigned long)header->hdr_target_disp * module->p2p_win->w_disp_unit); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, header->hdr_origin ); - struct ompi_datatype_t *datatype = - ompi_osc_base_datatype_create(proc, &inbuf); - - if (NULL == datatype) { - opal_output(ompi_osc_base_framework.framework_output, - "Error recreating datatype. 
Aborting."); - ompi_mpi_abort(module->p2p_comm, 1, false); - } - - if (header->hdr_msg_length > 0) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - ompi_proc_t *proc; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - /* initialize convertor */ - proc = ompi_comm_peer_lookup(module->p2p_comm, header->hdr_origin); - opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, - &(datatype->super), - header->hdr_target_count, - target, - 0, - &convertor); - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)inbuf; - max_data = iov.iov_len; - /* - * Before copy to the user buffer, make the target part - * accessable. - */ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &convertor); - ); - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data ); - /* Copy finished, make the user buffer unaccessable. */ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &convertor); - ); - OBJ_DESTRUCT(&convertor); - OBJ_RELEASE(datatype); - inmsg_mark_complete(module); - } else { - ompi_osc_pt2pt_longreq_t *longreq; - ompi_osc_pt2pt_longreq_alloc(&longreq); - - longreq->req_datatype = datatype; - longreq->req_module = module; - - ret = ompi_osc_pt2pt_component_irecv(target, - header->hdr_target_count, - datatype, - header->hdr_origin, - header->hdr_origin_tag, - module->p2p_comm, - &(longreq->req_pml_request), - ompi_osc_pt2pt_sendreq_recv_put_long_cb, - longreq); - } - - return ret; -} - - -/********************************************************************** - * - * Receive an accumulate on the target side - * - **********************************************************************/ -static int -ompi_osc_pt2pt_sendreq_recv_accum_long_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_longreq_t *longreq = - (ompi_osc_pt2pt_longreq_t*) request->req_complete_cb_data; - ompi_osc_pt2pt_module_t *module = longreq->req_module; - ompi_osc_pt2pt_send_header_t *header = longreq->req_basereq.req_sendhdr; - void *payload = (void*) (header + 1); - void *target = (unsigned char*) module->p2p_win->w_baseptr + - ((unsigned long)header->hdr_target_disp * module->p2p_win->w_disp_unit); - - /* lock the window for accumulates */ - OPAL_THREAD_LOCK(&longreq->req_module->p2p_acc_lock); - - if (longreq->req_op == &ompi_mpi_op_replace.op) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - /* initialize convertor */ - opal_convertor_copy_and_prepare_for_recv(ompi_proc_local()->proc_convertor, - &(longreq->req_datatype->super), - header->hdr_target_count, - target, - 0, - &convertor); - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*) payload; - max_data = iov.iov_len; - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data); - OBJ_DESTRUCT(&convertor); - } else { - /* - * Before copy to the user buffer, make the target part - * accessable. - */ - MEMCHECKER( - opal_memchecker_base_mem_defined( target, header->hdr_msg_length ); - ); - /* copy the data from the temporary buffer into the user window */ - ompi_osc_base_process_op(target, - payload, - header->hdr_msg_length, - longreq->req_datatype, - header->hdr_target_count, - longreq->req_op); - /* Copy finished, make the user buffer unaccessable. 
*/ - MEMCHECKER( - opal_memchecker_base_mem_noaccess( target, header->hdr_msg_length ); - ); - } - - /* unlock the window for accumulates */ - OPAL_THREAD_UNLOCK(&longreq->req_module->p2p_acc_lock); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d finished receiving long accum message from %d", - ompi_comm_rank(longreq->req_module->p2p_comm), - header->hdr_origin)); - - /* free the temp buffer */ - free(longreq->req_basereq.req_sendhdr); - - /* Release datatype & op */ - OBJ_RELEASE(longreq->req_datatype); - OBJ_RELEASE(longreq->req_op); - - inmsg_mark_complete(longreq->req_module); - - ompi_osc_pt2pt_longreq_free(longreq); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_sendreq_recv_accum(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_send_header_t *header, - void *payload) -{ - int ret = OMPI_SUCCESS; - struct ompi_op_t *op = ompi_osc_base_op_create(header->hdr_target_op); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, header->hdr_origin ); - struct ompi_datatype_t *datatype = - ompi_osc_base_datatype_create(proc, &payload); - void *target = (unsigned char*) module->p2p_win->w_baseptr + - ((unsigned long)header->hdr_target_disp * module->p2p_win->w_disp_unit); - - if (NULL == datatype) { - opal_output(ompi_osc_base_framework.framework_output, - "Error recreating datatype. Aborting."); - ompi_mpi_abort(module->p2p_comm, 1, false); - } - - if (header->hdr_msg_length > 0) { - /* lock the window for accumulates */ - OPAL_THREAD_LOCK(&module->p2p_acc_lock); - - if (op == &ompi_mpi_op_replace.op) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - /* initialize convertor */ - opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, - &(datatype->super), - header->hdr_target_count, - target, - 0, - &convertor); - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)payload; - max_data = iov.iov_len; - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data); - OBJ_DESTRUCT(&convertor); - } else { - void *buffer = NULL; - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch != ompi_proc_local()->proc_arch) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - struct ompi_datatype_t *primitive_datatype = NULL; - uint32_t primitive_count; - size_t buflen; - - ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); - primitive_count *= header->hdr_target_count; - - /* figure out how big a buffer we need */ - ompi_datatype_type_size(primitive_datatype, &buflen); - buflen *= primitive_count; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - buffer = (void*) malloc(buflen); - if (NULL == buffer) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - - /* initialize convertor */ - opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, - &(primitive_datatype->super), - primitive_count, - buffer, - 0, - &convertor); - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)payload; - max_data = iov.iov_len; - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data); - OBJ_DESTRUCT(&convertor); - } else { - buffer = payload; - } -#else - buffer = payload; -#endif - /* - * Before copy to the user buffer, make the target part - * accessable. 
- */ - MEMCHECKER( - opal_memchecker_base_mem_defined( target, header->hdr_msg_length ); - ); - /* copy the data from the temporary buffer into the user window */ - ret = ompi_osc_base_process_op(target, - buffer, - header->hdr_msg_length, - datatype, - header->hdr_target_count, - op); - /* Copy finished, make the user buffer unaccessable. */ - MEMCHECKER( - opal_memchecker_base_mem_noaccess( target, header->hdr_msg_length ); - ); - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch != ompi_proc_local()->proc_arch) { - if (NULL == buffer) free(buffer); - } -#endif - } - - /* unlock the window for accumulates */ - OPAL_THREAD_UNLOCK(&module->p2p_acc_lock); - - /* Release datatype & op */ - OBJ_RELEASE(datatype); - OBJ_RELEASE(op); - - inmsg_mark_complete(module); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d received accum message from %d", - ompi_comm_rank(module->p2p_comm), - header->hdr_origin)); - } else { - ompi_osc_pt2pt_longreq_t *longreq; - size_t buflen; - struct ompi_datatype_t *primitive_datatype = NULL; - uint32_t primitive_count; - - /* get underlying type... */ - ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); - primitive_count *= header->hdr_target_count; - - /* figure out how big a buffer we need */ - ompi_datatype_type_size(primitive_datatype, &buflen); - buflen *= primitive_count; - - /* get a longreq and fill it in */ - ompi_osc_pt2pt_longreq_alloc(&longreq); - - longreq->req_datatype = datatype; - longreq->req_op = op; - longreq->req_module = module; - - /* allocate a buffer to receive into ... */ - longreq->req_basereq.req_sendhdr = (ompi_osc_pt2pt_send_header_t *) malloc(buflen + sizeof(ompi_osc_pt2pt_send_header_t)); - - if (NULL == longreq->req_basereq.req_sendhdr) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - /* fill in tmp header */ - memcpy(longreq->req_basereq.req_sendhdr, header, - sizeof(ompi_osc_pt2pt_send_header_t)); - ((ompi_osc_pt2pt_send_header_t*) longreq->req_basereq.req_sendhdr)->hdr_msg_length = buflen; - - ret = ompi_osc_pt2pt_component_irecv(((char*) longreq->req_basereq.req_sendhdr) + sizeof(ompi_osc_pt2pt_send_header_t), - primitive_count, - primitive_datatype, - header->hdr_origin, - header->hdr_origin_tag, - module->p2p_comm, - &(longreq->req_pml_request), - ompi_osc_pt2pt_sendreq_recv_accum_long_cb, - longreq); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d started long recv accum message from %d (%d)", - ompi_comm_rank(module->p2p_comm), - header->hdr_origin, - header->hdr_origin_tag)); - } - - return ret; -} - - -/********************************************************************** - * - * Recveive a get on the origin side - * - **********************************************************************/ -static int -ompi_osc_pt2pt_replyreq_recv_long_cb(ompi_request_t *request) -{ - ompi_osc_pt2pt_longreq_t *longreq = - (ompi_osc_pt2pt_longreq_t*) request->req_complete_cb_data; - ompi_osc_pt2pt_sendreq_t *sendreq = longreq->req_basereq.req_sendreq; - int32_t count; - - OPAL_THREAD_LOCK(&sendreq->req_module->p2p_lock); - count = (sendreq->req_module->p2p_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&sendreq->req_module->p2p_lock); - - ompi_osc_pt2pt_longreq_free(longreq); - ompi_osc_pt2pt_sendreq_free(sendreq); - - if (0 == count) opal_condition_broadcast(&sendreq->req_module->p2p_cond); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - -int -ompi_osc_pt2pt_replyreq_recv(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t 
*sendreq, - ompi_osc_pt2pt_reply_header_t *header, - void *payload) -{ - int ret = OMPI_SUCCESS; - int32_t count; - - /* receive into user buffer */ - if (header->hdr_msg_length > 0) { - /* short message. woo! */ - - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)payload; - max_data = iov.iov_len; - /* - * Before copy to the target buffer, make the target part - * accessable. - */ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &sendreq->req_origin_convertor); - ); - opal_convertor_unpack(&sendreq->req_origin_convertor, - &iov, - &iov_count, - &max_data ); - /* - * Copy finished, make the target buffer unaccessable.(Or just leave it accessable?) - */ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - - OPAL_THREAD_LOCK(&module->p2p_lock); - count = (sendreq->req_module->p2p_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - ompi_osc_pt2pt_sendreq_free(sendreq); - - if (0 == count) opal_condition_broadcast(&module->p2p_cond); - - } else { - ompi_osc_pt2pt_longreq_t *longreq; - ompi_osc_pt2pt_longreq_alloc(&longreq); - - longreq->req_basereq.req_sendreq = sendreq; - longreq->req_module = module; - - ret = ompi_osc_pt2pt_component_irecv(sendreq->req_origin_convertor.pBaseBuf, - sendreq->req_origin_convertor.count, - sendreq->req_origin_datatype, - sendreq->req_target_rank, - header->hdr_target_tag, - module->p2p_comm, - &(longreq->req_pml_request), - ompi_osc_pt2pt_replyreq_recv_long_cb, - longreq); - } - - return ret; -} - - -/********************************************************************** - * - * Control message communication - * - **********************************************************************/ -static int -ompi_osc_pt2pt_control_send_cb(ompi_request_t *request) -{ - opal_free_list_item_t *item = (opal_free_list_item_t*) request->req_complete_cb_data; - - /* release the descriptor and sendreq */ - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, item); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module, - ompi_proc_t *proc, - uint8_t type, int32_t value0, int32_t value1) -{ - int ret = OMPI_SUCCESS; - opal_free_list_item_t *item; - ompi_osc_pt2pt_buffer_t *buffer = NULL; - ompi_osc_pt2pt_control_header_t *header = NULL; - int rank = -1, i; - - /* find the rank */ - for (i = 0 ; i < ompi_comm_size(module->p2p_comm) ; ++i) { - if (proc == ompi_comm_peer_lookup(module->p2p_comm, i)) { - rank = i; - } - } - - /* Get a buffer */ - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_buffers, - item, ret); - if (NULL == item) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - buffer = (ompi_osc_pt2pt_buffer_t*) item; - - /* verify at least enough space for header */ - if (mca_osc_pt2pt_component.p2p_c_eager_size < sizeof(ompi_osc_pt2pt_control_header_t)) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - - /* setup buffer */ - buffer->data = NULL; - buffer->len = sizeof(ompi_osc_pt2pt_control_header_t); - - /* pack header */ - header = (ompi_osc_pt2pt_control_header_t*) buffer->payload; - header->hdr_base.hdr_type = type; - header->hdr_base.hdr_flags = 0; - header->hdr_value[0] = value0; - header->hdr_value[1] = value1; - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch & 
OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_PT2PT_HDR_FLAG_NBO; - OMPI_OSC_PT2PT_CONTROL_HDR_HTON(*header); - } -#endif - - /* send fragment */ - ret = ompi_osc_pt2pt_component_isend(buffer->payload, - buffer->len, - MPI_BYTE, - rank, - CONTROL_MSG_TAG, - module->p2p_comm, - &buffer->request, - ompi_osc_pt2pt_control_send_cb, - buffer); - goto done; - - cleanup: - if (item != NULL) { - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_buffers, - item); - } - - done: - return ret; -} diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.h b/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.h deleted file mode 100644 index 8732434561..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_data_move.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_MCA_OSC_PT2PT_DATA_MOVE_H -#define OMPI_MCA_OSC_PT2PT_DATA_MOVE_H - -#include "osc_pt2pt_sendreq.h" -#include "osc_pt2pt_replyreq.h" - -/* send a sendreq (the request from the origin for a Put, Get, or - Accumulate, including the payload for Put and Accumulate) */ -int ompi_osc_pt2pt_sendreq_send(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t *sendreq); - -/* send a replyreq (the request from the target of a Get, with the - payload for the origin */ -int ompi_osc_pt2pt_replyreq_send(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_replyreq_t *replyreq); - -/* receive the target side of a sendreq for a put, directly into the user's window */ -int ompi_osc_pt2pt_sendreq_recv_put(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_send_header_t *header, - void *payload); - -/* receive the target side of a sendreq for an accumulate, possibly - using a temproart buffer, then calling the reduction functions */ -int ompi_osc_pt2pt_sendreq_recv_accum(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_send_header_t *header, - void *payload); - -/* receive the origin side of a replyreq (the reply part of an - MPI_Get), directly into the user's window */ -int ompi_osc_pt2pt_replyreq_recv(ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t *sendreq, - ompi_osc_pt2pt_reply_header_t *header, - void *payload); - -int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module, - ompi_proc_t *proc, - uint8_t type, int32_t value0, int32_t value1); - -#endif diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_header.h b/ompi/mca/osc/pt2pt/osc_pt2pt_header.h deleted file mode 100644 index e080333595..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_header.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
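/*
 * The control path above sends a fixed-size header carrying a message type
 * plus two 32-bit values (for example a rank and a count), optionally
 * byte-swapped when the peer is big-endian.  A minimal, self-contained
 * sketch of that packing pattern follows; the struct and helper names are
 * illustrative only, not the component's real definitions.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>   /* htonl */

struct ctrl_hdr {
    uint8_t  type;       /* what the message means (post, complete, lock, ...) */
    uint8_t  flags;      /* e.g. a "values are in network byte order" bit */
    int32_t  value[2];   /* payload pair; its meaning depends on type */
};

/* Pack a control header into a send buffer, swapping when the peer expects
   network byte order.  Returns the number of bytes to hand to the send call. */
static size_t ctrl_pack(void *buf, uint8_t type, int32_t v0, int32_t v1,
                        int peer_is_big_endian)
{
    struct ctrl_hdr hdr = { .type = type, .flags = 0, .value = { v0, v1 } };

    if (peer_is_big_endian) {
        hdr.flags |= 0x01;                          /* mark: values swapped */
        hdr.value[0] = (int32_t) htonl((uint32_t) v0);
        hdr.value[1] = (int32_t) htonl((uint32_t) v1);
    }
    memcpy(buf, &hdr, sizeof(hdr));
    return sizeof(hdr);
}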
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_MCA_OSC_PT2PT_HDR_H -#define OMPI_MCA_OSC_PT2PT_HDR_H - -#ifdef HAVE_NETINET_IN_H -#include -#endif - -#include "opal/types.h" - -#define OMPI_OSC_PT2PT_HDR_PUT 0x0001 -#define OMPI_OSC_PT2PT_HDR_ACC 0x0002 -#define OMPI_OSC_PT2PT_HDR_GET 0x0003 -#define OMPI_OSC_PT2PT_HDR_REPLY 0x0004 -#define OMPI_OSC_PT2PT_HDR_POST 0x0005 -#define OMPI_OSC_PT2PT_HDR_COMPLETE 0x0006 -#define OMPI_OSC_PT2PT_HDR_LOCK_REQ 0x0007 -#define OMPI_OSC_PT2PT_HDR_UNLOCK_REQ 0x0008 -#define OMPI_OSC_PT2PT_HDR_UNLOCK_REPLY 0x0009 - -#define OMPI_OSC_PT2PT_HDR_FLAG_NBO 0x0001 - -struct ompi_osc_pt2pt_base_header_t { - uint8_t hdr_type; - uint8_t hdr_flags; -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - uint8_t padding[2]; -#endif -}; -typedef struct ompi_osc_pt2pt_base_header_t ompi_osc_pt2pt_base_header_t; - -#define OMPI_OSC_PT2PT_BASE_HDR_NTOH(h) -#define OMPI_OSC_PT2PT_BASE_HDR_HTON(h) - -struct ompi_osc_pt2pt_send_header_t { - ompi_osc_pt2pt_base_header_t hdr_base; - - int32_t hdr_origin; - ompi_ptr_t hdr_origin_sendreq; - int32_t hdr_origin_tag; - - uint64_t hdr_target_disp; - int32_t hdr_target_count; - int32_t hdr_target_op; - - int32_t hdr_msg_length; /* 0 if payload is not included */ -}; -typedef struct ompi_osc_pt2pt_send_header_t ompi_osc_pt2pt_send_header_t; - -#define OMPI_OSC_PT2PT_SEND_HDR_HTON(hdr) \ - do { \ - OMPI_OSC_PT2PT_BASE_HDR_HTON((hdr).hdr_base) \ - (hdr).hdr_origin = htonl((hdr).hdr_origin); \ - (hdr).hdr_origin_tag = htonl((hdr).hdr_origin_tag); \ - (hdr).hdr_target_disp = hton64((hdr).hdr_target_disp); \ - (hdr).hdr_target_count = htonl((hdr).hdr_target_count); \ - (hdr).hdr_target_op = htonl((hdr).hdr_target_op); \ - (hdr).hdr_msg_length = htonl((hdr).hdr_msg_length); \ - } while (0) - -#define OMPI_OSC_PT2PT_SEND_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_PT2PT_BASE_HDR_NTOH((hdr).hdr_base) \ - (hdr).hdr_origin = ntohl((hdr).hdr_origin); \ - (hdr).hdr_origin_tag = ntohl((hdr).hdr_origin_tag); \ - (hdr).hdr_target_disp = ntoh64((hdr).hdr_target_disp); \ - (hdr).hdr_target_count = ntohl((hdr).hdr_target_count); \ - (hdr).hdr_target_op = ntohl((hdr).hdr_target_op); \ - (hdr).hdr_msg_length = ntohl((hdr).hdr_msg_length); \ - } while (0) - - -struct ompi_osc_pt2pt_reply_header_t { - ompi_osc_pt2pt_base_header_t hdr_base; - int32_t hdr_target_tag; - ompi_ptr_t hdr_origin_sendreq; - int32_t hdr_msg_length; -}; -typedef struct ompi_osc_pt2pt_reply_header_t ompi_osc_pt2pt_reply_header_t; - -#define OMPI_OSC_PT2PT_REPLY_HDR_HTON(hdr) \ - do { \ - OMPI_OSC_PT2PT_BASE_HDR_HTON((hdr).hdr_base) \ - (hdr).hdr_target_tag = htonl((hdr).hdr_target_tag); \ - (hdr).hdr_msg_length = htonl((hdr).hdr_msg_length); \ - } while (0) - -#define OMPI_OSC_PT2PT_REPLY_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_PT2PT_BASE_HDR_NTOH((hdr).hdr_base) \ - (hdr).hdr_target_tag = ntohl((hdr).hdr_target_tag); \ - (hdr).hdr_msg_length = ntohl((hdr).hdr_msg_length); \ - } while (0) - - -struct ompi_osc_pt2pt_control_header_t { - ompi_osc_pt2pt_base_header_t hdr_base; - int32_t hdr_value[2]; -}; -typedef struct ompi_osc_pt2pt_control_header_t ompi_osc_pt2pt_control_header_t; - -#define OMPI_OSC_PT2PT_CONTROL_HDR_HTON(hdr) \ - do { \ - OMPI_OSC_PT2PT_BASE_HDR_HTON((hdr).hdr_base) \ - (hdr).hdr_value[0] = htonl((hdr).hdr_value[0]); \ - (hdr).hdr_value[1] = htonl((hdr).hdr_value[1]); \ - } while (0) - -#define OMPI_OSC_PT2PT_CONTROL_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_PT2PT_BASE_HDR_NTOH((hdr).hdr_base) \ - (hdr).hdr_value[0] = ntohl((hdr).hdr_value[0]); \ 
- (hdr).hdr_value[1] = ntohl((hdr).hdr_value[1]); \ - } while (0) - -#endif /* OMPI_MCA_OSC_PT2PT_HDR_H */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_longreq.c b/ompi/mca/osc/pt2pt/osc_pt2pt_longreq.c deleted file mode 100644 index 9071318311..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_longreq.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_pt2pt_longreq.h" - -OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_longreq_t, opal_free_list_item_t, - NULL, NULL); - diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_longreq.h b/ompi/mca/osc/pt2pt/osc_pt2pt_longreq.h deleted file mode 100644 index b94c8da95e..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_longreq.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OSC_PT2PT_LONGREQ_H -#define OSC_PT2PT_LONGREQ_H - -#include "opal/class/opal_free_list.h" - -#include "osc_pt2pt.h" - -struct ompi_osc_pt2pt_longreq_t { - opal_free_list_item_t super; - - struct ompi_request_t *req_pml_request; /* PML request */ - - union { - struct ompi_osc_pt2pt_sendreq_t *req_sendreq; - struct ompi_osc_pt2pt_replyreq_t *req_replyreq; - struct ompi_osc_pt2pt_send_header_t *req_sendhdr; - } req_basereq; - - /* This may not always be filled in... */ - struct ompi_osc_pt2pt_module_t *req_module; - struct ompi_op_t *req_op; - struct ompi_datatype_t *req_datatype; -}; -typedef struct ompi_osc_pt2pt_longreq_t ompi_osc_pt2pt_longreq_t; -OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_longreq_t); - -static inline int -ompi_osc_pt2pt_longreq_alloc(ompi_osc_pt2pt_longreq_t **longreq) -{ - opal_free_list_item_t *item; - int ret; - - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_longreqs, - item, ret); - - *longreq = (ompi_osc_pt2pt_longreq_t*) item; - return ret; -} - -static inline int -ompi_osc_pt2pt_longreq_free(ompi_osc_pt2pt_longreq_t *longreq) -{ - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_longreqs, - &longreq->super); - return OMPI_SUCCESS; -} - -#endif diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.c b/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.c deleted file mode 100644 index 04829bb9db..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
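/*
 * The header file removed just above defined fixed-layout wire headers and
 * HTON/NTOH macro pairs that swap every multi-byte field, including a
 * 64-bit target displacement.  Since there is no standard htonll, a
 * portable 64-bit swap is usually composed from two 32-bit swaps; the
 * sketch below shows one such composition (the name is illustrative and
 * is not one of the macros that lived in this file).
 */
#include <stdint.h>
#include <arpa/inet.h>

static inline uint64_t swap64_to_net(uint64_t v)
{
#if defined(WORDS_BIGENDIAN)
    return v;                                        /* already network order */
#else
    uint32_t hi = htonl((uint32_t) (v >> 32));       /* high word goes first on the wire */
    uint32_t lo = htonl((uint32_t) (v & 0xffffffffu));
    return ((uint64_t) lo << 32) | hi;               /* reassemble as a byte-reversed value */
#endif
}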
- * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_pt2pt_replyreq.h" - -#include "opal/class/opal_list.h" -#include "opal/datatype/opal_convertor.h" - -int -ompi_osc_pt2pt_replyreq_alloc_init(ompi_osc_pt2pt_module_t *module, - int origin, - ompi_ptr_t origin_request, - OPAL_PTRDIFF_TYPE target_displacement, - int target_count, - struct ompi_datatype_t *datatype, - ompi_osc_pt2pt_replyreq_t **replyreq) -{ - int ret; - void *target_addr = (unsigned char*) module->p2p_win->w_baseptr + - (target_displacement * module->p2p_win->w_disp_unit); - - - /* allocate a replyreq */ - ret = ompi_osc_pt2pt_replyreq_alloc(module, - origin, - replyreq); - if (OMPI_SUCCESS != ret) return ret; - - /* initialize local side of replyreq */ - ret = ompi_osc_pt2pt_replyreq_init_target(*replyreq, - target_addr, - target_count, - datatype); - if (OMPI_SUCCESS != ret) { - ompi_osc_pt2pt_replyreq_free(*replyreq); - return ret; - } - - /* initialize remote side of replyreq */ - ret = ompi_osc_pt2pt_replyreq_init_origin(*replyreq, - origin_request); - if (OMPI_SUCCESS != ret) { - ompi_osc_pt2pt_replyreq_free(*replyreq); - return ret; - } - - return OMPI_SUCCESS; -} - - -static void ompi_osc_pt2pt_replyreq_construct(ompi_osc_pt2pt_replyreq_t *replyreq) -{ - OBJ_CONSTRUCT(&(replyreq->rep_target_convertor), opal_convertor_t); -} - -static void ompi_osc_pt2pt_replyreq_destruct(ompi_osc_pt2pt_replyreq_t *replyreq) -{ - OBJ_DESTRUCT(&(replyreq->rep_target_convertor)); -} - - -OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_replyreq_t, opal_list_item_t, - ompi_osc_pt2pt_replyreq_construct, - ompi_osc_pt2pt_replyreq_destruct); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h b/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h deleted file mode 100644 index f45070e99a..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_replyreq.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_OSC_PT2PT_REPLYREQ_H -#define OMPI_OSC_PT2PT_REPLYREQ_H - -#include "osc_pt2pt.h" -#include "osc_pt2pt_longreq.h" - -#include "opal/types.h" -#include "opal/class/opal_list.h" -#include "ompi/datatype/ompi_datatype.h" -#include "opal/datatype/opal_convertor.h" -#include "ompi/communicator/communicator.h" -#include "ompi/proc/proc.h" -#include "ompi/memchecker.h" - - -struct ompi_osc_pt2pt_replyreq_t { - opal_list_item_t super; - - /** pointer to the module that created the replyreq */ - ompi_osc_pt2pt_module_t *rep_module; - - /** Datatype for the target side of the operation */ - struct ompi_datatype_t *rep_target_datatype; - /** Convertor for the target. Always setup for send. 
*/ - opal_convertor_t rep_target_convertor; - /** packed size of message on the target side */ - size_t rep_target_bytes_packed; - - /** rank in module's communicator for origin of operation */ - int rep_origin_rank; - /** pointer to the proc structure for the origin of the operation */ - ompi_proc_t *rep_origin_proc; - - ompi_ptr_t rep_origin_sendreq; -}; -typedef struct ompi_osc_pt2pt_replyreq_t ompi_osc_pt2pt_replyreq_t; -OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_replyreq_t); - - -/** allocate and populate a replyreq structure. datatype is - RETAINed for the life of the replyreq */ -int -ompi_osc_pt2pt_replyreq_alloc_init(ompi_osc_pt2pt_module_t *module, - int origin, - ompi_ptr_t origin_request, - OPAL_PTRDIFF_TYPE target_displacement, - int target_count, - struct ompi_datatype_t *datatype, - ompi_osc_pt2pt_replyreq_t **replyreq); - - -static inline int -ompi_osc_pt2pt_replyreq_alloc(ompi_osc_pt2pt_module_t *module, - int origin_rank, - ompi_osc_pt2pt_replyreq_t **replyreq) -{ - int ret; - opal_free_list_item_t *item; - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, origin_rank ); - - /* BWB - FIX ME - is this really the right return code? */ - if (NULL == proc) return OMPI_ERR_OUT_OF_RESOURCE; - - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_replyreqs, - item, ret); - if (OMPI_SUCCESS != ret) return ret; - *replyreq = (ompi_osc_pt2pt_replyreq_t*) item; - - (*replyreq)->rep_module = module; - (*replyreq)->rep_origin_rank = origin_rank; - (*replyreq)->rep_origin_proc = proc; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_pt2pt_replyreq_init_target(ompi_osc_pt2pt_replyreq_t *replyreq, - void *target_addr, - int target_count, - struct ompi_datatype_t *target_dt) -{ - OBJ_RETAIN(target_dt); - replyreq->rep_target_datatype = target_dt; - - opal_convertor_copy_and_prepare_for_send(replyreq->rep_origin_proc->proc_convertor, - &(target_dt->super), - target_count, - target_addr, - 0, - &(replyreq->rep_target_convertor)); - opal_convertor_get_packed_size(&replyreq->rep_target_convertor, - &replyreq->rep_target_bytes_packed); - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_pt2pt_replyreq_init_origin(ompi_osc_pt2pt_replyreq_t *replyreq, - ompi_ptr_t origin_request) -{ - replyreq->rep_origin_sendreq = origin_request; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_pt2pt_replyreq_free(ompi_osc_pt2pt_replyreq_t *replyreq) -{ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &replyreq->rep_target_convertor); - ); - opal_convertor_cleanup(&replyreq->rep_target_convertor); - - OBJ_RELEASE(replyreq->rep_target_datatype); - - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_replyreqs, - (opal_list_item_t*) replyreq); - - return OMPI_SUCCESS; -} - -#endif /* OMPI_OSC_PT2PT_REPLYREQ_H */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.c deleted file mode 100644 index 59b41ff546..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
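/*
 * Throughout the removed code the target address of a Put, Get, or
 * Accumulate is computed as  base + displacement * disp_unit,  where
 * disp_unit is the unit the target declared when the window was created.
 * A small worked sketch of that arithmetic (illustrative helper, values
 * in the comment are just an example):
 */
#include <stddef.h>
#include <stdint.h>

static void *rma_target_addr(void *win_base, size_t disp_unit, uint64_t target_disp)
{
    /* e.g. with disp_unit == sizeof(double) and target_disp == 10, this
       addresses the 10th double in the exposed window, 80 bytes past base. */
    return (unsigned char *) win_base + (size_t) target_disp * disp_unit;
}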
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_pt2pt_sendreq.h" - -#include "opal/datatype/opal_convertor.h" - - -int -ompi_osc_pt2pt_sendreq_alloc_init(ompi_osc_pt2pt_req_type_t req_type, - void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t **sendreq) -{ - int ret; - - /* allocate a sendreq */ - ret = ompi_osc_pt2pt_sendreq_alloc(module, target, - sendreq); - if (OMPI_SUCCESS != ret) return ret; - - /* initialize local side of sendreq */ - ret = ompi_osc_pt2pt_sendreq_init_origin(*sendreq, - req_type, - origin_addr, - origin_count, - origin_dt); - if (OMPI_SUCCESS != ret) { - ompi_osc_pt2pt_sendreq_free(*sendreq); - return ret; - } - - /* initialize remote side of sendreq */ - ret = ompi_osc_pt2pt_sendreq_init_target(*sendreq, - target_disp, - target_count, - target_dt); - if (OMPI_SUCCESS != ret) { - ompi_osc_pt2pt_sendreq_free(*sendreq); - return ret; - } - - return OMPI_SUCCESS; -} - - -static void ompi_osc_pt2pt_sendreq_construct(ompi_osc_pt2pt_sendreq_t *req) -{ - req->super.req_type = OMPI_REQUEST_WIN; - req->super.req_free = NULL; - req->super.req_cancel = NULL; - OBJ_CONSTRUCT(&(req->req_origin_convertor), opal_convertor_t); -} - - -static void ompi_osc_pt2pt_sendreq_destruct(ompi_osc_pt2pt_sendreq_t *req) -{ - OBJ_DESTRUCT(&(req->req_origin_convertor)); -} - - -OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_sendreq_t, ompi_request_t, - ompi_osc_pt2pt_sendreq_construct, - ompi_osc_pt2pt_sendreq_destruct); diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h b/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h deleted file mode 100644 index 104e54471a..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sendreq.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_OSC_PT2PT_SENDREQ_H -#define OMPI_OSC_PT2PT_SENDREQ_H - -#include "osc_pt2pt.h" -#include "osc_pt2pt_longreq.h" - -#include "opal/class/opal_list.h" -#include "ompi/datatype/ompi_datatype.h" -#include "opal/datatype/opal_convertor.h" -#include "ompi/communicator/communicator.h" -#include "ompi/proc/proc.h" -#include "ompi/memchecker.h" - -typedef enum { - OMPI_OSC_PT2PT_GET, - OMPI_OSC_PT2PT_ACC, - OMPI_OSC_PT2PT_PUT -} ompi_osc_pt2pt_req_type_t; - - -struct ompi_osc_pt2pt_sendreq_t { - ompi_request_t super; - - /** type of sendreq (from ompi_osc_pt2pt_req_type_t) */ - ompi_osc_pt2pt_req_type_t req_type; - /** pointer to the module that created the sendreq */ - ompi_osc_pt2pt_module_t *req_module; - - /** Datatype for the origin side of the operation */ - struct ompi_datatype_t *req_origin_datatype; - /** Convertor for the origin side of the operation. 
Setup for - either send (Put / Accumulate) or receive (Get) */ - opal_convertor_t req_origin_convertor; - /** packed size of message on the origin side */ - size_t req_origin_bytes_packed; - - /** rank in module's communicator for target of operation */ - int req_target_rank; - /** pointer to the proc structure for the target of the operation */ - ompi_proc_t *req_target_proc; - - /** displacement on target */ - OPAL_PTRDIFF_TYPE req_target_disp; - /** datatype count on target */ - int req_target_count; - /** datatype on target */ - struct ompi_datatype_t *req_target_datatype; - - /** op index on the target */ - int req_op_id; -}; -typedef struct ompi_osc_pt2pt_sendreq_t ompi_osc_pt2pt_sendreq_t; -OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_sendreq_t); - - -/** allocate and populate a sendreq structure. Both datatypes are - RETAINed for the life of the sendreq */ -int -ompi_osc_pt2pt_sendreq_alloc_init(ompi_osc_pt2pt_req_type_t req_type, - void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_datatype, - ompi_osc_pt2pt_module_t *module, - ompi_osc_pt2pt_sendreq_t **sendreq); - -static inline int -ompi_osc_pt2pt_sendreq_alloc(ompi_osc_pt2pt_module_t *module, - int target_rank, - ompi_osc_pt2pt_sendreq_t **sendreq) -{ - int ret; - opal_free_list_item_t *item; - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, target_rank ); - - /* BWB - FIX ME - is this really the right return code? */ - if (NULL == proc) return OMPI_ERR_OUT_OF_RESOURCE; - - OPAL_FREE_LIST_GET(&mca_osc_pt2pt_component.p2p_c_sendreqs, - item, ret); - if (OMPI_SUCCESS != ret) return ret; - *sendreq = (ompi_osc_pt2pt_sendreq_t*) item; - - (*sendreq)->req_module = module; - (*sendreq)->req_target_rank = target_rank; - (*sendreq)->req_target_proc = proc; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_pt2pt_sendreq_init_origin(ompi_osc_pt2pt_sendreq_t *sendreq, - ompi_osc_pt2pt_req_type_t req_type, - void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt) -{ - OBJ_RETAIN(origin_dt); - sendreq->req_origin_datatype = origin_dt; - sendreq->req_type = req_type; - - if (req_type != OMPI_OSC_PT2PT_GET) { - opal_convertor_copy_and_prepare_for_send(sendreq->req_target_proc->proc_convertor, - &(origin_dt->super), - origin_count, - origin_addr, - 0, - &(sendreq->req_origin_convertor)); - opal_convertor_get_packed_size(&sendreq->req_origin_convertor, - &sendreq->req_origin_bytes_packed); - } else { - opal_convertor_copy_and_prepare_for_recv(sendreq->req_target_proc->proc_convertor, - &(origin_dt->super), - origin_count, - origin_addr, - 0, - &(sendreq->req_origin_convertor)); - opal_convertor_get_packed_size(&sendreq->req_origin_convertor, - &sendreq->req_origin_bytes_packed); - } - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_pt2pt_sendreq_init_target(ompi_osc_pt2pt_sendreq_t *sendreq, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_datatype) -{ - OBJ_RETAIN(target_datatype); - - sendreq->req_target_disp = target_disp; - sendreq->req_target_count = target_count; - sendreq->req_target_datatype = target_datatype; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_pt2pt_sendreq_free(ompi_osc_pt2pt_sendreq_t *sendreq) -{ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &sendreq->req_origin_convertor); - ); - opal_convertor_cleanup(&sendreq->req_origin_convertor); - - 
OBJ_RELEASE(sendreq->req_target_datatype); - OBJ_RELEASE(sendreq->req_origin_datatype); - - OPAL_FREE_LIST_RETURN(&mca_osc_pt2pt_component.p2p_c_sendreqs, - (opal_list_item_t*) sendreq); - - return OMPI_SUCCESS; -} - -#endif /* OMPI_OSC_PT2PT_SENDREQ_H */ diff --git a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c b/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c deleted file mode 100644 index ea806d7131..0000000000 --- a/ompi/mca/osc/pt2pt/osc_pt2pt_sync.c +++ /dev/null @@ -1,698 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_pt2pt.h" -#include "osc_pt2pt_sendreq.h" -#include "osc_pt2pt_header.h" -#include "osc_pt2pt_data_move.h" - -#include "mpi.h" -#include "opal/runtime/opal_progress.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/osc/base/base.h" - - -/* Must hold module's lock before calling... */ -static inline void -ompi_osc_pt2pt_flip_sendreqs(ompi_osc_pt2pt_module_t *module) -{ - unsigned int *tmp; - - tmp = module->p2p_copy_num_pending_sendreqs; - module->p2p_copy_num_pending_sendreqs = - module->p2p_num_pending_sendreqs; - module->p2p_num_pending_sendreqs = tmp; - memset(module->p2p_num_pending_sendreqs, 0, - sizeof(unsigned int) * ompi_comm_size(module->p2p_comm)); - - /* Copy in all the pending requests */ - opal_list_join(&module->p2p_copy_pending_sendreqs, - opal_list_get_end(&module->p2p_copy_pending_sendreqs), - &module->p2p_pending_sendreqs); -} - - -int -ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win) -{ - unsigned int incoming_reqs; - int ret = OMPI_SUCCESS, i; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - int num_outgoing = 0; - - if (0 != (assert & MPI_MODE_NOPRECEDE)) { - /* check that the user didn't lie to us - since NOPRECEDED - must be specified by all processes if it is specified by - any process, if we see this it is safe to assume that there - are no pending operations anywhere needed to close out this - epoch. No need to lock, since it's a lookup and any - pending modification of the pending_sendreqs during this - time is an erroneous program. */ - if (0 != opal_list_get_size(&(module->p2p_pending_sendreqs))) { - return MPI_ERR_RMA_SYNC; - } - - } else { - opal_list_item_t *item; - - /* "atomically" copy all the data we're going to be modifying - into the copy... */ - OPAL_THREAD_LOCK(&(module->p2p_lock)); - ompi_osc_pt2pt_flip_sendreqs(module); - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - num_outgoing = opal_list_get_size(&(module->p2p_copy_pending_sendreqs)); - - /* find out how much data everyone is going to send us. */ - ret = module->p2p_comm-> - c_coll.coll_reduce_scatter(module->p2p_copy_num_pending_sendreqs, - &incoming_reqs, - module->p2p_fence_coll_counts, - MPI_UNSIGNED, - MPI_SUM, - module->p2p_comm, - module->p2p_comm->c_coll.coll_reduce_scatter_module); - - if (OMPI_SUCCESS != ret) { - /* put the stupid data back for the user. This is not - cheap, but the user lost his data if we don't. 
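/*
 * The fence code here learns how many incoming requests to wait for by
 * reduce-scattering a per-peer count array: each process contributes
 * "how many requests I will send to peer i", and gets back the sum of
 * everyone's entry for itself.  A minimal MPI-level sketch of the same
 * idea, as a standalone example rather than the module's internal call:
 */
#include <mpi.h>
#include <stdlib.h>

static unsigned int count_incoming(const unsigned int *sendcounts_per_peer,
                                   MPI_Comm comm)
{
    int size, i;
    unsigned int incoming = 0;

    MPI_Comm_size(comm, &size);

    /* every rank receives exactly one reduced element back */
    int *recvcounts = malloc(size * sizeof(int));
    for (i = 0; i < size; ++i) recvcounts[i] = 1;

    MPI_Reduce_scatter(sendcounts_per_peer, &incoming, recvcounts,
                       MPI_UNSIGNED, MPI_SUM, comm);

    free(recvcounts);
    return incoming;   /* messages that will arrive at this rank during the epoch */
}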
*/ - OPAL_THREAD_LOCK(&(module->p2p_lock)); - opal_list_join(&module->p2p_pending_sendreqs, - opal_list_get_end(&module->p2p_pending_sendreqs), - &module->p2p_copy_pending_sendreqs); - - for (i = 0 ; i < ompi_comm_size(module->p2p_comm) ; ++i) { - module->p2p_num_pending_sendreqs[i] += - module->p2p_copy_num_pending_sendreqs[i]; - } - - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - return ret; - } - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "fence: waiting on %d in and %d out", - module->p2p_num_pending_in, - module->p2p_num_pending_out)); - - /* try to start all the requests. We've copied everything we - need out of pending_sendreqs, so don't need the lock - here */ - while (NULL != - (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { - ompi_osc_pt2pt_sendreq_t *req = - (ompi_osc_pt2pt_sendreq_t*) item; - - ret = ompi_osc_pt2pt_sendreq_send(module, req); - - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - opal_output_verbose(5, ompi_osc_base_framework.framework_output, - "complete: failure in starting sendreq (%d). Will try later.", - ret); - opal_list_append(&(module->p2p_copy_pending_sendreqs), item); - } else if (OMPI_SUCCESS != ret) { - return ret; - } - } - - OPAL_THREAD_LOCK(&module->p2p_lock); - /* possible we've already received a couple in messages, so - add however many we're going to wait for */ - module->p2p_num_pending_in += incoming_reqs; - module->p2p_num_pending_out += num_outgoing; - - /* now we know how many things we're waiting for - wait for them... */ - while (module->p2p_num_pending_in > 0 || - 0 != module->p2p_num_pending_out) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - OPAL_THREAD_UNLOCK(&module->p2p_lock); - } - - /* all transfers are done - back to the real world we go */ - if (0 == (assert & MPI_MODE_NOSUCCEED)) { - ompi_win_set_mode(win, OMPI_WIN_FENCE); - } else { - ompi_win_set_mode(win, 0); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_module_start(ompi_group_t *group, - int assert, - ompi_win_t *win) -{ - int i, ret = OMPI_SUCCESS; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - - OBJ_RETAIN(group); - ompi_group_increment_proc_count(group); - - OPAL_THREAD_LOCK(&(module->p2p_lock)); - if (NULL != module->p2p_sc_group) { - OPAL_THREAD_UNLOCK(&module->p2p_lock); - ret = MPI_ERR_RMA_SYNC; - goto cleanup; - } - module->p2p_sc_group = group; - - /* possible we've already received a couple in messages, so - add however many we're going to wait for */ - module->p2p_num_post_msgs += ompi_group_size(module->p2p_sc_group); - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - memset(module->p2p_sc_remote_active_ranks, 0, - sizeof(bool) * ompi_comm_size(module->p2p_comm)); - - /* for each process in the specified group, find it's rank in our - communicator, store those indexes, and set the true / false in - the active ranks table */ - for (i = 0 ; i < ompi_group_size(group) ; i++) { - int comm_rank = -1, j; - - /* find the rank in the communicator associated with this windows */ - for (j = 0 ; j < ompi_comm_size(module->p2p_comm) ; ++j) { - if (ompi_group_peer_lookup(module->p2p_sc_group, i) == - ompi_comm_peer_lookup(module->p2p_comm, j)) { - comm_rank = j; - break; - } - } - if (comm_rank == -1) { - ret = MPI_ERR_RMA_SYNC; - goto cleanup; - } - - module->p2p_sc_remote_active_ranks[comm_rank] = true; - module->p2p_sc_remote_ranks[i] = comm_rank; - } - - /* Set our mode to access w/ start */ - ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | 
OMPI_WIN_STARTED); - - return OMPI_SUCCESS; - - cleanup: - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - return ret; -} - - -int -ompi_osc_pt2pt_module_complete(ompi_win_t *win) -{ - int i; - int ret = OMPI_SUCCESS; - ompi_group_t *group; - opal_list_item_t *item; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - - /* wait for all the post messages */ - OPAL_THREAD_LOCK(&module->p2p_lock); - while (0 != module->p2p_num_post_msgs) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - - ompi_osc_pt2pt_flip_sendreqs(module); - - /* for each process in group, send a control message with number - of updates coming, then start all the requests */ - for (i = 0 ; i < ompi_group_size(module->p2p_sc_group) ; ++i) { - int comm_rank = module->p2p_sc_remote_ranks[i]; - - module->p2p_num_pending_out += - module->p2p_copy_num_pending_sendreqs[comm_rank]; - } - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - for (i = 0 ; i < ompi_group_size(module->p2p_sc_group) ; ++i) { - int comm_rank = module->p2p_sc_remote_ranks[i]; - ret = ompi_osc_pt2pt_control_send(module, - ompi_group_peer_lookup(module->p2p_sc_group, i), - OMPI_OSC_PT2PT_HDR_COMPLETE, - module->p2p_copy_num_pending_sendreqs[comm_rank], - 0); - assert(ret == OMPI_SUCCESS); - } - - /* try to start all the requests. We've copied everything we - need out of pending_sendreqs, so don't need the lock - here */ - while (NULL != - (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { - ompi_osc_pt2pt_sendreq_t *req = - (ompi_osc_pt2pt_sendreq_t*) item; - - ret = ompi_osc_pt2pt_sendreq_send(module, req); - - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - opal_output_verbose(5, ompi_osc_base_framework.framework_output, - "complete: failure in starting sendreq (%d). Will try later.", - ret); - opal_list_append(&(module->p2p_copy_pending_sendreqs), item); - } else if (OMPI_SUCCESS != ret) { - return ret; - } - } - - /* wait for all the requests */ - OPAL_THREAD_LOCK(&module->p2p_lock); - while (0 != module->p2p_num_pending_out) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - - group = module->p2p_sc_group; - module->p2p_sc_group = NULL; - - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - /* remove WIN_POSTED from our mode */ - ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_STARTED); - - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - - return ret; -} - - -int -ompi_osc_pt2pt_module_post(ompi_group_t *group, - int assert, - ompi_win_t *win) -{ - int i; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - - OBJ_RETAIN(group); - ompi_group_increment_proc_count(group); - - OPAL_THREAD_LOCK(&(module->p2p_lock)); - assert(NULL == module->p2p_pw_group); - module->p2p_pw_group = group; - - /* Set our mode to expose w/ post */ - ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_append_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); - - /* list how many complete counters we're still waiting on */ - module->p2p_num_complete_msgs += - ompi_group_size(module->p2p_pw_group); - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - /* send a hello counter to everyone in group */ - for (i = 0 ; i < ompi_group_size(module->p2p_pw_group) ; ++i) { - ompi_osc_pt2pt_control_send(module, - ompi_group_peer_lookup(group, i), - OMPI_OSC_PT2PT_HDR_POST, 1, 0); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_module_wait(ompi_win_t *win) -{ - ompi_group_t *group; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - - OPAL_THREAD_LOCK(&module->p2p_lock); - while (0 != 
(module->p2p_num_pending_in) || - 0 != (module->p2p_num_complete_msgs)) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - - group = module->p2p_pw_group; - module->p2p_pw_group = NULL; - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); - - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_module_test(ompi_win_t *win, - int *flag) -{ - ompi_group_t *group; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - -#if !OMPI_ENABLE_PROGRESS_THREADS - opal_progress(); -#endif - - if (0 != (module->p2p_num_pending_in) || - 0 != (module->p2p_num_complete_msgs)) { - *flag = 0; - return OMPI_SUCCESS; - } - - *flag = 1; - - ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); - - OPAL_THREAD_LOCK(&(module->p2p_lock)); - group = module->p2p_pw_group; - module->p2p_pw_group = NULL; - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - - return OMPI_SUCCESS; -} - - -struct ompi_osc_pt2pt_pending_lock_t { - opal_list_item_t super; - ompi_proc_t *proc; - int32_t lock_type; -}; -typedef struct ompi_osc_pt2pt_pending_lock_t ompi_osc_pt2pt_pending_lock_t; -OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_lock_t, opal_list_item_t, - NULL, NULL); - - -int -ompi_osc_pt2pt_module_lock(int lock_type, - int target, - int assert, - ompi_win_t *win) -{ - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, target ); - - assert(lock_type != 0); - - /* set our mode on the window */ - ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: sending lock request to %d", - ompi_comm_rank(module->p2p_comm), - target)); - /* generate a lock request */ - ompi_osc_pt2pt_control_send(module, - proc, - OMPI_OSC_PT2PT_HDR_LOCK_REQ, - ompi_comm_rank(module->p2p_comm), - lock_type); - - if (ompi_comm_rank(module->p2p_comm) == target) { - /* If we're trying to lock locally, have to wait to actually - acquire the lock */ - OPAL_THREAD_LOCK(&module->p2p_lock); - while (module->p2p_lock_received_ack == 0) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - OPAL_THREAD_UNLOCK(&module->p2p_lock); - } - - /* return */ - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_module_unlock(int target, - ompi_win_t *win) -{ - int32_t out_count; - opal_list_item_t *item; - int ret; - ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, target ); - - OPAL_THREAD_LOCK(&module->p2p_lock); - while (0 == module->p2p_lock_received_ack) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - - module->p2p_lock_received_ack -= 1; - - /* start all the requests */ - ompi_osc_pt2pt_flip_sendreqs(module); - - /* try to start all the requests. 
We've copied everything we need - out of pending_sendreqs, so don't need the lock here */ - out_count = opal_list_get_size(&(module->p2p_copy_pending_sendreqs)); - - /* we want to send all the requests, plus we wait for one more - completion event for the control message ack from the unlocker - saying we're done */ - module->p2p_num_pending_out += (out_count + 1); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - /* send the unlock request */ - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: sending unlock request to %d with %d requests", - ompi_comm_rank(module->p2p_comm), target, - out_count)); - ompi_osc_pt2pt_control_send(module, - proc, - OMPI_OSC_PT2PT_HDR_UNLOCK_REQ, - ompi_comm_rank(module->p2p_comm), - out_count); - - while (NULL != - (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { - ompi_osc_pt2pt_sendreq_t *req = - (ompi_osc_pt2pt_sendreq_t*) item; - - ret = ompi_osc_pt2pt_sendreq_send(module, req); - - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - opal_output_verbose(5, ompi_osc_base_framework.framework_output, - "complete: failure in starting sendreq (%d). Will try later.", - ret); - opal_list_append(&(module->p2p_copy_pending_sendreqs), item); - } else if (OMPI_SUCCESS != ret) { - return ret; - } - } - - /* wait for all the requests */ - OPAL_THREAD_LOCK(&module->p2p_lock); - while (0 != module->p2p_num_pending_out) { - opal_condition_wait(&module->p2p_cond, &module->p2p_lock); - } - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: finished unlock to %d", - ompi_comm_rank(module->p2p_comm), target)); - - /* set our mode on the window */ - ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_passive_lock(ompi_osc_pt2pt_module_t *module, - int32_t origin, - int32_t lock_type) -{ - bool send_ack = false; - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, origin ); - ompi_osc_pt2pt_pending_lock_t *new_pending; - - OPAL_THREAD_LOCK(&(module->p2p_lock)); - if (lock_type == MPI_LOCK_EXCLUSIVE) { - if (module->p2p_lock_status == 0) { - module->p2p_lock_status = MPI_LOCK_EXCLUSIVE; - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: setting lock status to EXCLUSIVE (from %d)", - ompi_comm_rank(module->p2p_comm), origin)); - ompi_win_append_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); - send_ack = true; - } else { - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: queuing lock request from %d (type=%d)", - ompi_comm_rank(module->p2p_comm), origin, lock_type)); - new_pending = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t); - new_pending->proc = proc; - new_pending->lock_type = lock_type; - opal_list_append(&(module->p2p_locks_pending), &(new_pending->super)); - } - } else if (lock_type == MPI_LOCK_SHARED) { - if (module->p2p_lock_status != MPI_LOCK_EXCLUSIVE) { - module->p2p_lock_status = MPI_LOCK_SHARED; - module->p2p_shared_count++; - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: setting lock status to SHARED (from %d), count %d", - ompi_comm_rank(module->p2p_comm), origin, module->p2p_shared_count)); - ompi_win_append_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); - send_ack = true; - } else { - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: queuing lock request from %d (type=%d)", - ompi_comm_rank(module->p2p_comm), origin, lock_type)); - new_pending = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t); 
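/*
 * The passive-target code around this point grants an exclusive lock only
 * when the window is currently unlocked, lets shared locks stack while no
 * exclusive holder exists, and queues every other request for later.  The
 * decision logic, reduced to its core with illustrative types (this is not
 * the module's real state structure):
 */
#include <stdbool.h>
#include <mpi.h>

struct win_lock_state {
    int lock_status;    /* 0 = unlocked, otherwise MPI_LOCK_EXCLUSIVE or MPI_LOCK_SHARED */
    int shared_count;   /* number of active shared holders */
};

/* Returns true if the request can be granted immediately; false means queue it. */
static bool try_grant(struct win_lock_state *s, int requested_type)
{
    if (MPI_LOCK_EXCLUSIVE == requested_type) {
        if (0 == s->lock_status) {          /* window free: take it exclusively */
            s->lock_status = MPI_LOCK_EXCLUSIVE;
            return true;
        }
        return false;                       /* someone holds it: wait in the queue */
    }

    /* shared request: stack it unless an exclusive holder is active */
    if (MPI_LOCK_EXCLUSIVE != s->lock_status) {
        s->lock_status = MPI_LOCK_SHARED;
        s->shared_count++;
        return true;
    }
    return false;
}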
- new_pending->proc = proc; - new_pending->lock_type = lock_type; - opal_list_append(&(module->p2p_locks_pending), &(new_pending->super)); - } - } - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - if (send_ack) { - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: sending lock ack to %d", - ompi_comm_rank(module->p2p_comm), origin)); - ompi_osc_pt2pt_control_send(module, proc, - OMPI_OSC_PT2PT_HDR_LOCK_REQ, - ompi_comm_rank(module->p2p_comm), - OMPI_SUCCESS); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_pt2pt_passive_unlock(ompi_osc_pt2pt_module_t *module, - int32_t origin, - int32_t count) -{ - ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, origin ); - ompi_osc_pt2pt_pending_lock_t *new_pending = NULL; - - assert(module->p2p_lock_status != 0); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: received unlock request from %d with %d requests\n", - ompi_comm_rank(module->p2p_comm), - origin, count)); - - new_pending = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t); - new_pending->proc = proc; - new_pending->lock_type = 0; - OPAL_THREAD_LOCK(&(module->p2p_lock)); - module->p2p_num_pending_in += count; - opal_list_append(&module->p2p_unlocks_pending, &(new_pending->super)); - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - return ompi_osc_pt2pt_passive_unlock_complete(module); -} - - -int -ompi_osc_pt2pt_passive_unlock_complete(ompi_osc_pt2pt_module_t *module) -{ - ompi_osc_pt2pt_pending_lock_t *new_pending = NULL; - opal_list_t copy_unlock_acks; - - if (module->p2p_num_pending_in != 0) return OMPI_SUCCESS; - - OPAL_THREAD_LOCK(&(module->p2p_lock)); - if (module->p2p_num_pending_in != 0) { - OPAL_THREAD_UNLOCK(&module->p2p_lock); - return OMPI_SUCCESS; - } - - if (module->p2p_lock_status == MPI_LOCK_EXCLUSIVE) { - ompi_win_remove_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); - module->p2p_lock_status = 0; - } else { - module->p2p_shared_count -= opal_list_get_size(&module->p2p_unlocks_pending); - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: decrementing shared count to %d", - ompi_comm_rank(module->p2p_comm), - module->p2p_shared_count)); - if (module->p2p_shared_count == 0) { - ompi_win_remove_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); - module->p2p_lock_status = 0; - } - } - - OBJ_CONSTRUCT(©_unlock_acks, opal_list_t); - /* copy over any unlocks that have been satisfied (possibly - multiple if SHARED) */ - opal_list_join(©_unlock_acks, - opal_list_get_end(©_unlock_acks), - &module->p2p_unlocks_pending); - OPAL_THREAD_UNLOCK(&module->p2p_lock); - - /* issue whichever unlock acks we should issue */ - while (NULL != (new_pending = (ompi_osc_pt2pt_pending_lock_t*) - opal_list_remove_first(©_unlock_acks))) { - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: sending unlock ack to proc %d", - ompi_comm_rank(module->p2p_comm), - new_pending->proc->proc_name.vpid)); - ompi_osc_pt2pt_control_send(module, - new_pending->proc, - OMPI_OSC_PT2PT_HDR_UNLOCK_REPLY, - OMPI_SUCCESS, OMPI_SUCCESS); - OBJ_RELEASE(new_pending); - } - - OBJ_DESTRUCT(©_unlock_acks); - - /* if we were really unlocked, see if we have another lock request - we can satisfy */ - OPAL_THREAD_LOCK(&module->p2p_lock); - if (0 == module->p2p_lock_status) { - new_pending = (ompi_osc_pt2pt_pending_lock_t*) - opal_list_remove_first(&(module->p2p_locks_pending)); - if (NULL != new_pending) { - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d: sending lock ack to proc %d", - 
ompi_comm_rank(module->p2p_comm), - new_pending->proc->proc_name.vpid)); - ompi_win_append_mode(module->p2p_win, OMPI_WIN_EXPOSE_EPOCH); - /* set lock state and generate a lock request */ - module->p2p_lock_status = new_pending->lock_type; - if (MPI_LOCK_SHARED == new_pending->lock_type) { - module->p2p_shared_count++; - } - } - } else { - new_pending = NULL; - } - OPAL_THREAD_UNLOCK(&(module->p2p_lock)); - - if (NULL != new_pending) { - ompi_osc_pt2pt_control_send(module, - new_pending->proc, - OMPI_OSC_PT2PT_HDR_LOCK_REQ, - ompi_comm_rank(module->p2p_comm), - OMPI_SUCCESS); - OBJ_RELEASE(new_pending); - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/osc/rdma/Makefile.am b/ompi/mca/osc/rdma/Makefile.am index f79ed2fc78..42131ca639 100644 --- a/ompi/mca/osc/rdma/Makefile.am +++ b/ompi/mca/osc/rdma/Makefile.am @@ -15,22 +15,21 @@ # $HEADER$ # -pt2pt_sources = \ +rdma_sources = \ osc_rdma.h \ osc_rdma.c \ osc_rdma_comm.c \ osc_rdma_component.c \ osc_rdma_data_move.h \ osc_rdma_data_move.c \ + osc_rdma_frag.h \ + osc_rdma_frag.c \ osc_rdma_header.h \ - osc_rdma_longreq.h \ - osc_rdma_longreq.c \ osc_rdma_obj_convert.h \ - osc_rdma_replyreq.h \ - osc_rdma_replyreq.c \ - osc_rdma_sendreq.h \ - osc_rdma_sendreq.c \ - osc_rdma_sync.c + osc_rdma_request.h \ + osc_rdma_request.c \ + osc_rdma_active_target.c \ + osc_rdma_passive_target.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la @@ -46,9 +45,9 @@ endif mcacomponentdir = $(ompilibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_osc_rdma_la_SOURCES = $(pt2pt_sources) +mca_osc_rdma_la_SOURCES = $(rdma_sources) mca_osc_rdma_la_LDFLAGS = -module -avoid-version noinst_LTLIBRARIES = $(component_noinst) -libmca_osc_rdma_la_SOURCES = $(pt2pt_sources) +libmca_osc_rdma_la_SOURCES = $(rdma_sources) libmca_osc_rdma_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/osc/rdma/osc_rdma.c b/ompi/mca/osc/rdma/osc_rdma.c index 45d962d43f..cb5f462588 100644 --- a/ompi/mca/osc/rdma/osc_rdma.c +++ b/ompi/mca/osc/rdma/osc_rdma.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. @@ -7,8 +8,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -19,7 +21,6 @@ #include "ompi_config.h" #include "osc_rdma.h" -#include "osc_rdma_sendreq.h" #include "opal/threads/mutex.h" #include "ompi/win/win.h" @@ -30,108 +31,81 @@ int -ompi_osc_rdma_module_free(ompi_win_t *win) +ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len) { - int ret = OMPI_SUCCESS; - int i; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - - opal_output_verbose(1, ompi_osc_base_framework.framework_output, - "rdma component destroying window with id %d", - ompi_comm_get_cid(module->m_comm)); - - /* finish with a barrier */ - if (ompi_group_size(win->w_group) > 1) { - ret = module->m_comm->c_coll.coll_barrier(module->m_comm, - module->m_comm->c_coll.coll_barrier_module); - } - - /* remove from component information */ - OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); - opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.c_modules, - ompi_comm_get_cid(module->m_comm)); - OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); - - win->w_osc_module = NULL; - - OBJ_DESTRUCT(&module->m_unlocks_pending); - OBJ_DESTRUCT(&module->m_locks_pending); - OBJ_DESTRUCT(&module->m_queued_sendreqs); - OBJ_DESTRUCT(&module->m_copy_pending_sendreqs); - OBJ_DESTRUCT(&module->m_pending_sendreqs); - OBJ_DESTRUCT(&module->m_acc_lock); - OBJ_DESTRUCT(&module->m_cond); - OBJ_DESTRUCT(&module->m_lock); - - if (NULL != module->m_sc_remote_ranks) { - free(module->m_sc_remote_ranks); - } - if (NULL != module->m_sc_remote_active_ranks) { - free(module->m_sc_remote_active_ranks); - } - if (NULL != module->m_pending_buffers) { - free(module->m_pending_buffers); - } - if (NULL != module->m_fence_coll_counts) { - free(module->m_fence_coll_counts); - } - if (NULL != module->m_copy_num_pending_sendreqs) { - free(module->m_copy_num_pending_sendreqs); - } - if (NULL != module->m_num_pending_sendreqs) { - free(module->m_num_pending_sendreqs); - } - if (NULL != module->m_peer_info) { - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]); - } - free(module->m_peer_info); - } - if (NULL != module->m_comm) ompi_comm_free(&module->m_comm); - if (NULL != module) free(module); - - return ret; + return OMPI_SUCCESS; } int -ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info) +ompi_osc_rdma_detach(struct ompi_win_t *win, void *base) { - int i; - - if (NULL != peer_info->peer_btls) { - free(peer_info->peer_btls); - } - - if (NULL != peer_info->local_descriptors) { - for (i = 0 ; i < peer_info->local_num_btls ; ++i) { - if (NULL != peer_info->local_descriptors[i]) { - mca_bml_base_btl_t *bml_btl = peer_info->local_btls[i]; - mca_btl_base_module_t* btl = bml_btl->btl; - - btl->btl_free(btl, peer_info->local_descriptors[i]); - } - } - free(peer_info->local_descriptors); - } - - if (NULL != peer_info->local_registrations) { - for (i = 0 ; i < peer_info->local_num_btls ; ++i) { - if (NULL != peer_info->local_registrations[i]) { - mca_mpool_base_module_t *module = - peer_info->local_registrations[i]->mpool; - module->mpool_deregister(module, - peer_info->local_registrations[i]); - } - } - free(peer_info->local_registrations); - } - - if (NULL != peer_info->local_btls) { - free(peer_info->local_btls); - } - - memset(peer_info, 0, sizeof(ompi_osc_rdma_peer_info_t)); - return OMPI_SUCCESS; } + + +int +ompi_osc_rdma_free(ompi_win_t *win) +{ + int ret = OMPI_SUCCESS; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + opal_list_item_t *item; + + assert (NULL != module); + + 
opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "rdma component destroying window with id %d", + ompi_comm_get_cid(module->comm)); + + /* finish with a barrier */ + if (ompi_group_size(win->w_group) > 1) { + ret = module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + } + + /* remove from component information */ + OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); + opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules, + ompi_comm_get_cid(module->comm)); + OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); + + win->w_osc_module = NULL; + + OBJ_DESTRUCT(&module->outstanding_locks); + OBJ_DESTRUCT(&module->locks_pending); + OBJ_DESTRUCT(&module->acc_lock); + OBJ_DESTRUCT(&module->cond); + OBJ_DESTRUCT(&module->lock); + + /* it is erroneous to close a window with active operations on it so we should + * probably produce an error here instead of cleaning up */ + while (NULL != (item = opal_list_remove_first (&module->pending_acc))) { + OBJ_RELEASE(item); + } + + OBJ_DESTRUCT(&module->pending_acc); + + osc_rdma_request_gc_clean (module); + assert (0 == opal_list_get_size (&module->request_gc)); + OBJ_DESTRUCT(&module->request_gc); + + if (NULL != module->peers) { + free(module->peers); + } + if (NULL != module->passive_eager_send_active) free(module->passive_eager_send_active); + if (NULL != module->passive_incoming_frag_count) free(module->passive_incoming_frag_count); + if (NULL != module->passive_incoming_frag_signal_count) free(module->passive_incoming_frag_signal_count); + if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count); + if (NULL != module->incomming_buffer) free (module->incomming_buffer); + if (NULL != module->comm) ompi_comm_free(&module->comm); + if (NULL != module->free_after) free(module->free_after); + + if (NULL != module->frag_request) { + module->frag_request->req_complete_cb = NULL; + ompi_request_cancel (module->frag_request); + } + + free (module); + + return ret; +} diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index dc74600445..d316cd7d7f 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -1,19 +1,21 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2006 The Trustees of the University of Tennessee. * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. + * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -28,234 +30,194 @@ #include "ompi/win/win.h" #include "ompi/communicator/communicator.h" +#include "ompi/datatype/ompi_datatype.h" #include "ompi/request/request.h" #include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/bml/bml.h" +#include "ompi/memchecker.h" + +#include "osc_rdma_header.h" BEGIN_C_DECLS -struct ompi_osc_rdma_buffer_t { - mca_btl_base_descriptor_t* descriptor; - size_t remain_len; - mca_bml_base_btl_t *bml_btl; -}; -typedef struct ompi_osc_rdma_buffer_t ompi_osc_rdma_buffer_t; +struct ompi_osc_rdma_frag_t; struct ompi_osc_rdma_component_t { /** Extend the basic osc component interface */ ompi_osc_base_component_t super; - /** lock access to datastructures in the component structure */ - opal_mutex_t c_lock; + /** lock access to modules */ + opal_mutex_t lock; - /** List of ompi_osc_rdma_module_ts currently in existance. - Needed so that received fragments can be dispatched to the - correct module */ - opal_hash_table_t c_modules; + /** cid -> module mapping */ + opal_hash_table_t modules; - /** Lock for request management */ - opal_mutex_t c_request_lock; + /** module count */ + int module_count; - /** Condition variable for request management */ - opal_condition_t c_request_cond; - - /** free list of ompi_osc_rdma_sendreq_t structures */ - opal_free_list_t c_sendreqs; - /** free list of ompi_osc_rdma_replyreq_t structures */ - opal_free_list_t c_replyreqs; - /** free list of ompi_osc_rdma_longreq_t structures */ - opal_free_list_t c_longreqs; + /** free list of ompi_osc_rdma_frag_t structures */ + opal_free_list_t frags; - bool c_btl_registered; + /** Free list of requests */ + ompi_free_list_t requests; - uint32_t c_sequence_number; + /** RDMA component buffer size */ + unsigned int buffer_size; + + /** List of operations that need to be processed */ + opal_list_t pending_operations; + + /** Is the progress function enabled? */ + bool progress_enable; }; typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t; -struct ompi_osc_rdma_btl_t { - uint8_t peer_seg[MCA_BTL_SEG_MAX_SIZE]; - mca_bml_base_btl_t *bml_btl; - int rdma_order; - int32_t num_sent; +struct ompi_osc_rdma_peer_t { + /** Pointer to the current send fragment for each outgoing target */ + struct ompi_osc_rdma_frag_t *active_frag; + + /** Number of acks pending. New requests can not be sent out if there are + * acks pending (to fulfill the ordering constraints of accumulate) */ + uint32_t num_acks_pending; }; -typedef struct ompi_osc_rdma_btl_t ompi_osc_rdma_btl_t; - - -struct ompi_osc_rdma_peer_info_t { - uint64_t peer_base; - uint64_t peer_len; - - int peer_num_btls; - volatile int peer_index_btls; - ompi_osc_rdma_btl_t *peer_btls; - - int local_num_btls; - mca_bml_base_btl_t **local_btls; - mca_mpool_base_registration_t **local_registrations; - mca_btl_base_descriptor_t **local_descriptors; -}; -typedef struct ompi_osc_rdma_peer_info_t ompi_osc_rdma_peer_info_t; - - -struct ompi_osc_rdma_setup_info_t { - volatile int32_t num_btls_callin; - int32_t num_btls_expected; - volatile int32_t num_btls_outgoing; - opal_list_t *outstanding_btl_requests; -}; -typedef struct ompi_osc_rdma_setup_info_t ompi_osc_rdma_setup_info_t; +typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t; +#define SEQ_INVALID 0xFFFFFFFFFFFFFFFFULL +/** Module structure. 
Exactly one of these is associated with each
+    RDMA window */
 struct ompi_osc_rdma_module_t {
     /** Extend the basic osc module interface */
     ompi_osc_base_module_t super;

-    uint32_t m_sequence_number;
+    /** window should have accumulate ordering... */
+    bool accumulate_ordering;

-    /** lock access to data structures in the current module */
-    opal_mutex_t m_lock;
+    /** pointer to free on cleanup (may be NULL) */
+    void *free_after;

-    /** condition variable for access to current module */
-    opal_condition_t m_cond;
+    /** Base pointer for local window */
+    void *baseptr;

-    /** lock for "atomic" window updates from reductions */
-    opal_mutex_t m_acc_lock;
+    /** communicator created with this window. This is the cid used
+        in the component's modules mapping. */
+    ompi_communicator_t *comm;

-    /** pointer back to window */
-    ompi_win_t *m_win;
+    /** Local displacement unit. */
+    int disp_unit;

-    /** communicator created with this window */
-    ompi_communicator_t *m_comm;
+    /** Mutex lock protecting module data */
+    opal_mutex_t lock;

-    /** list of ompi_osc_rdma_sendreq_t structures, and includes all
-        requests for this access epoch that have not already been
-        started. m_lock must be held when modifying this field. */
-    opal_list_t m_pending_sendreqs;
+    /** condition variable associated with lock */
+    opal_condition_t cond;

-    /** list of unsigned int counters for the number of requests to a
-        particular rank in m_comm for this access epoc. m_lock
-        must be held when modifying this field */
-    unsigned int *m_num_pending_sendreqs;
+    /** lock for atomic window updates from reductions */
+    opal_mutex_t acc_lock;

-    /** For MPI_Fence synchronization, the number of messages to send
-        in epoch. For Start/Complete, the number of updates for this
-        Complete. For lock, the number of
-        messages waiting for completion on on the origin side. Not
-        protected by m_lock - must use atomic counter operations. */
-    int32_t m_num_pending_out;
+    /** peer data */
+    ompi_osc_rdma_peer_t *peers;

-    /** For MPI_Fence synchronization, the number of expected incoming
-        messages. For Post/Wait, the number of expected updates from
-        complete. For lock, the number of messages on the passive side
-        we are waiting for. Not protected by m_lock - must use
-        atomic counter operations. */
-    int32_t m_num_pending_in;
+    /** Number of communication fragments started for this epoch, by
+        peer. Not in peer data to make fence more manageable. */
+    int32_t *epoch_outgoing_frag_count;
+
+    /** List of full communication buffers queued to be sent. Should
+        be maintained in order (at least in per-target order). */
+    opal_list_t queued_frags;
+
+    /** cyclic counter for a unique tag for long messages. */
+    int tag_counter;
+
+    /* Number of outgoing fragments that have completed since the
+       beginning of time */
+    int32_t outgoing_frag_count;
+    /* Next outgoing fragment count at which we want a signal on cond */
+    int32_t outgoing_frag_signal_count;
+
+    /* Number of incoming fragments that have completed since the
+       beginning of time */
+    int32_t active_incoming_frag_count;
+    /* Next incoming buffer count at which we want a signal on cond */
+    int32_t active_incoming_frag_signal_count;
+
+    int32_t *passive_incoming_frag_count;
+    int32_t *passive_incoming_frag_signal_count;
+
+    /* Number of flush ack requests sent since beginning of time */
+    uint64_t flush_ack_requested_count;
+    /* Number of flush ack replies received since beginning of
+       time. cond should be signalled on every flush reply
+       received.
*/ + uint64_t flush_ack_received_count; + + /** True if the access epoch is a passive target access epoch */ + bool passive_target_access_epoch; + + /** start sending data eagerly */ + bool active_eager_send_active; + + bool *passive_eager_send_active; + + /* ********************* PWSC data ************************ */ + struct ompi_group_t *pw_group; + struct ompi_group_t *sc_group; /** Number of "ping" messages from the remote post group we've received */ - int32_t m_num_post_msgs; + int32_t num_post_msgs; /** Number of "count" messages from the remote complete group we've received */ - int32_t m_num_complete_msgs; - - /** cyclic counter for a unique tage for long messages. Not - protected by the m_lock - must use create_send_tag() to - create a send tag */ - volatile int32_t m_tag_counter; - - opal_list_t m_copy_pending_sendreqs; - unsigned int *m_copy_num_pending_sendreqs; - - opal_list_t m_queued_sendreqs; - - /** start sending data eagerly */ - bool m_eager_send_active; - bool m_eager_send_ok; - - /* RDMA data */ - bool m_use_rdma; - bool m_rdma_wait_completion; - ompi_osc_rdma_setup_info_t *m_setup_info; - ompi_osc_rdma_peer_info_t *m_peer_info; - int32_t m_rdma_num_pending; - - /*** buffering ***/ - bool m_use_buffers; - ompi_osc_rdma_buffer_t *m_pending_buffers; - - /* ********************* FENCE data ************************ */ - /* an array of ints, each containing the value - 1. */ - int *m_fence_coll_counts; - - /* ********************* PWSC data ************************ */ - struct ompi_group_t *m_pw_group; - struct ompi_group_t *m_sc_group; - bool *m_sc_remote_active_ranks; - int *m_sc_remote_ranks; + int32_t num_complete_msgs; /* ********************* LOCK data ************************ */ - int32_t m_lock_status; /* one of 0, MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED */ - int32_t m_shared_count; - opal_list_t m_locks_pending; - opal_list_t m_unlocks_pending; - int32_t m_lock_received_ack; + + /** Status of the local window lock. One of 0 (unlocked), + MPI_LOCK_EXCLUSIVE, or MPI_LOCK_SHARED. 
*/ + int lock_status; + /** number of peers who hold a shared lock on the local window */ + int32_t shared_count; + /** target side list of lock requests we couldn't satisfy yet */ + opal_list_t locks_pending; + + /** origin side list of locks currently outstanding */ + opal_list_t outstanding_locks; + + uint64_t lock_serial_number; + + unsigned char *incomming_buffer; + ompi_request_t *frag_request; + opal_list_t request_gc; + + /* enforce accumulate semantics */ + opal_atomic_lock_t accumulate_lock; + opal_list_t pending_acc; }; typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t; OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component; +struct ompi_osc_rdma_pending_t { + opal_list_item_t super; + ompi_osc_rdma_module_t *module; + int source; + ompi_osc_rdma_header_t header; +}; +typedef struct ompi_osc_rdma_pending_t ompi_osc_rdma_pending_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t); #define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module) -/* - * Component functions - */ -int ompi_osc_rdma_component_init(bool enable_progress_threads, - bool enable_mpi_threads); +int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len); +int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base); -int ompi_osc_rdma_component_finalize(void); +int ompi_osc_rdma_free(struct ompi_win_t *win); -int ompi_osc_rdma_component_query(struct ompi_win_t *win, - struct ompi_info_t *info, - struct ompi_communicator_t *comm); - -int ompi_osc_rdma_component_select(struct ompi_win_t *win, - struct ompi_info_t *info, - struct ompi_communicator_t *comm); - -/* helper function that properly sets up request handling */ -int ompi_osc_rdma_component_irecv(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *data); - -int ompi_osc_rdma_component_isend(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int dest, - int tag, - struct ompi_communicator_t *comm, - struct ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *data); - -int ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info); - -/* - * Module interface function types - */ -int ompi_osc_rdma_module_free(struct ompi_win_t *win); - -int ompi_osc_rdma_module_put(void *origin_addr, +int ompi_osc_rdma_put(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -264,7 +226,7 @@ int ompi_osc_rdma_module_put(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); -int ompi_osc_rdma_module_accumulate(void *origin_addr, +int ompi_osc_rdma_accumulate(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -274,7 +236,7 @@ int ompi_osc_rdma_module_accumulate(void *origin_addr, struct ompi_op_t *op, struct ompi_win_t *win); -int ompi_osc_rdma_module_get(void *origin_addr, +int ompi_osc_rdma_get(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, @@ -283,43 +245,418 @@ int ompi_osc_rdma_module_get(void *origin_addr, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); -int ompi_osc_rdma_module_fence(int assert, struct ompi_win_t *win); +int ompi_osc_rdma_compare_and_swap(void *origin_addr, + void *compare_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_win_t *win); -int ompi_osc_rdma_module_start(struct ompi_group_t *group, - int 
assert, +int ompi_osc_rdma_fetch_and_op(void *origin_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_op_t *op, struct ompi_win_t *win); -int ompi_osc_rdma_module_complete(struct ompi_win_t *win); -int ompi_osc_rdma_module_post(struct ompi_group_t *group, +int ompi_osc_rdma_get_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_rdma_rput(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_rdma_rget(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_rdma_raccumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_rdma_rget_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_rdma_fence(int assert, struct ompi_win_t *win); + +/* received a post message */ +int osc_rdma_incomming_post (ompi_osc_rdma_module_t *module); + +int ompi_osc_rdma_start(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win); +int ompi_osc_rdma_complete(struct ompi_win_t *win); + +int ompi_osc_rdma_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win); -int ompi_osc_rdma_module_wait(struct ompi_win_t *win); +int ompi_osc_rdma_wait(struct ompi_win_t *win); -int ompi_osc_rdma_module_test(struct ompi_win_t *win, +int ompi_osc_rdma_test(struct ompi_win_t *win, int *flag); -int ompi_osc_rdma_module_lock(int lock_type, +int ompi_osc_rdma_lock(int lock_type, int target, int assert, struct ompi_win_t *win); -int ompi_osc_rdma_module_unlock(int target, +int ompi_osc_rdma_unlock(int target, struct ompi_win_t *win); -/* - * passive side sync interface functions +int ompi_osc_rdma_lock_all(int assert, + struct ompi_win_t *win); + +int ompi_osc_rdma_unlock_all(struct ompi_win_t *win); + +int ompi_osc_rdma_sync(struct ompi_win_t *win); + +int ompi_osc_rdma_flush(int target, + struct ompi_win_t *win); +int ompi_osc_rdma_flush_all(struct ompi_win_t *win); +int ompi_osc_rdma_flush_local(int target, + struct ompi_win_t *win); +int ompi_osc_rdma_flush_local_all(struct ompi_win_t *win); + +int ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info); +int ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used); + +int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module, + void *buf, + size_t count, + struct ompi_datatype_t *datatype, + int 
src,
+                                  int tag,
+                                  struct ompi_communicator_t *comm);
+
+int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
+                                  void *buf,
+                                  size_t count,
+                                  struct ompi_datatype_t *datatype,
+                                  int dest,
+                                  int tag,
+                                  struct ompi_communicator_t *comm);
+
+/**
+ * ompi_osc_rdma_progress_pending_acc:
+ *
+ * @short Progress one pending accumulation or compare and swap operation.
+ *
+ * @param[in] module - OSC RDMA module
+ *
+ * @long If the accumulation lock can be acquired progress one pending
+ *       accumulate or compare and swap operation.
  */
-int ompi_osc_rdma_passive_lock(ompi_osc_rdma_module_t *module,
-                               int32_t origin,
-                               int32_t lock_type);
+int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);

-int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module,
-                                 int32_t origin,
-                                 int32_t count);

-int ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module);
+/**
+ * mark_incoming_completion:
+ *
+ * @short Increment incoming completion count.
+ *
+ * @param[in] module - OSC RDMA module
+ * @param[in] source - Passive target source or MPI_PROC_NULL (active target)
+ *
+ * @long This function increments either the passive or active incoming counts.
+ *       If the count reaches the signal count we signal the module's condition.
+ *       This function uses atomics if necessary so it is not necessary to hold
+ *       the module lock before calling this function.
+ */
+static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int source)
+{
+    if (MPI_PROC_NULL == source) {
+        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+                             "mark_incoming_completion marking active incoming complete. count = %d",
+                             (int) module->active_incoming_frag_count + 1));
+        OPAL_THREAD_ADD32(&module->active_incoming_frag_count, 1);
+        if (module->active_incoming_frag_count >= module->active_incoming_frag_signal_count) {
+            opal_condition_broadcast(&module->cond);
+        }
+    } else {
+        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+                             "mark_incoming_completion marking passive incoming complete. source = %d, count = %d",
+                             source, (int) module->passive_incoming_frag_count[source] + 1));
+        OPAL_THREAD_ADD32(module->passive_incoming_frag_count + source, 1);
+        if (module->passive_incoming_frag_count[source] >= module->passive_incoming_frag_signal_count[source]) {
+            opal_condition_broadcast(&module->cond);
+        }
+    }
+}
+/**
+ * mark_outgoing_completion:
+ *
+ * @short Increment outgoing count.
+ *
+ * @param[in] module - OSC RDMA module
+ *
+ * @long This function is used to signal that an outgoing send is complete. It
+ *       increments only the outgoing fragment count and signals the module
+ *       condition when the fragment count is >= the signal count. This function
+ *       uses atomics if necessary so it is not necessary to hold the module
+ *       lock before calling this function.
+ */
+static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
+{
+    OPAL_THREAD_ADD32(&module->outgoing_frag_count, 1);
+    if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) {
+        opal_condition_broadcast(&module->cond);
+    }
+}
+
+/**
+ * ompi_osc_signal_outgoing:
+ *
+ * @short Increment outgoing signal counters.
+ *
+ * @param[in] module - OSC RDMA module
+ * @param[in] target - Passive target rank or MPI_PROC_NULL (active target)
+ * @param[in] count - Number of outgoing messages to signal.
+ *
+ * @long This function uses atomics if necessary so it is not necessary to hold
+ *       the module lock before calling this function.
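+ *
+ *       For example, the long-message put path in osc_rdma_comm.c calls
+ *       ompi_osc_signal_outgoing (module, target, 1) before starting the isend,
+ *       and the completion callback (ompi_osc_rdma_req_comm_complete) later
+ *       calls mark_outgoing_completion (module) to balance the counter.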
+ */ +static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int target, int count) +{ + OPAL_THREAD_ADD32(&module->outgoing_frag_signal_count, count); + if (MPI_PROC_NULL != target) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_signal_outgoing_passive: target = %d, count = %d, total = %d", target, + count, module->epoch_outgoing_frag_count[target] + count)); + OPAL_THREAD_ADD32(module->epoch_outgoing_frag_count + target, count); + } +} + +/** + * osc_rdma_copy_on_recv: + * + * @short Helper function. Copies data from source to target through the + * convertor. + * + * @param[in] target - destination for the data + * @param[in] source - packed data + * @param[in] source_len - length of source buffer + * @param[in] proc - proc that packed the source data + * @param[in] count - count of datatype items + * @param[in] datatype - datatype used for unpacking + * + * @long This functions unpacks data from the source buffer into the target + * buffer. The copy is done with a convertor generated from proc, + * datatype, and count. + */ +static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc, + int count, ompi_datatype_t *datatype) +{ + opal_convertor_t convertor; + uint32_t iov_count = 1; + struct iovec iov; + size_t max_data; + + /* create convertor */ + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + + /* initialize convertor */ + opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, &datatype->super, count, target, + 0, &convertor); + + iov.iov_len = source_len; + iov.iov_base = (IOVBASE_TYPE *) source; + max_data = iov.iov_len; + MEMCHECKER(memchecker_convertor_call(&opal_memchecker_base_mem_defined, &convertor)); + + opal_convertor_unpack (&convertor, &iov, &iov_count, &max_data); + + MEMCHECKER(memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, &convertor)); + + OBJ_DESTRUCT(&convertor); +} + +/** + * osc_rdma_copy_for_send: + * + * @short: Helper function. Copies data from source to target through the + * convertor. + * + * @param[in] target - destination for the packed data + * @param[in] target_len - length of the target buffer + * @param[in] source - original data + * @param[in] proc - proc this data will be sent to + * @param[in] count - count of datatype items + * @param[in] datatype - datatype used for packing + * + * @long This functions packs data from the source buffer into the target + * buffer. The copy is done with a convertor generated from proc, + * datatype, and count. + */ +static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc, + int count, ompi_datatype_t *datatype) +{ + opal_convertor_t convertor; + uint32_t iov_count = 1; + struct iovec iov; + size_t max_data; + + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + + opal_convertor_copy_and_prepare_for_send(proc->proc_convertor, &datatype->super, + count, source, 0, &convertor); + + iov.iov_len = target_len; + iov.iov_base = (IOVBASE_TYPE *) target; + opal_convertor_pack(&convertor, &iov, &iov_count, &max_data); + + OBJ_DESTRUCT(&convertor); +} + +/** + * osc_rdma_request_gc_clean: + * + * @short Release finished PML requests. + * + * @param[in] module - OSC RDMA module + * + * @long This function exists because it is not possible to free a PML request + * from a request completion callback. We instead put the request on the + * module's garbage collection list and release it at a later time. 
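+ *
+ *       Requests are appended to request_gc by completion callbacks such as
+ *       ompi_osc_rdma_req_comm_complete and are freed here, outside of the
+ *       callback context, with ompi_request_free.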
+ */
+static inline void osc_rdma_request_gc_clean (ompi_osc_rdma_module_t *module)
+{
+    ompi_request_t *request;
+
+    while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&module->request_gc))) {
+        ompi_request_free (&request);
+    }
+}
+
+#define OSC_RDMA_FRAG_TAG   0x10000
+#define OSC_RDMA_FRAG_MASK  0x0ffff
+
+/**
+ * get_tag:
+ *
+ * @short Get a send/recv tag for large memory operations.
+ *
+ * @param[in] module - OSC RDMA module
+ *
+ * @long This function acquires a 16-bit tag for use with large memory operations. The
+ *       tag will be odd or even depending on if this is in a passive target access
+ *       or not.
+ */
+static inline int get_tag(ompi_osc_rdma_module_t *module)
+{
+    /* the LSB of the tag is used by the receiver to determine if the
+       message is a passive or active target (i.e., where to mark
+       completion). */
+    int tmp = module->tag_counter + !!(module->passive_target_access_epoch);
+
+    module->tag_counter = (module->tag_counter + 2) & OSC_RDMA_FRAG_MASK;
+
+    return tmp;
+}
+
+/**
+ * ompi_osc_rdma_accumulate_lock:
+ *
+ * @short Internal function that spins until the accumulation lock has
+ *        been acquired.
+ *
+ * @param[in] module - OSC RDMA module
+ *
+ * @returns 0
+ *
+ * @long This function blocks until the accumulation lock has been acquired. This
+ *       behavior is only acceptable from a user-level call as blocking in a
+ *       callback may cause deadlock. If a callback needs the accumulate lock and
+ *       it is not available it should be placed on the pending_acc list of the
+ *       module. It will be released by ompi_osc_rdma_accumulate_unlock().
+ */
+static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
+{
+    while (opal_atomic_trylock (&module->accumulate_lock)) {
+        opal_progress ();
+    }
+
+    return 0;
+}
+
+/**
+ * ompi_osc_rdma_accumulate_trylock:
+ *
+ * @short Try to acquire the accumulation lock.
+ *
+ * @param[in] module - OSC RDMA module
+ *
+ * @returns 0 if the accumulation lock was acquired
+ * @returns 1 if the lock was not available
+ *
+ * @long This function will try to acquire the accumulation lock. This function
+ *       is safe to call from a callback.
+ */
+static inline int ompi_osc_rdma_accumulate_trylock (ompi_osc_rdma_module_t *module)
+{
+    return opal_atomic_trylock (&module->accumulate_lock);
+}
+
+/**
+ * ompi_osc_rdma_accumulate_unlock:
+ *
+ * @short Unlock the accumulation lock and release a pending accumulation operation.
+ *
+ * @param[in] module - OSC RDMA module
+ *
+ * @long This function unlocks the accumulation lock and releases a single pending
+ *       accumulation operation if one exists. This function may be called recursively.
+ */
+static inline void ompi_osc_rdma_accumulate_unlock (ompi_osc_rdma_module_t *module)
+{
+    opal_atomic_unlock (&module->accumulate_lock);
+    if (0 != opal_list_get_size (&module->pending_acc)) {
+        ompi_osc_rdma_progress_pending_acc (module);
+    }
+}

 END_C_DECLS
diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c
new file mode 100644
index 0000000000..5bc7f880a0
--- /dev/null
+++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "osc_rdma.h" +#include "osc_rdma_header.h" +#include "osc_rdma_data_move.h" +#include "osc_rdma_frag.h" + +#include "mpi.h" +#include "opal/runtime/opal_progress.h" +#include "opal/threads/mutex.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/osc/base/base.h" + +static int* +get_comm_ranks(ompi_osc_rdma_module_t *module, + ompi_group_t *sub_group) +{ + int *ranks1 = NULL, *ranks2 = NULL; + bool success = false; + int i, ret; + + ranks1 = malloc(sizeof(int) * ompi_group_size(sub_group)); + if (NULL == ranks1) goto cleanup; + ranks2 = malloc(sizeof(int) * ompi_group_size(sub_group)); + if (NULL == ranks2) goto cleanup; + + for (i = 0 ; i < ompi_group_size(sub_group) ; ++i) { + ranks1[i] = i; + } + + ret = ompi_group_translate_ranks(sub_group, + ompi_group_size(sub_group), + ranks1, + module->comm->c_local_group, + ranks2); + if (OMPI_SUCCESS != ret) goto cleanup; + + success = true; + + cleanup: + if (NULL != ranks1) free(ranks1); + if (!success) { + if (NULL != ranks2) free(ranks2); + ranks2 = NULL; + } + + return ranks2; +} + +int +ompi_osc_rdma_fence(int assert, ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + uint32_t incoming_reqs; + int ret = OMPI_SUCCESS; + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: fence start")); + + /* short-circuit the noprecede case */ + if (0 != (assert & MPI_MODE_NOPRECEDE)) { + ret = module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: fence end (short circuit)")); + return ret; + } + + /* try to start all the requests. */ + ret = ompi_osc_rdma_frag_flush_all(module); + if (OMPI_SUCCESS != ret) goto cleanup; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: fence done sending")); + + /* find out how much data everyone is going to send us. 
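+       Each rank contributes its per-peer epoch_outgoing_frag_count vector;
+       the reduce-scatter sums entry i across all ranks and delivers that sum
+       to rank i, which is the number of fragments this rank should expect.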
*/ + ret = module->comm->c_coll.coll_reduce_scatter_block (module->epoch_outgoing_frag_count, + &incoming_reqs, 1, MPI_UINT32_T, + MPI_SUM, module->comm, + module->comm->c_coll.coll_reduce_scatter_block_module); + if (OMPI_SUCCESS != ret) goto cleanup; + + OPAL_THREAD_LOCK(&module->lock); + + bzero(module->epoch_outgoing_frag_count, + sizeof(uint32_t) * ompi_comm_size(module->comm)); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: fence expects %d requests", + incoming_reqs)); + + /* set our complete condition for incoming requests */ + module->active_incoming_frag_signal_count += incoming_reqs; + + /* wait for completion */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count || + module->active_incoming_frag_count < module->active_incoming_frag_signal_count) { + opal_condition_wait(&module->cond, &module->lock); + } + + ret = OMPI_SUCCESS; + + if (0 == (assert & MPI_MODE_NOSUCCEED)) { + module->active_eager_send_active = true; + } + + cleanup: + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: fence end: %d", ret)); + + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int +ompi_osc_rdma_start(ompi_group_t *group, + int assert, + ompi_win_t *win) +{ + int ret = OMPI_SUCCESS; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_start entering...")); + + /* save the group */ + OBJ_RETAIN(group); + ompi_group_increment_proc_count(group); + + OPAL_THREAD_LOCK(&module->lock); + + /* ensure we're not already in a start */ + if (NULL != module->sc_group) { + ret = MPI_ERR_RMA_SYNC; + goto cleanup; + } + module->sc_group = group; + + /* disable eager sends until we've receved the proper number of + post messages, at which time we know all our peers are ready to + receive messages. */ + module->active_eager_send_active = false; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "num_post_msgs = %d", module->num_post_msgs)); + + /* possible we've already received a couple in messages, so + add however many we're going to wait for */ + module->num_post_msgs -= ompi_group_size(module->sc_group); + + /* if we've already received all the post messages, we can eager + send. Otherwise, eager send will be enabled when + numb_post_messages reaches 0 */ + if (0 == module->num_post_msgs) { + module->active_eager_send_active = true; + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_start complete")); + + OPAL_THREAD_UNLOCK(&module->lock); + return OMPI_SUCCESS; + + cleanup: + OPAL_THREAD_UNLOCK(&module->lock); + ompi_group_decrement_proc_count(group); + OBJ_RELEASE(group); + + return ret; +} + + +int +ompi_osc_rdma_complete(ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_header_complete_t complete_req; + int ret = OMPI_SUCCESS; + int i; + int *ranks = NULL; + ompi_group_t *group; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_complete entering...")); + + ranks = get_comm_ranks(module, module->sc_group); + if (NULL == ranks) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + OPAL_THREAD_LOCK(&module->lock); + + /* wait for all the post messages */ + while (0 != module->num_post_msgs) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "waiting for post messages. 
num_post_msgs = %d", module->num_post_msgs)); + opal_condition_wait(&module->cond, &module->lock); + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_complete sending complete message")); + + /* for each process in group, send a control message with number + of updates coming, then start all the requests. Note that the + control send is processed as another message in a fragment, so + this might get queued until the flush_all (which is fine). + + At the same time, clean out the outgoing count for the next + round. */ + OPAL_THREAD_UNLOCK(&module->lock); + for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) { + complete_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_COMPLETE; + complete_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]]; + module->epoch_outgoing_frag_count[ranks[i]] = 0; + + ret = ompi_osc_rdma_control_send(module, + ranks[i], + &complete_req, + sizeof(ompi_osc_rdma_header_complete_t)); + if (OMPI_SUCCESS != ret) goto cleanup; + } + OPAL_THREAD_LOCK(&module->lock); + + /* start all requests */ + ret = ompi_osc_rdma_frag_flush_all(module); + if (OMPI_SUCCESS != ret) goto cleanup; + + /* wait for outgoing requests to complete. Don't wait for incoming, as + we're only completing the access epoch, not the exposure epoch */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count) { + opal_condition_wait(&module->cond, &module->lock); + } + + /* phase 1 cleanup group */ + group = module->sc_group; + module->sc_group = NULL; + + /* unlock here, as group cleanup can take a while... */ + OPAL_THREAD_UNLOCK(&(module->lock)); + + /* phase 2 cleanup group */ + ompi_group_decrement_proc_count(group); + OBJ_RELEASE(group); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_complete complete")); + free (ranks); + + return OMPI_SUCCESS; + + cleanup: + if (NULL != ranks) free(ranks); + + OPAL_THREAD_UNLOCK(&(module->lock)); + + return ret; +} + + +int +ompi_osc_rdma_post(ompi_group_t *group, + int assert, + ompi_win_t *win) +{ + int *ranks; + int ret = OMPI_SUCCESS; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_header_post_t post_req; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_post entering...")); + + /* save the group */ + OBJ_RETAIN(group); + ompi_group_increment_proc_count(group); + + OPAL_THREAD_LOCK(&(module->lock)); + + /* ensure we're not already in a post */ + if (NULL != module->pw_group) { + OPAL_THREAD_UNLOCK(&(module->lock)); + return MPI_ERR_RMA_SYNC; + } + module->pw_group = group; + + /* Update completion counter. Can't have received any completion + messages yet; complete won't send a completion header until + we've sent a post header. 
*/ + module->num_complete_msgs = -ompi_group_size(module->pw_group); + + OPAL_THREAD_UNLOCK(&(module->lock)); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "sending post messages")); + + ranks = get_comm_ranks(module, module->pw_group); + if (NULL == ranks) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* send a hello counter to everyone in group */ + for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) { + post_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_POST; + post_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + post_req.windx = ompi_comm_get_cid(module->comm); + + /* we don't want to send any data, since we're the exposure + epoch only, so use an unbuffered send */ + ret = ompi_osc_rdma_control_send_unbuffered(module, ranks[i], &post_req, + sizeof(ompi_osc_rdma_header_post_t)); + if (OMPI_SUCCESS != ret) { + break; + } + } + + free (ranks); + + return ret; +} + + +int +ompi_osc_rdma_wait(ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_group_t *group; + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_wait entering...")); + + OPAL_THREAD_LOCK(&module->lock); + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_wait active_incoming_frag_count = %d, active_incoming_frag_signal_count = %d, num_complete_msgs = %d", + (int) module->active_incoming_frag_count, (int) module->active_incoming_frag_count, module->num_complete_msgs)); + + while (0 != module->num_complete_msgs || + module->active_incoming_frag_count < module->active_incoming_frag_signal_count) { + opal_condition_wait(&module->cond, &module->lock); + } + + group = module->pw_group; + module->pw_group = NULL; + OPAL_THREAD_UNLOCK(&module->lock); + + ompi_group_decrement_proc_count(group); + OBJ_RELEASE(group); + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_wait complete")); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_rdma_test(ompi_win_t *win, + int *flag) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_group_t *group; + int ret = OMPI_SUCCESS; + +#if !OMPI_ENABLE_PROGRESS_THREADS + opal_progress(); +#endif + + OPAL_THREAD_LOCK(&(module->lock)); + + if (0 != module->num_complete_msgs || + module->active_incoming_frag_count < module->active_incoming_frag_signal_count) { + *flag = 0; + ret = OMPI_SUCCESS; + goto cleanup; + } else { + *flag = 1; + + group = module->pw_group; + module->pw_group = NULL; + + OPAL_THREAD_UNLOCK(&(module->lock)); + + ompi_group_decrement_proc_count(group); + OBJ_RELEASE(group); + + return OMPI_SUCCESS; + } + + cleanup: + OPAL_THREAD_UNLOCK(&(module->lock)); + + return ret; +} + diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 3642295cc4..fc1d518439 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -1,19 +1,21 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights - * reserved. 
+ * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -21,275 +23,961 @@ #include "mpi.h" #include +#include #include "osc_rdma.h" -#include "osc_rdma_sendreq.h" +#include "osc_rdma_request.h" #include "osc_rdma_header.h" +#include "osc_rdma_frag.h" #include "osc_rdma_data_move.h" -#include "ompi/memchecker.h" -#include "ompi/mca/osc/base/osc_base_obj_convert.h" + #include "opal_stdint.h" +#include "ompi/memchecker.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "ompi/mca/osc/base/base.h" + +/* progress an OSC request */ +static int ompi_osc_rdma_req_comm_complete (ompi_request_t *request) +{ + ompi_osc_rdma_request_t *rdma_request = (ompi_osc_rdma_request_t *) request->req_complete_cb_data; + ompi_osc_rdma_module_t *module = rdma_request->module; + + OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_req_comm_complete called tag = %d", + request->req_status.MPI_TAG)); + + mark_outgoing_completion (module); + + OPAL_THREAD_LOCK(&ompi_request_lock); + if (0 == --rdma_request->outstanding_requests) { + ompi_osc_rdma_request_complete (rdma_request, request->req_status.MPI_ERROR); + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + /* put this request on the garbage colletion list */ + OPAL_THREAD_LOCK(&module->lock); + opal_list_append (&module->request_gc, (opal_list_item_t *) request); + OPAL_THREAD_UNLOCK(&module->lock); + + return OMPI_SUCCESS; +} + +/* self communication optimizations */ +static inline int ompi_osc_rdma_put_self (void *source, int source_count, ompi_datatype_t *source_datatype, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) target_disp * module->disp_unit); + int ret; + + ret = ompi_datatype_sndrcv (source, source_count, source_datatype, + target, target_count, target_datatype); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + if (request) { + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } + + return OMPI_SUCCESS; +} + +static inline int ompi_osc_rdma_get_self (void *target, int target_count, ompi_datatype_t *target_datatype, + OPAL_PTRDIFF_TYPE source_disp, int source_count, ompi_datatype_t *source_datatype, + ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) +{ + void *source = (unsigned char*) module->baseptr + + ((unsigned long) source_disp * module->disp_unit); + int ret; + + ret = ompi_datatype_sndrcv (source, source_count, source_datatype, + target, target_count, target_datatype); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + if (request) { + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } + + return OMPI_SUCCESS; +} + +static inline int ompi_osc_rdma_cas_self (void *source, void *compare, void *result, ompi_datatype_t *datatype, + OPAL_PTRDIFF_TYPE target_disp, ompi_osc_rdma_module_t *module) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) target_disp * module->disp_unit); + + ompi_osc_rdma_accumulate_lock (module); + + memcpy (result, target, datatype->super.size); + + if (0 == memcmp (compare, target, datatype->super.size)) { + memcpy (target, source, 
datatype->super.size); + } + + ompi_osc_rdma_accumulate_unlock (module); + + return OMPI_SUCCESS; +} + +static inline int ompi_osc_rdma_acc_self (void *source, int source_count, ompi_datatype_t *source_datatype, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) target_disp * module->disp_unit); + int ret; + + ompi_osc_rdma_accumulate_lock (module); + + if (&ompi_mpi_op_replace.op != op) { + ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op); + } else { + ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); + } + + ompi_osc_rdma_accumulate_unlock (module); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_acc_self: failed performing accumulate operation. ret = %d", ret)); + return ret; + } + + if (request) { + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } + + return OMPI_SUCCESS; +} + +static inline int ompi_osc_rdma_gacc_self (void *source, int source_count, ompi_datatype_t *source_datatype, + void *result, int result_count, ompi_datatype_t *result_datatype, + OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, + ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) target_disp * module->disp_unit); + int ret; + + ompi_osc_rdma_accumulate_lock (module); + + do { + ret = ompi_datatype_sndrcv (target, target_count, target_datatype, + result, result_count, result_datatype); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_gacc_self: failed copying to the target buffer. ret = %d", ret)); + break; + } + + if (&ompi_mpi_op_no_op.op != op) { + if (&ompi_mpi_op_replace.op != op) { + ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op); + } else { + ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); + } + } + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret)); + break; + } + } while (0); + + ompi_osc_rdma_accumulate_unlock (module); + + if (request) { + /* NTH: is it ok to use an ompi error code here? 
*/ + ompi_osc_rdma_request_complete (request, ret); + } + + return OMPI_SUCCESS; +} +/* end: self communication optimizations */ + +static inline int ompi_osc_rdma_put_w_req (void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, + int target, OPAL_PTRDIFF_TYPE target_disp, + int target_count, struct ompi_datatype_t *target_dt, + ompi_win_t *win, ompi_osc_rdma_request_t *request) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); + ompi_osc_rdma_frag_t *frag; + ompi_osc_rdma_header_put_t *header; + size_t ddt_len, payload_len, frag_len; + bool is_long_msg = false; + const void *packed_ddt; + int tag, ret; + char *ptr; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "put: 0x%lx, %d, %s, %d, %d, %d, %s, %s", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, win->w_name)); + + /* short-circuit case */ + if (0 == origin_count || 0 == target_count) { + if (request) { + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } + + return OMPI_SUCCESS; + } + + /* optimize self communication. TODO: optimize local communication */ + if (ompi_comm_rank (module->comm) == target) { + return ompi_osc_rdma_put_self (origin_addr, origin_count, origin_dt, + target_disp, target_count, target_dt, + module, request); + } + + /* Compute datatype and payload lengths. Note that the datatype description + * must fit in a single buffer */ + ddt_len = ompi_datatype_pack_description_length(target_dt); + payload_len = origin_dt->super.size * origin_count; + frag_len = sizeof(ompi_osc_rdma_header_put_t) + ddt_len + payload_len; + + OPAL_THREAD_LOCK(&module->lock); + + ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + frag_len = sizeof(ompi_osc_rdma_header_put_t) + ddt_len; + ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); + if (OMPI_SUCCESS != ret) { + OPAL_THREAD_UNLOCK(&module->lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + is_long_msg = true; + tag = get_tag(module); + } + + OPAL_THREAD_UNLOCK(&module->lock); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: put long protocol: %d", (int) is_long_msg)); + + header = (ompi_osc_rdma_header_put_t *) ptr; + header->base.flags = 0; + header->len = frag_len; + header->count = target_count; + header->displacement = target_disp; + ptr += sizeof(ompi_osc_rdma_header_put_t); + + ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt); + memcpy((unsigned char*) ptr, packed_ddt, ddt_len); + ptr += ddt_len; + + if (!is_long_msg) { + header->base.type = OMPI_OSC_RDMA_HDR_TYPE_PUT; + + osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, + origin_dt); + + /* the user's buffer is no longer needed so mark the request as + * complete. 
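+         * (the payload has already been packed into the fragment by the
+         * osc_rdma_copy_for_send call above, so it is safe to complete now)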
*/ + if (request) { + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } + } else { + header->base.type = OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG; + + header->tag = tag; + + /* increase the outgoing signal count */ + ompi_osc_signal_outgoing (module, target, 1); + + if (request) { + request->outstanding_requests = 1; + ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_dt, + target, tag, module->comm, ompi_osc_rdma_req_comm_complete, + request); + } else { + ret = ompi_osc_rdma_component_isend (module,origin_addr, origin_count, origin_dt, target, tag, + module->comm); + } + if (OMPI_SUCCESS != ret) goto cleanup; + } + + cleanup: + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID; + } + + OPAL_THREAD_LOCK(&module->lock); + + ret = ompi_osc_rdma_frag_finish(module, frag); + + if (request) { + /* need to flush now in case the caller decides to wait on the request */ + ompi_osc_rdma_frag_flush_target (module, target); + } + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + +int +ompi_osc_rdma_put(void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, + int target, OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, ompi_win_t *win) +{ + return ompi_osc_rdma_put_w_req (origin_addr, origin_count, + origin_dt, target, target_disp, + target_count, target_dt, win, NULL); +} + static int -enqueue_sendreq(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t *sendreq) +ompi_osc_rdma_accumulate_w_req (void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, + int target, OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, ompi_win_t *win, + ompi_osc_rdma_request_t *request) { - OPAL_THREAD_LOCK(&(module->m_lock)); - opal_list_append(&(module->m_pending_sendreqs), - (opal_list_item_t*) sendreq); - module->m_num_pending_sendreqs[sendreq->req_target_rank]++; - OPAL_THREAD_UNLOCK(&(module->m_lock)); + int ret; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); + ompi_osc_rdma_frag_t *frag; + ompi_osc_rdma_header_acc_t *header; + size_t ddt_len, payload_len, frag_len; + char *ptr; + bool is_long_msg = false; + const void *packed_ddt; + int tag; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "acc: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, op->o_name, + win->w_name)); + + /* short-circuit case */ + if (0 == origin_count || 0 == target_count) { + if (request) { + ompi_osc_rdma_request_complete (request, MPI_SUCCESS); + } + + return OMPI_SUCCESS; + } + + /* optimize the self case. TODO: optimize the local case */ + if (ompi_comm_rank (module->comm) == target) { + return ompi_osc_rdma_acc_self (origin_addr, origin_count, origin_dt, + target_disp, target_count, target_dt, + op, module, request); + } + + /* Compute datatype and payload lengths. 
Note that the datatype description
+     * must fit in a single frag */
+    ddt_len = ompi_datatype_pack_description_length(target_dt);
+    payload_len = origin_dt->super.size * origin_count;
+
+    OPAL_THREAD_LOCK(&module->lock);
+
+    frag_len = sizeof(ompi_osc_rdma_header_acc_t) + ddt_len + payload_len;
+    ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
+    if (OMPI_SUCCESS != ret) {
+        frag_len = sizeof(ompi_osc_rdma_header_acc_t) + ddt_len;
+        ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
+        if (OMPI_SUCCESS != ret) {
+            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
+                                 "acc: out of resource error while trying to allocate a fragment"));
+            OPAL_THREAD_UNLOCK(&module->lock);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        is_long_msg = true;
+        tag = get_tag (module);
+    }
+
+    OPAL_THREAD_UNLOCK(&module->lock);
+
+    header = (ompi_osc_rdma_header_acc_t*) ptr;
+    header->base.flags = 0;
+    header->len = frag_len;
+    header->count = target_count;
+    header->displacement = target_disp;
+    header->op = op->o_f_to_c_index;
+    ptr += sizeof(ompi_osc_rdma_header_acc_t);
+
+    ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt);
+    memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
+    ptr += ddt_len;
+
+    if (!is_long_msg) {
+        header->base.type = OMPI_OSC_RDMA_HDR_TYPE_ACC;
+
+        osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc,
+                                origin_count, origin_dt);
+
+        /* the user's buffer is no longer needed so mark the request as
+         * complete. */
+        if (request) {
+            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
+        }
+    } else {
+        header->base.type = OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG;
+
+        header->tag = tag;
+
+        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
+                             "acc: starting long accumulate with tag %d", tag));
+
+        /* increment the outgoing send count */
+        ompi_osc_signal_outgoing (module, target, 1);
+
+        if (request) {
+            request->outstanding_requests = 1;
+            ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_dt,
+                                            target, tag, module->comm, ompi_osc_rdma_req_comm_complete,
+                                            request);
+        } else {
+            ret = ompi_osc_rdma_component_isend (module, origin_addr, origin_count, origin_dt, target, tag,
+                                                 module->comm);
+        }
+
+        if (OMPI_SUCCESS != ret) {
+            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
+                                 "acc: send failed with error %d", ret));
+        }
+    }
+
+    /* mark the fragment as valid */
+    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
+        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
+    }
+
+    OPAL_THREAD_LOCK(&module->lock);
+
+    ret = ompi_osc_rdma_frag_finish(module, frag);
+
+    if (request) {
+        /* need to flush now in case the caller decides to wait on the request */
+        ompi_osc_rdma_frag_flush_target (module, target);
+    }
+
+    OPAL_THREAD_UNLOCK(&module->lock);
+
+    return ret;
+}
+
+int
+ompi_osc_rdma_accumulate(void *origin_addr, int origin_count,
+                         struct ompi_datatype_t *origin_dt,
+                         int target, OPAL_PTRDIFF_TYPE target_disp,
+                         int target_count,
+                         struct ompi_datatype_t *target_dt,
+                         struct ompi_op_t *op, ompi_win_t *win)
+{
+    return ompi_osc_rdma_accumulate_w_req (origin_addr, origin_count, origin_dt,
+                                           target, target_disp, target_count,
+                                           target_dt, op, win, NULL);
+}
+
+int ompi_osc_rdma_compare_and_swap (void *origin_addr, void *compare_addr,
+                                    void *result_addr, struct ompi_datatype_t *dt,
+                                    int target, OPAL_PTRDIFF_TYPE target_disp,
+                                    struct ompi_win_t *win)
+{
+    ompi_osc_rdma_module_t *module = GET_MODULE(win);
+    ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target);
+    ompi_osc_rdma_frag_t
*frag; + ompi_osc_rdma_header_cswap_t *header; + size_t ddt_len, payload_len, frag_len; + ompi_osc_rdma_request_t *request; + const void *packed_ddt; + int ret, tag; + char *ptr; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "cswap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, %s", + (unsigned long) origin_addr, (unsigned long) compare_addr, + (unsigned long) result_addr, dt->name, target, (int) target_disp, + win->w_name)); + + /* optimize self case. TODO: optimize local case */ + if (ompi_comm_rank (module->comm) == target) { + return ompi_osc_rdma_cas_self (origin_addr, compare_addr, result_addr, dt, target_disp, + module); + } + + /* compare-and-swaps are always request based, so that we know where to land the data */ + OMPI_OSC_RDMA_REQUEST_ALLOC(win, request); + if (NULL == request) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + request->type = OMPI_OSC_RDMA_HDR_TYPE_CSWAP; + request->origin_addr = origin_addr; + request->internal = true; + OBJ_RETAIN(dt); + request->origin_dt = dt; + + /* Compute datatype and payload lengths. Note that the datatype description + * must fit in a single frag. It should be small in this case. */ + ddt_len = ompi_datatype_pack_description_length(dt); + + /* we need to send both the origin and compare buffers */ + payload_len = dt->super.size * 2; + + OPAL_THREAD_LOCK(&module->lock); + + frag_len = sizeof(ompi_osc_rdma_header_cswap_t) + ddt_len + payload_len; + ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); + if (OMPI_SUCCESS != ret) { + OPAL_THREAD_UNLOCK(&module->lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + tag = get_tag (module); + ompi_osc_signal_outgoing (module, target, 1); + + header = (ompi_osc_rdma_header_cswap_t *) ptr; + header->base.type = OMPI_OSC_RDMA_HDR_TYPE_CSWAP; + header->base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + header->len = frag_len; + header->displacement = target_disp; + header->tag = tag; + ptr += sizeof(ompi_osc_rdma_header_cswap_t); + + ret = ompi_datatype_get_pack_description(dt, &packed_ddt); + memcpy((unsigned char*) ptr, packed_ddt, ddt_len); + ptr += ddt_len; + + /* pack the origin and compare data */ + osc_rdma_copy_for_send (ptr, dt->super.size, origin_addr, proc, 1, dt); + ptr += dt->super.size; + osc_rdma_copy_for_send (ptr, dt->super.size, compare_addr, proc, 1, dt); + + request->outstanding_requests = 1; + ret = ompi_osc_rdma_irecv_w_cb (result_addr, 1, dt, target, tag, module->comm, + NULL, ompi_osc_rdma_req_comm_complete, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_THREAD_UNLOCK(&module->lock); + return ret; + } + + ret = ompi_osc_rdma_frag_finish(module, frag); + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int ompi_osc_rdma_fetch_and_op(void *origin_addr, void *result_addr, + struct ompi_datatype_t *dt, int target, + OPAL_PTRDIFF_TYPE target_disp, struct ompi_op_t *op, + struct ompi_win_t *win) +{ + return ompi_osc_rdma_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt, + target, target_disp, 1, dt, op, win); +} + +int ompi_osc_rdma_rput(void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, + int target, OPAL_PTRDIFF_TYPE target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, struct ompi_request_t **request) +{ + ompi_osc_rdma_request_t *rdma_request; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rput: 0x%lx, %d, %s, %d, %d, %d, %s, %s", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + 
target_count, target_dt->name, win->w_name)); + + OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request); + if (NULL == rdma_request) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* short-circuit case */ + if (0 == origin_count || 0 == target_count) { + ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS); + *request = &rdma_request->super; + return OMPI_SUCCESS; + } + + rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_PUT; + + ret = ompi_osc_rdma_put_w_req (origin_addr, origin_count, origin_dt, target, + target_disp, target_count, target_dt, win, + rdma_request); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request); + return ret; + } + + *request = (ompi_request_t *) rdma_request; + + return OMPI_SUCCESS; +} + +static inline int ompi_osc_rdma_rget_internal (void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, bool release_req, + struct ompi_request_t **request) +{ + int ret, tag; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_frag_t *frag; + ompi_osc_rdma_header_get_t *header; + size_t ddt_len, frag_len; + char *ptr; + const void *packed_ddt; + ompi_osc_rdma_request_t *rdma_request; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "get: 0x%lx, %d, %s, %d, %d, %d, %s, %s", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, win->w_name)); + + /* gets are always request based, so that we know where to land the data */ + OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request); + if (NULL == rdma_request) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + rdma_request->internal = release_req; + + /* short-circuit case */ + if (0 == origin_count || 0 == target_count) { + ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS); + *request = &rdma_request->super; + return OMPI_SUCCESS; + } + + /* optimize self communication. TODO: optimize local communication */ + if (ompi_comm_rank (module->comm) == target) { + *request = &rdma_request->super; + return ompi_osc_rdma_get_self (origin_addr, origin_count, origin_dt, + target_disp, target_count, target_dt, + module, rdma_request); + } + + rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET; + rdma_request->origin_addr = origin_addr; + rdma_request->origin_count = origin_count; + OBJ_RETAIN(origin_dt); + rdma_request->origin_dt = origin_dt; + + /* Compute datatype length. 
Note that the datatype description
+     * must fit in a single frag */
+    ddt_len = ompi_datatype_pack_description_length(target_dt);
+
+    OPAL_THREAD_LOCK(&module->lock);
+
+    frag_len = sizeof(ompi_osc_rdma_header_get_t) + ddt_len;
+    ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
+    if (OMPI_SUCCESS != ret) {
+        OPAL_THREAD_UNLOCK(&module->lock);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    tag = get_tag (module);
+
+    /* for bookkeeping the get is "outgoing" */
+    ompi_osc_signal_outgoing (module, target, 1);
+
+    OPAL_THREAD_UNLOCK(&module->lock);
+
+    header = (ompi_osc_rdma_header_get_t*) ptr;
+    header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET;
+    header->base.flags = 0;
+    header->len = frag_len;
+    header->count = target_count;
+    header->displacement = target_disp;
+    header->tag = tag;
+    ptr += sizeof(ompi_osc_rdma_header_get_t);
+
+    ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt);
+    memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
+    ptr += ddt_len;
+
+    if (OMPI_SUCCESS == ret) {
+        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
+        *request = &rdma_request->super;
+    }
+
+    /* TODO -- store the request somewhere so we can cancel it on error */
+    rdma_request->outstanding_requests = 1;
+    ret = ompi_osc_rdma_irecv_w_cb (origin_addr, origin_count, origin_dt, target, tag,
+                                    module->comm, NULL, ompi_osc_rdma_req_comm_complete, rdma_request);
+
+    OPAL_THREAD_LOCK(&module->lock);
+    ret = ompi_osc_rdma_frag_finish(module, frag);
+
+    if (!release_req) {
+        /* need to flush now in case the caller decides to wait on the request */
+        ompi_osc_rdma_frag_flush_target (module, target);
+    }
+    OPAL_THREAD_UNLOCK(&module->lock);
+
+    return ret;
+}
+
+int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
+                        int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
+                        struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
+                        struct ompi_request_t **request)
+{
+    /* NTH: need to check for a passive access epoch and return the appropriate error if necessary */
+    return ompi_osc_rdma_rget_internal (origin_addr, origin_count, origin_dt, target, target_disp,
+                                        target_count, target_dt, win, false, request);
+}
+
+
+int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
+                       int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
+                       struct ompi_datatype_t *target_dt, struct ompi_win_t *win)
+{
+    ompi_request_t *request;
+
+    return ompi_osc_rdma_rget_internal (origin_addr, origin_count, origin_dt, target, target_disp,
+                                        target_count, target_dt, win, true, &request);
+}
+
+int ompi_osc_rdma_raccumulate(void *origin_addr, int origin_count,
+                              struct ompi_datatype_t *origin_dt, int target,
+                              OPAL_PTRDIFF_TYPE target_disp, int target_count,
+                              struct ompi_datatype_t *target_dt, struct ompi_op_t *op,
+                              struct ompi_win_t *win, struct ompi_request_t **request)
+{
+    ompi_osc_rdma_request_t *rdma_request;
+    int ret;
+
+    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
+                         "raccumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s",
+                         (unsigned long) origin_addr, origin_count,
+                         origin_dt->name, target, (int) target_disp,
+                         target_count, target_dt->name, op->o_name,
+                         win->w_name));
+
+    OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
+    if (NULL == rdma_request) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* short-circuit case */
+    if (0 == origin_count || 0 == target_count) {
+        ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
+        *request = (ompi_request_t *) rdma_request;
+        return
OMPI_SUCCESS; + } + + rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_ACC; + + ret = ompi_osc_rdma_accumulate_w_req (origin_addr, origin_count, origin_dt, target, + target_disp, target_count, target_dt, op, win, + rdma_request); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request); + return ret; + } + + *request = (ompi_request_t *) rdma_request; return OMPI_SUCCESS; } -int -ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count, +static inline +int ompi_osc_rdma_rget_accumulate_internal (void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, MPI_Aint target_disp, + int target_count, struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, struct ompi_win_t *win, + bool release_req, struct ompi_request_t **request) +{ + int ret; + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target_rank); + ompi_osc_rdma_frag_t *frag; + ompi_osc_rdma_header_get_acc_t *header; + size_t ddt_len, payload_len, frag_len; + char *ptr; + bool is_long_msg = false; + const void *packed_ddt; + int tag; + ompi_osc_rdma_request_t *rdma_request; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, 0x%x, %d, %d, %s, %s, %s", + (unsigned long) origin_addr, origin_count, origin_datatype->name, + (unsigned long) result_addr, result_count, result_datatype->name, + target_rank, (int) target_disp, target_count, target_datatype->name, + op->o_name, win->w_name)); + + /* get_accumulates are always request based, so that we know where to land the data */ + OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request); + if (OPAL_UNLIKELY(NULL == rdma_request)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + rdma_request->internal = release_req; + + /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */ + if (0 == result_count || 0 == target_count) { + ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS); + *request = &rdma_request->super; + return OMPI_SUCCESS; + } + + /* optimize the self case. TODO: optimize the local case */ + if (ompi_comm_rank (module->comm) == target_rank) { + *request = &rdma_request->super; + return ompi_osc_rdma_gacc_self (origin_addr, origin_count, origin_datatype, + result_addr, result_count, result_datatype, + target_disp, target_count, target_datatype, + op, module, rdma_request); + } + + rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC; + rdma_request->origin_addr = origin_addr; + rdma_request->origin_count = origin_count; + OBJ_RETAIN(origin_datatype); + rdma_request->origin_dt = origin_datatype; + + /* Compute datatype and payload lengths. 
Note that the datatype description + * must fit in a single frag */ + ddt_len = ompi_datatype_pack_description_length(target_datatype); + + if (&ompi_mpi_op_no_op.op != op) { + payload_len = origin_datatype->super.size * origin_count; + } else { + payload_len = 0; + } + + OPAL_THREAD_LOCK(&module->lock); + + frag_len = sizeof(*header) + ddt_len + payload_len; + ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr); + if (OMPI_SUCCESS != ret) { + frag_len = sizeof(*header) + ddt_len; + ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr); + if (OMPI_SUCCESS != ret) { + OPAL_THREAD_UNLOCK(&module->lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + is_long_msg = true; + } + + tag = get_tag (module); + + /* If this is a long message then we need two completions before the + * request is complete (1 for the send, 1 for the receive) */ + rdma_request->outstanding_requests = 1 + is_long_msg; + + /* increment the number of outgoing fragments */ + ompi_osc_signal_outgoing (module, target_rank, rdma_request->outstanding_requests); + + OPAL_THREAD_UNLOCK(&module->lock); + + header = (ompi_osc_rdma_header_get_acc_t *) ptr; + header->base.flags = 0; + header->len = frag_len; + header->count = target_count; + header->displacement = target_disp; + header->op = op->o_f_to_c_index; + header->tag = tag; + ptr = (char *)(header + 1); + + ret = ompi_datatype_get_pack_description(target_datatype, &packed_ddt); + memcpy((unsigned char*) ptr, packed_ddt, ddt_len); + ptr += ddt_len; + + ret = ompi_osc_rdma_irecv_w_cb (result_addr, result_count, result_datatype, target_rank, tag, + module->comm, NULL, ompi_osc_rdma_req_comm_complete, rdma_request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto cleanup; + } + + if (!is_long_msg) { + header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC; + + if (&ompi_mpi_op_no_op.op != op) { + osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, + origin_datatype); + } + } else { + header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG; + + ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_datatype, target_rank, + tag, module->comm, ompi_osc_rdma_req_comm_complete, rdma_request); + if (OMPI_SUCCESS != ret) goto cleanup; + } + + cleanup: + if (OMPI_SUCCESS == ret) { + header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID; + *request = (ompi_request_t *) rdma_request; + } + + OPAL_THREAD_LOCK(&module->lock); + ret = ompi_osc_rdma_frag_finish(module, frag); + + if (!release_req) { + /* need to flush now in case the caller decides to wait on the request */ + ompi_osc_rdma_frag_flush_target (module, target_rank); + } + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + +int ompi_osc_rdma_get_accumulate(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - struct ompi_op_t *op, ompi_win_t *win) + void *result_addr, int result_count, + struct ompi_datatype_t *result_dt, + int target, MPI_Aint target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win) { - int ret; - ompi_osc_rdma_sendreq_t *sendreq; - ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_request_t *request; - if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) && - (!module->m_sc_remote_active_ranks[target])) { - return MPI_ERR_RMA_SYNC; - } - - if (OMPI_WIN_FENCE & ompi_win_get_mode(win)) { - /* well, we're definitely in an access epoch now */ - 
ompi_win_set_mode(win, OMPI_WIN_FENCE | OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - - /* shortcut 0 count case */ - if (0 == origin_count || 0 == target_count) { - return OMPI_SUCCESS; - } - - /* create sendreq */ - ret = ompi_osc_rdma_sendreq_alloc_init(OMPI_OSC_RDMA_ACC, - origin_addr, - origin_count, - origin_dt, - target, - target_disp, - target_count, - target_dt, - module, - &sendreq); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (OMPI_SUCCESS != ret) return ret; - - sendreq->req_op_id = op->o_f_to_c_index; - - if (module->m_eager_send_active) { - /* accumulate semantics require send to self, which is bloody - expensive with the extra copies. Put a shortcut in for the - common case. */ - if (target == ompi_comm_rank(sendreq->req_module->m_comm) && - ompi_datatype_is_contiguous_memory_layout(sendreq->req_target_datatype, - sendreq->req_target_count) && - !opal_convertor_need_buffers(&sendreq->req_origin_convertor) && - 0 == OPAL_THREAD_TRYLOCK(&module->m_acc_lock)) { - void *target_buffer = (unsigned char*) module->m_win->w_baseptr + - ((unsigned long) target_disp * - module->m_win->w_disp_unit); - - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = sendreq->req_origin_bytes_packed; - - iov.iov_len = max_data; - iov.iov_base = NULL; - ret = opal_convertor_pack(&sendreq->req_origin_convertor, - &iov, &iov_count, - &max_data); - if (ret < 0) { - OPAL_THREAD_UNLOCK(&module->m_acc_lock); - return OMPI_ERR_FATAL; - } - - ret = ompi_osc_base_process_op(target_buffer, - iov.iov_base, - max_data, - target_dt, - target_count, - op); - /* unlock the window for accumulates */ - OPAL_THREAD_UNLOCK(&module->m_acc_lock); - ompi_osc_rdma_sendreq_free(sendreq); - return ret; - } - - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out += 1; - module->m_num_pending_sendreqs[sendreq->req_target_rank] += 1; - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - ret = ompi_osc_rdma_sendreq_send(module, sendreq); - - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out -= 1; - opal_list_append(&(module->m_pending_sendreqs), - (opal_list_item_t*) sendreq); - OPAL_THREAD_UNLOCK(&module->m_lock); - ret = OMPI_SUCCESS; - } - } else { - /* enqueue sendreq */ - ret = enqueue_sendreq(module, sendreq); - } - - return ret; + return ompi_osc_rdma_rget_accumulate_internal (origin_addr, origin_count, origin_dt, + result_addr, result_count, result_dt, + target, target_disp, target_count, + target_dt, op, win, true, &request); } -int -ompi_osc_rdma_module_get(void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt, - int target, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - ompi_win_t *win) +int ompi_osc_rdma_rget_accumulate(void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, + void *result_addr, int result_count, + struct ompi_datatype_t *result_dt, + int target, MPI_Aint target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win, + ompi_request_t **request) { - int ret; - ompi_osc_rdma_sendreq_t *sendreq; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - - if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) && - (!module->m_sc_remote_active_ranks[target])) { - return MPI_ERR_RMA_SYNC; - } - - if (OMPI_WIN_FENCE & ompi_win_get_mode(win)) { - /* well, we're definitely in an access epoch now */ - 
ompi_win_set_mode(win, OMPI_WIN_FENCE | OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - - /* shortcut 0 count case */ - if (0 == origin_count || 0 == target_count) { - return OMPI_SUCCESS; - } - - /* create sendreq */ - ret = ompi_osc_rdma_sendreq_alloc_init(OMPI_OSC_RDMA_GET, - origin_addr, - origin_count, - origin_dt, - target, - target_disp, - target_count, - target_dt, - module, - &sendreq); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (OMPI_SUCCESS != ret) return ret; - - if (module->m_eager_send_active) { - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out += 1; - module->m_num_pending_sendreqs[sendreq->req_target_rank] += 1; - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - ret = ompi_osc_rdma_sendreq_send(module, sendreq); - - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out -= 1; - opal_list_append(&(module->m_pending_sendreqs), - (opal_list_item_t*) sendreq); - OPAL_THREAD_UNLOCK(&module->m_lock); - ret = OMPI_SUCCESS; - } - } else { - /* enqueue sendreq */ - ret = enqueue_sendreq(module, sendreq); - } - - return ret; -} - - -int -ompi_osc_rdma_module_put(void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, ompi_win_t *win) -{ - int ret; - ompi_osc_rdma_sendreq_t *sendreq; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - - if ((OMPI_WIN_STARTED & ompi_win_get_mode(win)) && - (!module->m_sc_remote_active_ranks[target])) { - return MPI_ERR_RMA_SYNC; - } - - if (OMPI_WIN_FENCE & ompi_win_get_mode(win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(win, OMPI_WIN_FENCE | OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - - /* shortcut 0 count case */ - if (0 == origin_count || 0 == target_count) { - return OMPI_SUCCESS; - } - - /* create sendreq */ - ret = ompi_osc_rdma_sendreq_alloc_init(OMPI_OSC_RDMA_PUT, - origin_addr, - origin_count, - origin_dt, - target, - target_disp, - target_count, - target_dt, - module, - &sendreq); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - if (OMPI_SUCCESS != ret) return ret; - - if (module->m_eager_send_active) { - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out += 1; - module->m_num_pending_sendreqs[sendreq->req_target_rank] += 1; - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - ret = ompi_osc_rdma_sendreq_send(module, sendreq); - - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out -= 1; - opal_list_append(&(module->m_pending_sendreqs), - (opal_list_item_t*) sendreq); - OPAL_THREAD_UNLOCK(&module->m_lock); - ret = OMPI_SUCCESS; - } - } else { - /* enqueue sendreq */ - ret = enqueue_sendreq(module, sendreq); - } - - return ret; + return ompi_osc_rdma_rget_accumulate_internal (origin_addr, origin_count, origin_dt, + result_addr, result_count, result_dt, + target, target_disp, target_count, + target_dt, op, win, false, request); } diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 486a9493d8..458e8d3cec 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -9,10 +9,11 @@ * University of Stuttgart. All rights reserved. 
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2008 University of Houston. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -25,11 +26,9 @@ #include #include "osc_rdma.h" -#include "osc_rdma_sendreq.h" -#include "osc_rdma_replyreq.h" -#include "osc_rdma_header.h" #include "osc_rdma_data_move.h" -#include "osc_rdma_obj_convert.h" +#include "osc_rdma_frag.h" +#include "osc_rdma_request.h" #include "opal/threads/condition.h" #include "opal/threads/mutex.h" @@ -42,26 +41,28 @@ #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "ompi/mca/btl/btl.h" -#include "ompi/mca/bml/bml.h" -#include "ompi/mca/bml/base/base.h" #include "ompi/mca/pml/pml.h" +static int component_open(void); static int component_register(void); -static void component_fragment_cb(struct mca_btl_base_module_t *btl, - mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t *descriptor, - void *cbdata); -static int setup_rdma(ompi_osc_rdma_module_t *module); +static int component_init(bool enable_progress_threads, bool enable_mpi_threads); +static int component_finalize(void); +static int component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor); +static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor, int *model); ompi_osc_rdma_component_t mca_osc_rdma_component = { { /* ompi_osc_base_component_t */ { /* ompi_base_component_t */ - OMPI_OSC_BASE_VERSION_2_0_0, + OMPI_OSC_BASE_VERSION_3_0_0, "rdma", OMPI_MAJOR_VERSION, /* MCA component major version */ OMPI_MINOR_VERSION, /* MCA component minor version */ OMPI_RELEASE_VERSION, /* MCA component release version */ - NULL, + component_open, NULL, NULL, component_register @@ -70,35 +71,59 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = { /* The component is not checkpoint ready */ MCA_BASE_METADATA_PARAM_NONE }, - ompi_osc_rdma_component_init, - ompi_osc_rdma_component_query, - ompi_osc_rdma_component_select, - ompi_osc_rdma_component_finalize + component_init, + component_query, + component_select, + component_finalize } }; ompi_osc_rdma_module_t ompi_osc_rdma_module_template = { { - ompi_osc_rdma_module_free, + NULL, /* shared_query */ - ompi_osc_rdma_module_put, - ompi_osc_rdma_module_get, - ompi_osc_rdma_module_accumulate, + ompi_osc_rdma_attach, + ompi_osc_rdma_detach, + ompi_osc_rdma_free, - ompi_osc_rdma_module_fence, + ompi_osc_rdma_put, + ompi_osc_rdma_get, + ompi_osc_rdma_accumulate, + ompi_osc_rdma_compare_and_swap, + ompi_osc_rdma_fetch_and_op, + ompi_osc_rdma_get_accumulate, - ompi_osc_rdma_module_start, - ompi_osc_rdma_module_complete, - ompi_osc_rdma_module_post, - ompi_osc_rdma_module_wait, - ompi_osc_rdma_module_test, + ompi_osc_rdma_rput, + ompi_osc_rdma_rget, + ompi_osc_rdma_raccumulate, + ompi_osc_rdma_rget_accumulate, - ompi_osc_rdma_module_lock, - ompi_osc_rdma_module_unlock, + ompi_osc_rdma_fence, + + ompi_osc_rdma_start, + ompi_osc_rdma_complete, + ompi_osc_rdma_post, + ompi_osc_rdma_wait, + 
ompi_osc_rdma_test, + + ompi_osc_rdma_lock, + ompi_osc_rdma_unlock, + ompi_osc_rdma_lock_all, + ompi_osc_rdma_unlock_all, + + ompi_osc_rdma_sync, + ompi_osc_rdma_flush, + ompi_osc_rdma_flush_all, + ompi_osc_rdma_flush_local, + ompi_osc_rdma_flush_local_all, + + ompi_osc_rdma_set_info, + ompi_osc_rdma_get_info } }; +bool ompi_osc_rdma_no_locks; /* look up parameters for configuring this window. The code first looks in the info structure passed by the user, then through mca @@ -140,58 +165,17 @@ check_config_value_bool(char *key, ompi_info_t *info) return flag_value[0]; } -static bool ompi_osc_rdma_eager_send; -static bool ompi_osc_rdma_use_buffers; -static bool ompi_osc_rdma_use_rdma; -static bool ompi_osc_rdma_rdma_completion_wait; -static bool ompi_osc_rdma_no_locks; + +static int +component_open(void) +{ + return OMPI_SUCCESS; +} + static int component_register(void) { - ompi_osc_rdma_eager_send = true; - (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, - "eager_send", - "Attempt to start data movement during communication " - "call, instead of at synchrnoization time. " - "Info key of same name overrides this value.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_osc_rdma_eager_send); - - ompi_osc_rdma_use_buffers = true; - (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, - "use_buffers", - "Coalesce messages during an epoch to reduce " - "network utilization. Info key of same name " - "overrides this value.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_osc_rdma_use_buffers); - - ompi_osc_rdma_use_rdma = false; - (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, - "use_rdma", - "Use real RDMA operations to transfer data. " - "Info key of same name overrides this value.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &ompi_osc_rdma_use_rdma); - - ompi_osc_rdma_rdma_completion_wait = true; - (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, - "rdma_completion_wait", - "Wait for all completion of rdma events before " - "sending acknowledgment. 
Info key of same name "
-                                           "overrides this value.",
-                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
-                                           OPAL_INFO_LVL_9,
-                                           MCA_BASE_VAR_SCOPE_READONLY,
-                                           &ompi_osc_rdma_rdma_completion_wait);
-
     ompi_osc_rdma_no_locks = false;
     (void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
                                            "no_locks",
@@ -203,100 +187,155 @@ component_register(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &ompi_osc_rdma_no_locks);
 
+    mca_osc_rdma_component.buffer_size = 8192;
+    (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
+                                            "Data transfers smaller than this limit may be coalesced before "
+                                            "being transferred (default: 8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
+                                            NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_osc_rdma_component.buffer_size);
+
     return OMPI_SUCCESS;
 }
 
-int
-ompi_osc_rdma_component_init(bool enable_progress_threads,
-                             bool enable_mpi_threads)
+static int component_progress (void)
 {
-    if (!mca_bml_base_inited()) return OMPI_ERROR;
+    ompi_osc_rdma_pending_t *pending, *next;
 
-    OBJ_CONSTRUCT(&mca_osc_rdma_component.c_lock, opal_mutex_t);
+    if (0 == opal_list_get_size (&mca_osc_rdma_component.pending_operations)) {
+        return 0;
+    }
 
-    OBJ_CONSTRUCT(&mca_osc_rdma_component.c_modules,
+    /* process one incoming request */
+    OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
+    OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_rdma_component.pending_operations, ompi_osc_rdma_pending_t) {
+        int ret;
+
+        switch (pending->header.base.type) {
+        case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ:
+            ret = ompi_osc_rdma_process_flush (pending->module, pending->source,
+                                               &pending->header.flush);
+            break;
+        case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ:
+            ret = ompi_osc_rdma_process_unlock (pending->module, pending->source,
+                                                &pending->header.unlock);
+            break;
+        default:
+            /* shouldn't happen */
+            assert (0);
+            abort ();
+        }
+
+        if (OMPI_SUCCESS == ret) {
+            opal_list_remove_item (&mca_osc_rdma_component.pending_operations, &pending->super);
+            OBJ_RELEASE(pending);
+        }
+    }
+    OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
+
+    return 1;
+}
+
+static int
+component_init(bool enable_progress_threads,
+               bool enable_mpi_threads)
+{
+    int ret;
+
+    OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&mca_osc_rdma_component.pending_operations, opal_list_t);
+
+    OBJ_CONSTRUCT(&mca_osc_rdma_component.modules,
                   opal_hash_table_t);
-    opal_hash_table_init(&mca_osc_rdma_component.c_modules, 2);
+    opal_hash_table_init(&mca_osc_rdma_component.modules, 2);
 
-    OBJ_CONSTRUCT(&mca_osc_rdma_component.c_request_lock,
-                  opal_mutex_t);
-    OBJ_CONSTRUCT(&mca_osc_rdma_component.c_request_cond,
-                  opal_condition_t);
+    mca_osc_rdma_component.progress_enable = false;
+    mca_osc_rdma_component.module_count = 0;
 
-    OBJ_CONSTRUCT(&mca_osc_rdma_component.c_sendreqs, opal_free_list_t);
-    opal_free_list_init(&mca_osc_rdma_component.c_sendreqs,
-                        sizeof(ompi_osc_rdma_sendreq_t),
-                        OBJ_CLASS(ompi_osc_rdma_sendreq_t),
-                        1, -1, 1);
+    OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t);
+    ret = opal_free_list_init(&mca_osc_rdma_component.frags,
+                              sizeof(ompi_osc_rdma_frag_t),
+                              OBJ_CLASS(ompi_osc_rdma_frag_t),
+                              1, -1, 1);
+    if (OMPI_SUCCESS != ret) {
+        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
+                            "%s:%d: ompi_free_list_init failed: %d",
+                            __FILE__, __LINE__, ret);
+        return ret;
+    }
 
-    OBJ_CONSTRUCT(&mca_osc_rdma_component.c_replyreqs, opal_free_list_t);
-    opal_free_list_init(&mca_osc_rdma_component.c_replyreqs,
-                        sizeof(ompi_osc_rdma_replyreq_t),
-                        
OBJ_CLASS(ompi_osc_rdma_replyreq_t), - 1, -1, 1); + OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, ompi_free_list_t); + ret = ompi_free_list_init(&mca_osc_rdma_component.requests, + sizeof(ompi_osc_rdma_request_t), + OBJ_CLASS(ompi_osc_rdma_request_t), + 0, -1, 32, NULL); + if (OMPI_SUCCESS != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: ompi_free_list_init failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } - OBJ_CONSTRUCT(&mca_osc_rdma_component.c_longreqs, opal_free_list_t); - opal_free_list_init(&mca_osc_rdma_component.c_longreqs, - sizeof(ompi_osc_rdma_longreq_t), - OBJ_CLASS(ompi_osc_rdma_longreq_t), - 1, -1, 1); - - mca_osc_rdma_component.c_btl_registered = false; - - mca_osc_rdma_component.c_sequence_number = 0; - - return OMPI_SUCCESS; + return ret; } int -ompi_osc_rdma_component_finalize(void) +component_finalize(void) { size_t num_modules; + if (mca_osc_rdma_component.progress_enable) { + opal_progress_unregister (component_progress); + } + if (0 != - (num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.c_modules))) { + (num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) { opal_output(ompi_osc_base_framework.framework_output, "WARNING: There were %d Windows created but not freed.", (int) num_modules); } - mca_bml.bml_register(MCA_BTL_TAG_OSC_RDMA, NULL, NULL); - - OBJ_DESTRUCT(&mca_osc_rdma_component.c_longreqs); - OBJ_DESTRUCT(&mca_osc_rdma_component.c_replyreqs); - OBJ_DESTRUCT(&mca_osc_rdma_component.c_sendreqs); - OBJ_DESTRUCT(&mca_osc_rdma_component.c_request_cond); - OBJ_DESTRUCT(&mca_osc_rdma_component.c_request_lock); - OBJ_DESTRUCT(&mca_osc_rdma_component.c_modules); - OBJ_DESTRUCT(&mca_osc_rdma_component.c_lock); + OBJ_DESTRUCT(&mca_osc_rdma_component.frags); + OBJ_DESTRUCT(&mca_osc_rdma_component.modules); + OBJ_DESTRUCT(&mca_osc_rdma_component.lock); + OBJ_DESTRUCT(&mca_osc_rdma_component.requests); + OBJ_DESTRUCT(&mca_osc_rdma_component.pending_operations); return OMPI_SUCCESS; } -int -ompi_osc_rdma_component_query(ompi_win_t *win, - ompi_info_t *info, - ompi_communicator_t *comm) +static int +component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor) { - /* if we inited, then the BMLs are available and we have a path to - each peer. 
Return slightly higher priority than the - point-to-point code */ + if (MPI_WIN_FLAVOR_SHARED == flavor) return -1; + return 10; } -int -ompi_osc_rdma_component_select(ompi_win_t *win, - ompi_info_t *info, - ompi_communicator_t *comm) +static int +component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor, int *model) { ompi_osc_rdma_module_t *module = NULL; - int ret, i; - char *tmp; + int ret; + char *name; + bool no_locks = false; - /* create module structure */ + /* We don't support shared windows; that's for the sm onesided + component */ + if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED; + + if (check_config_value_bool("no_locks", info)) { + no_locks = true; + ompi_osc_rdma_no_locks = true; + } + + /* create module structure with all fields initialized to zero */ module = (ompi_osc_rdma_module_t*) calloc(1, sizeof(ompi_osc_rdma_module_t)); if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; @@ -305,1076 +344,166 @@ ompi_osc_rdma_component_select(ompi_win_t *win, memcpy(module, &ompi_osc_rdma_module_template, sizeof(ompi_osc_base_module_t)); - /* initialize the module part */ - OBJ_CONSTRUCT(&module->m_lock, opal_mutex_t); - OBJ_CONSTRUCT(&module->m_cond, opal_condition_t); - OBJ_CONSTRUCT(&module->m_acc_lock, opal_mutex_t); - OBJ_CONSTRUCT(&module->m_pending_sendreqs, opal_list_t); - OBJ_CONSTRUCT(&module->m_copy_pending_sendreqs, opal_list_t); - OBJ_CONSTRUCT(&module->m_queued_sendreqs, opal_list_t); - OBJ_CONSTRUCT(&module->m_locks_pending, opal_list_t); - OBJ_CONSTRUCT(&module->m_unlocks_pending, opal_list_t); + /* initialize the objects, so that always free in cleanup */ + OBJ_CONSTRUCT(&module->lock, opal_mutex_t); + OBJ_CONSTRUCT(&module->cond, opal_condition_t); + OBJ_CONSTRUCT(&module->acc_lock, opal_mutex_t); + OBJ_CONSTRUCT(&module->queued_frags, opal_list_t); + OBJ_CONSTRUCT(&module->locks_pending, opal_list_t); + OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t); + OBJ_CONSTRUCT(&module->request_gc, opal_list_t); + OBJ_CONSTRUCT(&module->pending_acc, opal_list_t); - module->m_win = win; + /* options */ + /* FIX ME: should actually check this value... 
*/ +#if 1 + module->accumulate_ordering = 1; +#else + ompi_osc_base_config_value_equal("accumulate_ordering", info, "none"); +#endif - OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); - module->m_sequence_number = (mca_osc_rdma_component.c_sequence_number++); - OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); + /* fill in our part */ + if (MPI_WIN_FLAVOR_ALLOCATE == flavor && size) { + module->free_after = *base = malloc(size); + if (NULL == *base) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + } - ret = ompi_comm_dup(comm, &module->m_comm); - if (ret != OMPI_SUCCESS) goto cleanup; + /* in the dynamic case base is MPI_BOTTOM */ + if (MPI_WIN_FLAVOR_DYNAMIC != flavor) { + module->baseptr = *base; + } - opal_output_verbose(1, ompi_osc_base_framework.framework_output, - "rdma component creating window with id %d", - ompi_comm_get_cid(module->m_comm)); + ret = ompi_comm_dup(comm, &module->comm); + if (OMPI_SUCCESS != ret) goto cleanup; - asprintf(&tmp, "%d", ompi_comm_get_cid(module->m_comm)); - ompi_win_set_name(win, tmp); - free(tmp); + OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, + "rdma component creating window with id %d", + ompi_comm_get_cid(module->comm))); - module->m_num_pending_sendreqs = (unsigned int*) - malloc(sizeof(unsigned int) * ompi_comm_size(module->m_comm)); - if (NULL == module->m_num_pending_sendreqs) { + /* record my displacement unit. Always resolved at target */ + module->disp_unit = disp_unit; + + /* peer data */ + module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_rdma_peer_t)); + if (NULL == module->peers) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; } - memset(module->m_num_pending_sendreqs, 0, - sizeof(unsigned int) * ompi_comm_size(module->m_comm)); - module->m_num_pending_out = 0; - module->m_num_pending_in = 0; - module->m_num_post_msgs = 0; - module->m_num_complete_msgs = 0; - module->m_tag_counter = 0; - - module->m_copy_num_pending_sendreqs = (unsigned int*) - malloc(sizeof(unsigned int) * ompi_comm_size(module->m_comm)); - if (NULL == module->m_copy_num_pending_sendreqs) { + /* peer op count data */ + module->epoch_outgoing_frag_count = calloc (ompi_comm_size(comm), sizeof(uint32_t)); + if (NULL == module->epoch_outgoing_frag_count) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; } - memset(module->m_num_pending_sendreqs, 0, - sizeof(unsigned int) * ompi_comm_size(module->m_comm)); - module->m_eager_send_ok = check_config_value_bool("eager_send", info); + if (!no_locks) { + module->passive_incoming_frag_count = calloc(ompi_comm_size(comm), sizeof(uint32_t)); + if (NULL == module->passive_incoming_frag_count) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + + module->passive_incoming_frag_signal_count = calloc(ompi_comm_size(comm), sizeof(uint32_t)); + if (NULL == module->passive_incoming_frag_signal_count) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } + } + /* initially, we're in that pseudo-fence state, so we allow eager sends (yay for Fence). Other protocols will disable before they start their epochs, so this isn't a problem. 
*/ - module->m_eager_send_active = module->m_eager_send_ok; + module->active_eager_send_active = true; - /* allocate space for rdma information */ - module->m_use_rdma = check_config_value_bool("use_rdma", info); - module->m_rdma_wait_completion = check_config_value_bool("rdma_completion_wait", info); - module->m_setup_info = NULL; - module->m_peer_info = NULL; - - /* buffer setup */ - module->m_use_buffers = check_config_value_bool("use_buffers", info); - module->m_pending_buffers = (ompi_osc_rdma_buffer_t *) malloc(sizeof(ompi_osc_rdma_buffer_t) * - ompi_comm_size(module->m_comm)); - memset(module->m_pending_buffers, 0, - sizeof(ompi_osc_rdma_buffer_t) * ompi_comm_size(module->m_comm)); - - /* fence data */ - module->m_fence_coll_counts = (int*) - malloc(sizeof(int) * ompi_comm_size(module->m_comm)); - if (NULL == module->m_fence_coll_counts) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - module->m_fence_coll_counts[i] = 1; - } - - /* pwsc data */ - module->m_pw_group = NULL; - module->m_sc_group = NULL; - module->m_sc_remote_active_ranks = (bool*) - malloc(sizeof(bool) * ompi_comm_size(module->m_comm)); - if (NULL == module->m_sc_remote_active_ranks) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - module->m_sc_remote_ranks = (int*) - malloc(sizeof(int) * ompi_comm_size(module->m_comm)); - if (NULL == module->m_sc_remote_ranks) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; + if (!no_locks) { + module->passive_eager_send_active = malloc(sizeof(bool) * ompi_comm_size(comm)); + if (NULL == module->passive_eager_send_active) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto cleanup; + } } /* lock data */ - module->m_lock_status = 0; - module->m_shared_count = 0; - module->m_lock_received_ack = 0; - - /* update component data */ - OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); - opal_hash_table_set_value_uint32(&mca_osc_rdma_component.c_modules, - ompi_comm_get_cid(module->m_comm), - module); - OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); - if (OMPI_SUCCESS != ret) goto cleanup; - - /* fill in window information */ - win->w_osc_module = (ompi_osc_base_module_t*) module; if (check_config_value_bool("no_locks", info)) { win->w_flags |= OMPI_WIN_NO_LOCKS; } - /* register to receive fragment callbacks, if not already done */ - OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); - if (!mca_osc_rdma_component.c_btl_registered) { - mca_osc_rdma_component.c_btl_registered = true; - ret = mca_bml.bml_register(MCA_BTL_TAG_OSC_RDMA, - component_fragment_cb, - NULL); - } - OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); + /* update component data */ + OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); + ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules, + ompi_comm_get_cid(module->comm), + module); + OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); if (OMPI_SUCCESS != ret) goto cleanup; + /* fill in window information */ + *model = MPI_WIN_UNIFIED; + win->w_osc_module = (ompi_osc_base_module_t*) module; + asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm)); + ompi_win_set_name(win, name); + free(name); + /* sync memory - make sure all initialization completed */ opal_atomic_mb(); - if (module->m_use_rdma) { - /* fill in rdma information - involves barrier semantics */ - ret = setup_rdma(module); - } else { - /* barrier to prevent arrival of lock requests before we're - fully created */ - ret = module->m_comm->c_coll.coll_barrier(module->m_comm, - 
module->m_comm->c_coll.coll_barrier_module); + module->incomming_buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t)); + if (OPAL_UNLIKELY(NULL == module->incomming_buffer)) { + goto cleanup; } + + ret = ompi_osc_rdma_frag_start_receive (module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto cleanup; + } + + /* barrier to prevent arrival of lock requests before we're + fully created */ + ret = module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); if (OMPI_SUCCESS != ret) goto cleanup; - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "done creating window %d", ompi_comm_get_cid(module->m_comm))); + if (!mca_osc_rdma_component.progress_enable) { + opal_progress_register (component_progress); + mca_osc_rdma_component.progress_enable = true; + } + + OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, + "done creating rdma window %d", ompi_comm_get_cid(module->comm))); return OMPI_SUCCESS; cleanup: - OBJ_DESTRUCT(&module->m_unlocks_pending); - OBJ_DESTRUCT(&module->m_locks_pending); - OBJ_DESTRUCT(&module->m_queued_sendreqs); - OBJ_DESTRUCT(&module->m_copy_pending_sendreqs); - OBJ_DESTRUCT(&module->m_pending_sendreqs); - OBJ_DESTRUCT(&module->m_acc_lock); - OBJ_DESTRUCT(&module->m_cond); - OBJ_DESTRUCT(&module->m_lock); - - if (NULL != module->m_sc_remote_ranks) { - free(module->m_sc_remote_ranks); - } - if (NULL != module->m_sc_remote_active_ranks) { - free(module->m_sc_remote_active_ranks); - } - if (NULL != module->m_fence_coll_counts) { - free(module->m_fence_coll_counts); - } - if (NULL != module->m_copy_num_pending_sendreqs) { - free(module->m_copy_num_pending_sendreqs); - } - if (NULL != module->m_num_pending_sendreqs) { - free(module->m_num_pending_sendreqs); - } - if (NULL != module->m_peer_info) { - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - ompi_osc_rdma_peer_info_free(&module->m_peer_info[i]); - } - free(module->m_peer_info); - } - if (NULL != module->m_comm) ompi_comm_free(&module->m_comm); - if (NULL != module) free(module); + ompi_osc_rdma_free (win); return ret; } -/* dispatch for callback on message completion */ -static void -component_fragment_cb(struct mca_btl_base_module_t *btl, - mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t *descriptor, - void *cbdata) -{ - int ret; - ompi_osc_rdma_module_t *module; - void *payload; - bool done = false; - ompi_osc_rdma_base_header_t *base_header = - (ompi_osc_rdma_base_header_t*) descriptor->des_dst[0].seg_addr.pval; - - assert(descriptor->des_dst[0].seg_len >= - sizeof(ompi_osc_rdma_base_header_t)); - - /* handle message */ - while (!done) { - switch (base_header->hdr_type) { - case OMPI_OSC_RDMA_HDR_PUT: - { - ompi_osc_rdma_send_header_t *header; - - /* get our header and payload */ - header = (ompi_osc_rdma_send_header_t*) base_header; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_SEND_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - if (!ompi_win_exposure_epoch(module->m_win)) { - if (OMPI_WIN_FENCE & ompi_win_get_mode(module->m_win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(module->m_win, - OMPI_WIN_FENCE | - OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - } - - ret = ompi_osc_rdma_sendreq_recv_put(module, 
header, &payload); - } - break; - - case OMPI_OSC_RDMA_HDR_ACC: - { - ompi_osc_rdma_send_header_t *header; - - /* get our header and payload */ - header = (ompi_osc_rdma_send_header_t*) base_header; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_SEND_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - if (!ompi_win_exposure_epoch(module->m_win)) { - if (OMPI_WIN_FENCE & ompi_win_get_mode(module->m_win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(module->m_win, - OMPI_WIN_FENCE | - OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - } - - /* receive into temporary buffer */ - ret = ompi_osc_rdma_sendreq_recv_accum(module, header, &payload); - } - break; - - case OMPI_OSC_RDMA_HDR_GET: - { - ompi_datatype_t *datatype; - ompi_osc_rdma_send_header_t *header; - ompi_osc_rdma_replyreq_t *replyreq; - ompi_proc_t *proc; - - /* get our header and payload */ - header = (ompi_osc_rdma_send_header_t*) base_header; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_SEND_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - if (!ompi_win_exposure_epoch(module->m_win)) { - if (OMPI_WIN_FENCE & ompi_win_get_mode(module->m_win)) { - /* well, we're definitely in an access epoch now */ - ompi_win_set_mode(module->m_win, - OMPI_WIN_FENCE | - OMPI_WIN_ACCESS_EPOCH | - OMPI_WIN_EXPOSE_EPOCH); - } - } - - /* create or get a pointer to our datatype */ - proc = ompi_comm_peer_lookup( module->m_comm, header->hdr_origin ); - datatype = ompi_osc_base_datatype_create(proc, &payload); - - if (NULL == datatype) { - opal_output(ompi_osc_base_framework.framework_output, - "Error recreating datatype. 
Aborting."); - ompi_mpi_abort(module->m_comm, 1, false); - } - - /* create replyreq sendreq */ - ret = ompi_osc_rdma_replyreq_alloc_init(module, - header->hdr_origin, - header->hdr_origin_sendreq, - header->hdr_target_disp, - header->hdr_target_count, - datatype, - &replyreq); - - /* send replyreq */ - ompi_osc_rdma_replyreq_send(module, replyreq); - - /* sendreq does the right retain, so we can release safely */ - OBJ_RELEASE(datatype); - } - break; - - case OMPI_OSC_RDMA_HDR_REPLY: - { - ompi_osc_rdma_reply_header_t *header; - ompi_osc_rdma_sendreq_t *sendreq; - - /* get our header and payload */ - header = (ompi_osc_rdma_reply_header_t*) base_header; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_REPLY_HDR_NTOH(*header); - } -#endif - - /* get original sendreq pointer */ - sendreq = (ompi_osc_rdma_sendreq_t*) header->hdr_origin_sendreq.pval; - module = sendreq->req_module; - - /* receive data */ - ompi_osc_rdma_replyreq_recv(module, sendreq, header, &payload); - } - break; - case OMPI_OSC_RDMA_HDR_POST: - { - ompi_osc_rdma_control_header_t *header = - (ompi_osc_rdma_control_header_t*) base_header; - int32_t count; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_CONTROL_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - OPAL_THREAD_LOCK(&module->m_lock); - count = (module->m_num_post_msgs -= 1); - OPAL_THREAD_UNLOCK(&module->m_lock); - if (count == 0) { - module->m_eager_send_active = module->m_eager_send_ok; - - while (module->m_eager_send_active && - opal_list_get_size(&module->m_pending_sendreqs)) { - ompi_osc_rdma_sendreq_t *sendreq; - - OPAL_THREAD_LOCK(&module->m_lock); - sendreq = (ompi_osc_rdma_sendreq_t*) - opal_list_remove_first(&module->m_pending_sendreqs); - - if (NULL == sendreq) { - OPAL_THREAD_UNLOCK(&module->m_lock); - break; - } - - sendreq->req_module->m_num_pending_out += 1; - OPAL_THREAD_UNLOCK(&module->m_lock); - - ret = ompi_osc_rdma_sendreq_send(module, sendreq); - - if (OMPI_SUCCESS != ret) { - OPAL_THREAD_LOCK(&module->m_lock); - sendreq->req_module->m_num_pending_out -= 1; - opal_list_append(&(module->m_pending_sendreqs), - (opal_list_item_t*) sendreq); - OPAL_THREAD_UNLOCK(&module->m_lock); - break; - } - } - - opal_condition_broadcast(&module->m_cond); - } - } - break; - - case OMPI_OSC_RDMA_HDR_COMPLETE: - { - ompi_osc_rdma_control_header_t *header = - (ompi_osc_rdma_control_header_t*) base_header; - int32_t count; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_CONTROL_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - /* we've heard from one more place, and have value reqs to - process */ - OPAL_THREAD_LOCK(&module->m_lock); - count = (module->m_num_complete_msgs -= 1); - count += (module->m_num_pending_in += header->hdr_value[0]); - OPAL_THREAD_UNLOCK(&module->m_lock); - - if (count == 0) opal_condition_broadcast(&module->m_cond); - } - break; - - case OMPI_OSC_RDMA_HDR_LOCK_REQ: - { - ompi_osc_rdma_control_header_t *header = - 
(ompi_osc_rdma_control_header_t*) base_header; - int32_t count; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_CONTROL_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - if (header->hdr_value[1] > 0) { - ompi_osc_rdma_passive_lock(module, header->hdr_value[0], - header->hdr_value[1]); - } else { - OPAL_THREAD_LOCK(&module->m_lock); - count = (module->m_lock_received_ack += 1); - OPAL_THREAD_UNLOCK(&module->m_lock); - - if (count != 0) opal_condition_broadcast(&module->m_cond); - } - } - break; - - case OMPI_OSC_RDMA_HDR_UNLOCK_REQ: - { - ompi_osc_rdma_control_header_t *header = - (ompi_osc_rdma_control_header_t*) base_header; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_CONTROL_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - ompi_osc_rdma_passive_unlock(module, header->hdr_value[0], - header->hdr_value[1]); - } - break; - - case OMPI_OSC_RDMA_HDR_UNLOCK_REPLY: - { - ompi_osc_rdma_control_header_t *header = - (ompi_osc_rdma_control_header_t*) base_header; - int32_t count; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_CONTROL_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - OPAL_THREAD_LOCK(&module->m_lock); - count = (module->m_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&module->m_lock); - if (count == 0) opal_condition_broadcast(&module->m_cond); - } - break; - - case OMPI_OSC_RDMA_HDR_RDMA_COMPLETE: - { - ompi_osc_rdma_control_header_t *header = - (ompi_osc_rdma_control_header_t*) base_header; - int32_t count; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_CONTROL_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - OPAL_THREAD_LOCK(&module->m_lock); - count = (module->m_num_pending_in -= header->hdr_value[0]); - OPAL_THREAD_UNLOCK(&module->m_lock); - if (count == 0) opal_condition_broadcast(&module->m_cond); - } - break; - - case OMPI_OSC_RDMA_HDR_RDMA_INFO: - { - ompi_osc_rdma_rdma_info_header_t *header = - (ompi_osc_rdma_rdma_info_header_t*) base_header; - ompi_proc_t *proc = NULL; - mca_bml_base_endpoint_t *endpoint = NULL; - mca_bml_base_btl_t *bml_btl; - ompi_osc_rdma_btl_t *rdma_btl; - int origin, index; - payload = (void*) (header + 1); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_RDMA_INFO_HDR_NTOH(*header); - } -#endif - - /* get our module pointer */ - module = ompi_osc_rdma_windx_to_module(header->hdr_windx); - if (NULL == module) return; - - origin = header->hdr_origin; - - /* find the bml_btl */ - proc = ompi_comm_peer_lookup(module->m_comm, origin); - endpoint = (mca_bml_base_endpoint_t*) 
proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - bml_btl = mca_bml_base_btl_array_find(&endpoint->btl_rdma, btl); - if (NULL == bml_btl) { - opal_output(ompi_osc_base_framework.framework_output, - "received rdma info for unknown btl from rank %d", - origin); - return; - } else { - OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, - "received rdma info from rank %d for BTL %s", - origin, - bml_btl->btl-> - btl_component->btl_version. - mca_component_name)); - } - - OPAL_THREAD_LOCK(&module->m_lock); - index = module->m_peer_info[origin].peer_num_btls++; - rdma_btl = &(module->m_peer_info[origin].peer_btls[index]); - - memmove (rdma_btl->peer_seg, header + 1, bml_btl->btl->btl_seg_size); - - rdma_btl->bml_btl = bml_btl; - rdma_btl->rdma_order = MCA_BTL_NO_ORDER; - rdma_btl->num_sent = 0; - - module->m_setup_info->num_btls_callin++; - OPAL_THREAD_UNLOCK(&module->m_lock); - - opal_condition_broadcast(&module->m_cond); - } - break; - - case OMPI_OSC_RDMA_HDR_MULTI_END: - payload = base_header; - done = true; - break; - - default: - /* BWB - FIX ME - this sucks */ - opal_output(ompi_osc_base_framework.framework_output, - "received packet for Window with unknown type"); - } - - if ((base_header->hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_MULTI) != 0) { - /* The next header starts at the next aligned address in - * the buffer. Therefore, check the hdr_flags to see if - * any extra alignment is necessary, and if so, pull value - * from the flags. */ - if (base_header->hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_ALIGN_MASK) { - payload = (char *)payload + (base_header->hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_ALIGN_MASK); - } - base_header = (ompi_osc_rdma_base_header_t*) payload; - } else { - done = true; - } - } -} - - int -ompi_osc_rdma_component_irecv(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int src, - int tag, - struct ompi_communicator_t *comm, - ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *cbdata) +ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info) { - int ret; - bool missed_callback; - ompi_request_complete_fn_t tmp; + ompi_osc_rdma_module_t *module = + (ompi_osc_rdma_module_t*) win->w_osc_module; - ret = MCA_PML_CALL(irecv(buf, count, datatype, - src, tag, comm, request)); - if (OMPI_SUCCESS != ret) return ret; - - /* lock the giant request mutex to update the callback data so - that the PML can't mark the request as complete while we're - updating the callback data, which means we can - deterministically ensure the callback is only fired once and - that we didn't miss it. */ - OPAL_THREAD_LOCK(&ompi_request_lock); - (*request)->req_complete_cb = callback; - (*request)->req_complete_cb_data = cbdata; - missed_callback = (*request)->req_complete; - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - if (missed_callback) { - tmp = (*request)->req_complete_cb; - (*request)->req_complete_cb = NULL; - tmp(*request); - } - - return OMPI_SUCCESS; + /* enforce collectiveness... 
*/ + return module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); } int -ompi_osc_rdma_component_isend(void *buf, - size_t count, - struct ompi_datatype_t *datatype, - int dest, - int tag, - struct ompi_communicator_t *comm, - ompi_request_t **request, - ompi_request_complete_fn_t callback, - void *cbdata) +ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used) { - int ret; - bool missed_callback; - ompi_request_complete_fn_t tmp; + ompi_info_t *info = OBJ_NEW(ompi_info_t); + if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - ret = MCA_PML_CALL(isend(buf, count, datatype, - dest, tag, MCA_PML_BASE_SEND_STANDARD, comm, request)); - if (OMPI_SUCCESS != ret) return ret; - - /* lock the giant request mutex to update the callback data so - that the PML can't mark the request as complete while we're - updating the callback data, which means we can - deterministically ensure the callback is only fired once and - that we didn't miss it. */ - OPAL_THREAD_LOCK(&ompi_request_lock); - (*request)->req_complete_cb = callback; - (*request)->req_complete_cb_data = cbdata; - missed_callback = (*request)->req_complete; - OPAL_THREAD_UNLOCK(&ompi_request_lock); - - if (missed_callback) { - tmp = (*request)->req_complete_cb; - (*request)->req_complete_cb = NULL; - tmp(*request); - } + *info_used = info; return OMPI_SUCCESS; } - -/*********** RDMA setup stuff ***********/ - - -struct peer_rdma_send_info_t{ - opal_list_item_t super; - ompi_osc_rdma_module_t *module; - ompi_proc_t *proc; - mca_bml_base_btl_t *bml_btl; - void *seg; -}; -typedef struct peer_rdma_send_info_t peer_rdma_send_info_t; -OBJ_CLASS_INSTANCE(peer_rdma_send_info_t, opal_list_item_t, NULL, NULL); - - -static void -rdma_send_info_send_complete(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - peer_rdma_send_info_t *peer_send_info = - (peer_rdma_send_info_t*) descriptor->des_cbdata; - - if (OMPI_SUCCESS == status) { - btl->btl_free(btl, descriptor); - - OPAL_THREAD_LOCK(&peer_send_info->module->m_lock); - peer_send_info->module->m_setup_info->num_btls_outgoing--; - OPAL_THREAD_UNLOCK(&peer_send_info->module->m_lock); - - opal_condition_broadcast(&(peer_send_info->module->m_cond)); - - OBJ_RELEASE(peer_send_info); - } else { - /* BWB - fix me */ - abort(); - } -} - -static int -rdma_send_info_send(ompi_osc_rdma_module_t *module, - peer_rdma_send_info_t *peer_send_info) -{ - int ret = OMPI_SUCCESS; - mca_bml_base_btl_t *bml_btl = NULL; - mca_btl_base_descriptor_t *descriptor = NULL; - ompi_osc_rdma_rdma_info_header_t *header = NULL; - size_t hdr_size; - - bml_btl = peer_send_info->bml_btl; - - hdr_size = sizeof(ompi_osc_rdma_rdma_info_header_t) + bml_btl->btl->btl_seg_size; - - mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, hdr_size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); - if (NULL == descriptor) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - /* verify at least enough space for header */ - if (descriptor->des_src[0].seg_len < hdr_size) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - - /* setup descriptor */ - descriptor->des_cbfunc = rdma_send_info_send_complete; - descriptor->des_cbdata = peer_send_info; - descriptor->des_src[0].seg_len = sizeof(ompi_osc_rdma_rdma_info_header_t); - - /* pack header */ - header = (ompi_osc_rdma_rdma_info_header_t*) descriptor->des_src[0].seg_addr.pval; - header->hdr_base.hdr_type = 
OMPI_OSC_RDMA_HDR_RDMA_INFO; - header->hdr_base.hdr_flags = 0; - header->hdr_origin = ompi_comm_rank(module->m_comm); - header->hdr_windx = ompi_comm_get_cid(module->m_comm); - - memmove (header + 1, peer_send_info->seg, bml_btl->btl->btl_seg_size); - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (peer_send_info->proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; - OMPI_OSC_RDMA_RDMA_INFO_HDR_HTON(*header); - } -#endif - - /* send fragment */ - ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA); - if (1 == ret) ret = OMPI_SUCCESS; - goto done; - - cleanup: - if (descriptor != NULL) { - mca_bml_base_free(bml_btl, descriptor); - } - - done: - return ret; -} - - -static bool -is_valid_rdma(mca_bml_base_btl_t *bml_btl) -{ - if ((bml_btl->btl->btl_put != NULL) && - (bml_btl->btl->btl_get != NULL) && - ((bml_btl->btl_flags & MCA_BTL_FLAGS_RDMA_MATCHED) == 0)) { - return true; - } - - return false; -} - - -static int -setup_rdma(ompi_osc_rdma_module_t *module) -{ - - uint64_t local; - uint64_t *remote = NULL; - MPI_Datatype ui64_type; - int ret = OMPI_SUCCESS; - int i; - -#if SIZEOF_LONG == 8 - ui64_type = MPI_LONG; -#else - ui64_type = MPI_LONG_LONG; -#endif - - /* create a setup info structure */ - module->m_setup_info = (ompi_osc_rdma_setup_info_t *) malloc(sizeof(ompi_osc_rdma_setup_info_t)); - if (NULL == module->m_setup_info) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - module->m_setup_info->num_btls_callin = 0; - module->m_setup_info->num_btls_expected = -1; - module->m_setup_info->num_btls_outgoing = 0; - module->m_setup_info->outstanding_btl_requests = - (opal_list_t *) malloc(sizeof(opal_list_t) * ompi_comm_size(module->m_comm)); - if (NULL == module->m_setup_info->outstanding_btl_requests) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - OBJ_CONSTRUCT(&(module->m_setup_info->outstanding_btl_requests[i]), - opal_list_t); - } - - /* create peer info array */ - module->m_peer_info = (ompi_osc_rdma_peer_info_t*) - malloc(sizeof(ompi_osc_rdma_peer_info_t) * - ompi_comm_size(module->m_comm)); - if (NULL == module->m_peer_info) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - memset(module->m_peer_info, 0, - sizeof(ompi_osc_rdma_peer_info_t) * ompi_comm_size(module->m_comm)); - - /* get number of btls to each peer, descriptors for the window for - each peer */ - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - ompi_proc_t *proc = ompi_comm_peer_lookup(module->m_comm, i); - ompi_osc_rdma_peer_info_t *peer_info = &module->m_peer_info[i]; - mca_bml_base_endpoint_t *endpoint = - (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - int num_avail = - mca_bml_base_btl_array_get_size(&endpoint->btl_rdma); - size_t j, size; - opal_convertor_t convertor; - - /* skip peer if heterogeneous */ - if (ompi_proc_local()->proc_arch != proc->proc_arch) { - continue; - } - - /* get a rough estimation of how many BTLs we'll be able to - use, and exit if the answer is none */ - for (j = 0 ; - j < mca_bml_base_btl_array_get_size(&endpoint->btl_rdma) ; - ++j) { - mca_bml_base_btl_t *bml_btl = - mca_bml_base_btl_array_get_index(&endpoint->btl_rdma, j); - if (!is_valid_rdma(bml_btl)) num_avail--; - } - if (0 == num_avail) continue; - - /* Allocate space for all the useable BTLs. 
They might not - all end up useable, if we can't pin memory for the btl or - the like. But the number of elements to start with should - be small and the number that fail the pin test should be - approximately 0, so this isn't too big of a waste */ - peer_info->peer_btls = (ompi_osc_rdma_btl_t*) - malloc(sizeof(ompi_osc_rdma_btl_t) * num_avail); - peer_info->local_btls = (mca_bml_base_btl_t**) - malloc(sizeof(mca_bml_base_btl_t*) * num_avail); - peer_info->local_registrations = (mca_mpool_base_registration_t**) - malloc(sizeof(mca_mpool_base_registration_t*) * num_avail); - peer_info->local_descriptors = (mca_btl_base_descriptor_t**) - malloc(sizeof(mca_btl_base_descriptor_t*) * num_avail); - if (NULL == peer_info->peer_btls || - NULL == peer_info->local_btls || - NULL == peer_info->local_registrations || - NULL == peer_info->local_descriptors) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - memset(peer_info->peer_btls, 0, - sizeof(ompi_osc_rdma_btl_t) * num_avail); - memset(peer_info->local_registrations, 0, - sizeof(mca_mpool_base_registration_t*) * num_avail); - memset(peer_info->local_descriptors, 0, - sizeof(mca_btl_base_descriptor_t*) * num_avail); - - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - /* Find all useable btls, try to do the descriptor thing for - them, and store all that information */ - for (j = 0 ; - j < mca_bml_base_btl_array_get_size(&endpoint->btl_rdma) ; - ++j) { - mca_bml_base_btl_t *bml_btl = - mca_bml_base_btl_array_get_index(&endpoint->btl_rdma, j); - mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool; - int index = peer_info->local_num_btls; - - if (!is_valid_rdma(bml_btl)) continue; - - if (NULL != btl_mpool) { - ret = btl_mpool->mpool_register(btl_mpool, module->m_win->w_baseptr, - module->m_win->w_size, 0, - &(peer_info->local_registrations[index])); - if (OMPI_SUCCESS != ret) continue; - } else { - peer_info->local_registrations[index] = NULL; - } - - size = module->m_win->w_size; - - opal_convertor_copy_and_prepare_for_send(proc->proc_convertor, - &(ompi_mpi_byte.dt.super), - module->m_win->w_size, - module->m_win->w_baseptr, - 0, - &convertor); - - mca_bml_base_prepare_dst(bml_btl, - peer_info->local_registrations[index], - &convertor, MCA_BTL_NO_ORDER, 0, &size, 0, - &peer_info->local_descriptors[index]); - - if (NULL == peer_info->local_descriptors[index]) { - if (NULL != peer_info->local_registrations[index]) { - btl_mpool->mpool_deregister(btl_mpool, - peer_info->local_registrations[index]); - } - opal_convertor_cleanup(&convertor); - continue; - } - - peer_info->local_btls[index] = bml_btl; - - opal_convertor_cleanup(&convertor); - - peer_info->local_num_btls++; - module->m_setup_info->num_btls_outgoing++; - } - - OBJ_DESTRUCT(&convertor); - } - - /* fill in information about remote peers */ - remote = (uint64_t *) malloc(sizeof(uint64_t) * ompi_comm_size(module->m_comm)); - if (NULL == remote) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - local = ompi_ptr_ptol(module->m_win->w_baseptr); - ret = module->m_comm->c_coll.coll_allgather(&local, 1, ui64_type, - remote, 1, ui64_type, - module->m_comm, - module->m_comm->c_coll.coll_allgather_module); - if (OMPI_SUCCESS != ret) goto cleanup; - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - module->m_peer_info[i].peer_base = remote[i]; - } - - local = module->m_win->w_size; - ret = module->m_comm->c_coll.coll_allgather(&local, 1, ui64_type, - remote, 1, ui64_type, - module->m_comm, - module->m_comm->c_coll.coll_allgather_module); - if (OMPI_SUCCESS 
!= ret) goto cleanup; - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - module->m_peer_info[i].peer_len = remote[i]; - } - - /* get number of btls we're expecting from everyone */ - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - remote[i] = module->m_peer_info[i].local_num_btls; - } - ret = module->m_comm->c_coll.coll_reduce_scatter(remote, - &local, - module->m_fence_coll_counts, - ui64_type, - MPI_SUM, - module->m_comm, - module->m_comm->c_coll.coll_reduce_scatter_module); - if (OMPI_SUCCESS != ret) goto cleanup; - module->m_setup_info->num_btls_expected = (int32_t)local; - /* end fill in information about remote peers */ - - /* send our contact info to everyone... */ - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - ompi_osc_rdma_peer_info_t *peer_info = &module->m_peer_info[i]; - int j; - - for (j = 0 ; j < peer_info->local_num_btls ; ++j) { - peer_rdma_send_info_t *peer_send_info = - OBJ_NEW(peer_rdma_send_info_t); - peer_send_info->module = module; - peer_send_info->proc = ompi_comm_peer_lookup(module->m_comm, i); - peer_send_info->bml_btl = peer_info->local_btls[j]; - peer_send_info->seg = (void *) peer_info->local_descriptors[j]->des_dst; - - ret = rdma_send_info_send(module, peer_send_info); - if (OMPI_SUCCESS != ret) { - opal_list_append(&(module->m_setup_info->outstanding_btl_requests[i]), - &peer_send_info->super); - } - } - } - - OPAL_THREAD_LOCK(&module->m_lock); - while ((module->m_setup_info->num_btls_outgoing != 0) || - (module->m_setup_info->num_btls_expected != - module->m_setup_info->num_btls_callin)) { - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - peer_rdma_send_info_t *peer_send_info = - (peer_rdma_send_info_t*) opal_list_remove_first(&module->m_setup_info->outstanding_btl_requests[i]); - if (NULL != peer_send_info) { - ret = rdma_send_info_send(module, peer_send_info); - if (OMPI_SUCCESS != ret) { - opal_list_append(&(module->m_setup_info->outstanding_btl_requests[i]), - &peer_send_info->super); - } - } - } - opal_condition_wait(&module->m_cond, &module->m_lock); - } - OPAL_THREAD_UNLOCK(&module->m_lock); - - ret = OMPI_SUCCESS; - - cleanup: - if (NULL != module->m_setup_info) { - if (NULL != module->m_setup_info->outstanding_btl_requests) { - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - OBJ_DESTRUCT(&(module->m_setup_info->outstanding_btl_requests[i])); - } - free(module->m_setup_info->outstanding_btl_requests); - } - free(module->m_setup_info); - } - if (NULL != remote) free(remote); - - return ret; -} - +OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_t, opal_list_item_t, NULL, NULL); diff --git a/ompi/mca/osc/rdma/osc_rdma_data_move.c b/ompi/mca/osc/rdma/osc_rdma_data_move.c index 6ffef7ba2b..1e2c137490 100644 --- a/ompi/mca/osc/rdma/osc_rdma_data_move.c +++ b/ompi/mca/osc/rdma/osc_rdma_data_move.c @@ -8,9 +8,10 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009-2011 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -21,18 +22,19 @@ #include "ompi_config.h" #include "osc_rdma.h" -#include "osc_rdma_sendreq.h" #include "osc_rdma_header.h" #include "osc_rdma_data_move.h" #include "osc_rdma_obj_convert.h" +#include "osc_rdma_frag.h" +#include "osc_rdma_request.h" +#include "opal/threads/condition.h" +#include "opal/threads/mutex.h" #include "opal/util/arch.h" #include "opal/util/output.h" #include "opal/sys/atomic.h" #include "opal/align.h" #include "ompi/mca/pml/pml.h" -#include "ompi/mca/bml/bml.h" -#include "ompi/mca/bml/base/base.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" @@ -40,1131 +42,440 @@ #include "ompi/op/op.h" #include "ompi/memchecker.h" -static inline int32_t -create_send_tag(ompi_osc_rdma_module_t *module) +/** + * struct osc_rdma_accumulate_data_t: + * + * @short Data associated with an in-progress accumulation operation. + */ +struct osc_rdma_accumulate_data_t { + ompi_osc_rdma_module_t* module; + void *target; + void *source; + size_t source_len; + ompi_proc_t *proc; + int count; + ompi_datatype_t *datatype; + ompi_op_t *op; + int request_count; +}; + + +/** + * osc_rdma_pending_acc_t: + * + * @short Keep track of accumulate and cswap operations that are + * waiting on the accumulate lock. + * + * @long Since accumulate operations may take several steps to + * complete we need to lock the accumulate lock until the operation + * is complete. While the lock is held it is possible that additional + * accumulate operations will arrive. This structure keeps track of + * those operations. + */ +struct osc_rdma_pending_acc_t { + opal_list_item_t super; + ompi_osc_rdma_header_t header; + int source; + void *data; + size_t data_len; + ompi_datatype_t *datatype; +}; +typedef struct osc_rdma_pending_acc_t osc_rdma_pending_acc_t; + +static void osc_rdma_pending_acc_constructor (osc_rdma_pending_acc_t *pending) { -#if OPAL_ENABLE_MULTI_THREADS && OPAL_HAVE_ATOMIC_CMPSET_32 - int32_t newval, oldval; + pending->data = NULL; + pending->datatype = NULL; +} + +static void osc_rdma_pending_acc_destructor (osc_rdma_pending_acc_t *pending) +{ + if (NULL != pending->data) { + free (pending->data); + } + + if (NULL != pending->datatype) { + OBJ_RELEASE(pending->datatype); + } +} + +OBJ_CLASS_DECLARATION(osc_rdma_pending_acc_t); +OBJ_CLASS_INSTANCE(osc_rdma_pending_acc_t, opal_list_item_t, + osc_rdma_pending_acc_constructor, osc_rdma_pending_acc_destructor); +/* end ompi_osc_rdma_pending_acc_t class */ + +/** + * datatype_buffer_length: + * + * @short Determine the buffer size needed to hold count elements of datatype.
+ * + * @param[in] datatype - Element type + * @param[in] count - Element count + * + * @returns buflen Buffer length needed to hold count elements of datatype + */ +static inline int datatype_buffer_length (ompi_datatype_t *datatype, int count) +{ + ompi_datatype_t *primitive_datatype = NULL; + uint32_t primitive_count; + size_t buflen; + + ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); + primitive_count *= count; + + /* figure out how big a buffer we need */ + ompi_datatype_type_size(primitive_datatype, &buflen); + + return buflen * primitive_count; +} + +/** + * ompi_osc_rdma_control_send: + * + * @short Send a control message as part of a fragment + * + * @param[in] module - OSC RDMA module + * @param[in] target - Target peer's rank + * @param[in] data - Data to send + * @param[in] len - Length of data + * + * @returns error OMPI error code or OMPI_SUCCESS + * + * @long "send" a control message. Adds it to the active fragment, so the + * caller will still need to explicitly flush (either to everyone or + * to a target) before this is sent. + */ +int ompi_osc_rdma_control_send (ompi_osc_rdma_module_t *module, int target, + void *data, size_t len) +{ + ompi_osc_rdma_frag_t *frag; + char *ptr; + int ret; + + OPAL_THREAD_LOCK(&module->lock); + + ret = ompi_osc_rdma_frag_alloc(module, target, len, &frag, &ptr); + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + memcpy (ptr, data, len); + + ret = ompi_osc_rdma_frag_finish(module, frag); + } + + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + +static int ompi_osc_rdma_control_send_unbuffered_cb (ompi_request_t *request) +{ + mca_pml_base_send_request_t *pml_request = (mca_pml_base_send_request_t *) request; + ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data; + /* get the send address from the pml request */ + void *data_copy = pml_request->req_addr; + + /* mark this send as complete */ + mark_outgoing_completion (module); + + /* free the temporary buffer */ + free (data_copy); + + /* put this request on the garbage collection list */ + OPAL_THREAD_LOCK(&module->lock); + opal_list_append (&module->request_gc, (opal_list_item_t *) request); + OPAL_THREAD_UNLOCK(&module->lock); + + return OMPI_SUCCESS; +} + +/** + * ompi_osc_rdma_control_send_unbuffered: + * + * @short Send an unbuffered control message to a peer. + * + * @param[in] module - OSC RDMA module + * @param[in] target - Target rank + * @param[in] data - Data to send + * @param[in] len - Length of data + * + * @long Directly send a control message. This does not allocate a + * fragment, so should only be used when sending other messages would + * be erroneous (such as complete messages, when there may be queued + * transactions from an overlapping post that has already heard back + * from its peer). The buffer specified by data will be available + * when this call returns. + */ +int ompi_osc_rdma_control_send_unbuffered(ompi_osc_rdma_module_t *module, + int target, void *data, size_t len) +{ + unsigned char *data_copy; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: sending unbuffered fragment to %d", target)); + + /* allocate a temporary buffer for this send */ + data_copy = malloc (len); + if (OPAL_UNLIKELY(NULL == data_copy)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* increment outgoing signal count. this send is not part of a passive epoch + * so it would be erroneous to increment the epoch counters.
*/ + ompi_osc_signal_outgoing (module, MPI_PROC_NULL, 1); + + memcpy (data_copy, data, len); + + return ompi_osc_rdma_isend_w_cb (data_copy, len, MPI_BYTE, target, OSC_RDMA_FRAG_TAG, + module->comm, ompi_osc_rdma_control_send_unbuffered_cb, module); +} + +/** + * datatype_create: + * + * @short Utility function that creates a new datatype from a packed + * description. + * + * @param[in] module - OSC RDMA module + * @param[in] peer - Peer rank + * @param[out] datatype - New datatype. Must be released with OBJ_RELEASE. + * @param[out] proc - Optional. Proc for peer. + * @param[inout] data - Pointer to a pointer where the description is stored. This + * pointer will be updated to the location after the packed + * description. + */ +static inline int datatype_create (ompi_osc_rdma_module_t *module, int peer, ompi_proc_t **proc, ompi_datatype_t **datatype, void **data) +{ + ompi_datatype_t *new_datatype = NULL; + ompi_proc_t *peer_proc; + int ret = OMPI_SUCCESS; + do { - oldval = module->m_tag_counter; - newval = (oldval + 1) % mca_pml.pml_max_tag; - } while (0 == opal_atomic_cmpset_32(&module->m_tag_counter, oldval, newval)); - return newval; -#else - int32_t ret; - /* no compare and swap - have to lock the module */ - OPAL_THREAD_LOCK(&module->m_lock); - module->m_tag_counter = (module->m_tag_counter + 1) % mca_pml.pml_max_tag; - ret = module->m_tag_counter; - OPAL_THREAD_UNLOCK(&module->m_lock); + peer_proc = ompi_comm_peer_lookup(module->comm, peer); + if (OPAL_UNLIKELY(NULL == peer_proc)) { + OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, + "%d: datatype_create: could not resolve proc pointer for peer %d", + ompi_comm_rank(module->comm), + peer)); + ret = OMPI_ERROR; + break; + } + + new_datatype = ompi_osc_base_datatype_create(peer_proc, data); + if (OPAL_UNLIKELY(NULL == new_datatype)) { + OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, + "%d: datatype_create: could not resolve datatype for peer %d", + ompi_comm_rank(module->comm), peer)); + ret = OMPI_ERROR; + } + } while (0); + + *datatype = new_datatype; + if (proc) *proc = peer_proc; + return ret; -#endif } - -static inline void -inmsg_mark_complete(ompi_osc_rdma_module_t *module) -{ - int32_t count; - bool need_unlock = false; - - OPAL_THREAD_LOCK(&module->m_lock); - count = (module->m_num_pending_in -= 1); - if ((0 != module->m_lock_status) && - (opal_list_get_size(&module->m_unlocks_pending) != 0)) { - need_unlock = true; - } - OPAL_THREAD_UNLOCK(&module->m_lock); - - if (0 == count) { - if (need_unlock) ompi_osc_rdma_passive_unlock_complete(module); - opal_condition_broadcast(&module->m_cond); - } -} - -/********************************************************************** +/** + * process_put: * - * Multi-buffer support + * @short Process a put w/ data message * - **********************************************************************/ -static int -send_multi_buffer(ompi_osc_rdma_module_t *module, int rank) -{ - ompi_osc_rdma_base_header_t *header = (ompi_osc_rdma_base_header_t*) - ((char*) module->m_pending_buffers[rank].descriptor->des_src[0].seg_addr.pval + - module->m_pending_buffers[rank].descriptor->des_src[0].seg_len); - - header->hdr_type = OMPI_OSC_RDMA_HDR_MULTI_END; - header->hdr_flags = 0; - - module->m_pending_buffers[rank].descriptor->des_src[0].seg_len += - sizeof(ompi_osc_rdma_base_header_t); - mca_bml_base_send(module->m_pending_buffers[rank].bml_btl, - module->m_pending_buffers[rank].descriptor, - MCA_BTL_TAG_OSC_RDMA); - - module->m_pending_buffers[rank].descriptor = NULL;
- module->m_pending_buffers[rank].bml_btl = NULL; - module->m_pending_buffers[rank].remain_len = 0; - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_flush(ompi_osc_rdma_module_t *module) -{ - int i; - - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - if (module->m_pending_buffers[i].descriptor != NULL) { - send_multi_buffer(module, i); - } - } - - return OMPI_SUCCESS; -} - - -/********************************************************************** + * @param[in] module - OSC RDMA module + * @param[in] source - Message source + * @param[in] put_header - Message header + data * - * RDMA data transfers (put / get) - * - **********************************************************************/ -static void -rdma_cb(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) + * @long Process a put message and copy the message data to the specified + * memory region. Note, this function does not handle any bounds + * checking at the moment. + */ +static inline int process_put(ompi_osc_rdma_module_t* module, int source, + ompi_osc_rdma_header_put_t* put_header) { - ompi_osc_rdma_sendreq_t *sendreq = - (ompi_osc_rdma_sendreq_t*) descriptor->des_cbdata; - int32_t out_count, rdma_count; - - assert(OMPI_SUCCESS == status); - - OPAL_THREAD_LOCK(&sendreq->req_module->m_lock); - out_count = (sendreq->req_module->m_num_pending_out -= 1); - rdma_count = (sendreq->req_module->m_rdma_num_pending -= 1); - OPAL_THREAD_UNLOCK(&sendreq->req_module->m_lock); - - btl->btl_free(btl, descriptor); - ompi_osc_rdma_sendreq_free(sendreq); - - if ((0 == out_count) || (0 == rdma_count)) { - opal_condition_broadcast(&sendreq->req_module->m_cond); - } -} - - -static int -ompi_osc_rdma_sendreq_rdma(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t *sendreq) -{ - mca_btl_base_descriptor_t* descriptor; - ompi_osc_rdma_btl_t *rdma_btl = NULL; - mca_btl_base_module_t* btl; - size_t size = sendreq->req_origin_bytes_packed; - int index, target, ret; - - target = sendreq->req_target_rank; - - if (module->m_peer_info[target].peer_num_btls > 0) { - - index = ++(module->m_peer_info[target].peer_index_btls); - if (index >= module->m_peer_info[target].peer_num_btls) { - module->m_peer_info[target].peer_index_btls = 0; - index = 0; - } - - rdma_btl = &(module->m_peer_info[target].peer_btls[index]); - btl = rdma_btl->bml_btl->btl; - - if (sendreq->req_type == OMPI_OSC_RDMA_PUT) { - mca_bml_base_prepare_src(rdma_btl->bml_btl, NULL, - &sendreq->req_origin_convertor, rdma_btl->rdma_order, - 0, &size, 0, &descriptor); - - assert(NULL != descriptor); - - descriptor->des_dst = (mca_btl_base_segment_t *) sendreq->remote_segs; - descriptor->des_dst_cnt = 1; - memmove (descriptor->des_dst, rdma_btl->peer_seg, sizeof (rdma_btl->peer_seg)); - - descriptor->des_dst[0].seg_addr.lval = - module->m_peer_info[target].peer_base + - ((unsigned long)sendreq->req_target_disp * module->m_win->w_disp_unit); - descriptor->des_dst[0].seg_len = - sendreq->req_origin_bytes_packed; -#if 0 - opal_output(0, "putting to %d: 0x%lx(%d), %d, %d", - target, descriptor->des_dst[0].seg_addr.lval, - descriptor->des_dst[0].seg_len, - rdma_btl->rdma_order, - descriptor->order); -#endif - descriptor->des_cbdata = sendreq; - descriptor->des_cbfunc = rdma_cb; - - ret = btl->btl_put(btl, rdma_btl->bml_btl->btl_endpoint, - descriptor); - } else { - mca_bml_base_prepare_dst(rdma_btl->bml_btl, - NULL, &sendreq->req_origin_convertor, rdma_btl->rdma_order, - 0, &size, 0, &descriptor); 
- - assert(NULL != descriptor); - - descriptor->des_src = (mca_btl_base_segment_t *) sendreq->remote_segs; - descriptor->des_src_cnt = 1; - memmove (descriptor->des_src, rdma_btl->peer_seg, sizeof (rdma_btl->peer_seg)); - - descriptor->des_src[0].seg_addr.lval = - module->m_peer_info[target].peer_base + - ((unsigned long)sendreq->req_target_disp * module->m_win->w_disp_unit); - descriptor->des_src[0].seg_len = - sendreq->req_origin_bytes_packed; - - descriptor->des_cbdata = sendreq; - descriptor->des_cbfunc = rdma_cb; - - ret = btl->btl_get(btl, rdma_btl->bml_btl->btl_endpoint, - descriptor); - } - rdma_btl->rdma_order = descriptor->order; - - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } else { - OPAL_THREAD_LOCK(&module->m_lock); - rdma_btl->num_sent++; - sendreq->req_module->m_rdma_num_pending += 1; - OPAL_THREAD_UNLOCK(&module->m_lock); - } - } else { - return OMPI_ERR_NOT_SUPPORTED; - } - - return OMPI_SUCCESS; -} - - -/********************************************************************** - * - * Sending a sendreq to target - * - **********************************************************************/ -static int -ompi_osc_rdma_sendreq_send_long_cb(ompi_request_t *request) -{ - ompi_osc_rdma_longreq_t *longreq = - (ompi_osc_rdma_longreq_t*) request->req_complete_cb_data; - ompi_osc_rdma_sendreq_t *sendreq = longreq->req_basereq.req_sendreq; - int32_t count; + char *data = (char*) (put_header + 1); + ompi_proc_t *proc; + struct ompi_datatype_t *datatype; + size_t data_len; + void *target = (unsigned char*) module->baseptr + + ((unsigned long) put_header->displacement * module->disp_unit); + int ret; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d completed long sendreq to %d", - ompi_comm_rank(sendreq->req_module->m_comm), - sendreq->req_target_rank)); + "%d: process_put: received message from %d", + ompi_comm_rank(module->comm), + source)); - OPAL_THREAD_LOCK(&sendreq->req_module->m_lock); - count = (sendreq->req_module->m_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&sendreq->req_module->m_lock); + ret = datatype_create (module, source, &proc, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } - ompi_osc_rdma_longreq_free(longreq); - ompi_osc_rdma_sendreq_free(sendreq); + data_len = put_header->len - ((uintptr_t) data - (uintptr_t) put_header); - if (0 == count) opal_condition_broadcast(&sendreq->req_module->m_cond); + osc_rdma_copy_on_recv (target, data, data_len, proc, put_header->count, datatype); - ompi_request_free(&request); - return OMPI_SUCCESS; + OBJ_RELEASE(datatype); + + return put_header->len; } - -static void -ompi_osc_rdma_sendreq_send_cb(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) +static inline int process_put_long(ompi_osc_rdma_module_t* module, int source, + ompi_osc_rdma_header_put_t* put_header) { - ompi_osc_rdma_send_header_t *header = - (ompi_osc_rdma_send_header_t*) descriptor->des_src[0].seg_addr.pval; - ompi_osc_rdma_sendreq_t *sendreq = NULL; - ompi_osc_rdma_module_t *module = NULL; - int32_t count; - bool done = false; + char *data = (char*) (put_header + 1); + struct ompi_datatype_t *datatype; + void *target = (unsigned char*) module->baseptr + + ((unsigned long) put_header->displacement * module->disp_unit); + int ret; - if (OMPI_SUCCESS != status) { - /* requeue and return */ - /* BWB - FIX ME - figure out where to put this bad boy */ - abort(); - return; - } - - if 
(header->hdr_base.hdr_type == OMPI_OSC_RDMA_HDR_MULTI_END) { - done = true; - } - - while (!done) { - sendreq = (ompi_osc_rdma_sendreq_t*) header->hdr_origin_sendreq.pval; - module = sendreq->req_module; - - /* have to look at header, and not the sendreq because in the - case of get, it's possible that the sendreq has been freed - already (if the remote side replies before we get our send - completion callback) and already allocated to another - request. We don't wait for this completion before exiting - a synchronization point in the case of get, as we really - don't care when it completes - only when the data - arrives. */ - if (OMPI_OSC_RDMA_HDR_GET != header->hdr_base.hdr_type) { -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_SEND_HDR_NTOH(*header); - } -#endif - /* do we need to post a send? */ - if (header->hdr_msg_length != 0) { - /* sendreq is done. Mark it as so and get out of here */ - OPAL_THREAD_LOCK(&sendreq->req_module->m_lock); - count = sendreq->req_module->m_num_pending_out -= 1; - OPAL_THREAD_UNLOCK(&sendreq->req_module->m_lock); - ompi_osc_rdma_sendreq_free(sendreq); - if (0 == count) { - opal_condition_broadcast(&sendreq->req_module->m_cond); - } - } else { - ompi_osc_rdma_longreq_t *longreq; - ompi_osc_rdma_longreq_alloc(&longreq); - - longreq->req_basereq.req_sendreq = sendreq; - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d starting long sendreq to %d (%d)", - ompi_comm_rank(sendreq->req_module->m_comm), - sendreq->req_target_rank, - header->hdr_origin_tag)); - - ompi_osc_rdma_component_isend(sendreq->req_origin_convertor.pBaseBuf, - sendreq->req_origin_convertor.count, - sendreq->req_origin_datatype, - sendreq->req_target_rank, - header->hdr_origin_tag, - sendreq->req_module->m_comm, - &(longreq->request), - ompi_osc_rdma_sendreq_send_long_cb, - longreq); - } - } else { - ompi_osc_rdma_sendreq_free(sendreq); - } - - if (0 == (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_MULTI)) { - done = true; - } else { - /* Find starting point for next header. Note that the last part - * added in to compute the starting point for the next header is - * extra padding that may have been inserted. 
*/ - header = (ompi_osc_rdma_send_header_t*) - (((char*) header) + - sizeof(ompi_osc_rdma_send_header_t) + - ompi_datatype_pack_description_length(sendreq->req_target_datatype) + - header->hdr_msg_length + - (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_ALIGN_MASK)); - - if (header->hdr_base.hdr_type == OMPI_OSC_RDMA_HDR_MULTI_END) { - done = true; - } - } - } - - /* release the descriptor and sendreq */ - btl->btl_free(btl, descriptor); - - if (opal_list_get_size(&module->m_queued_sendreqs) > 0) { - opal_list_item_t *item; - int ret, i, len; - - len = opal_list_get_size(&module->m_queued_sendreqs); - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "%d items in restart queue", - len)); - for (i = 0 ; i < len ; ++i) { - OPAL_THREAD_LOCK(&module->m_lock); - item = opal_list_remove_first(&module->m_queued_sendreqs); - OPAL_THREAD_UNLOCK(&module->m_lock); - if (NULL == item) break; - - ret = ompi_osc_rdma_sendreq_send(module, (ompi_osc_rdma_sendreq_t*) item); - if (OMPI_SUCCESS != ret) { - OPAL_THREAD_LOCK(&module->m_lock); - opal_list_append(&(module->m_queued_sendreqs), item); - OPAL_THREAD_UNLOCK(&module->m_lock); - } - } - - /* flush so things actually get sent out and resources restored */ - ompi_osc_rdma_flush(module); - } -} - - -/* create the initial fragment, pack header, datatype, and payload (if - size fits) and send */ -int -ompi_osc_rdma_sendreq_send(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t *sendreq) -{ - int ret = OMPI_SUCCESS; - mca_bml_base_endpoint_t *endpoint = NULL; - mca_bml_base_btl_t *bml_btl = NULL; - mca_btl_base_module_t* btl = NULL; - mca_btl_base_descriptor_t *descriptor = NULL; - ompi_osc_rdma_send_header_t *header = NULL; - size_t written_data = 0; - size_t offset; - size_t needed_len = sizeof(ompi_osc_rdma_send_header_t); - const void *packed_ddt; - size_t packed_ddt_len, remain; - - if ((module->m_eager_send_active) && - (module->m_use_rdma) && - (ompi_datatype_is_contiguous_memory_layout(sendreq->req_target_datatype, - sendreq->req_target_count)) && - (!opal_convertor_need_buffers(&sendreq->req_origin_convertor)) && - (sendreq->req_type != OMPI_OSC_RDMA_ACC)) { - ret = ompi_osc_rdma_sendreq_rdma(module, sendreq); - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) return ret; - } - - /* we always need to send the ddt */ - packed_ddt_len = ompi_datatype_pack_description_length(sendreq->req_target_datatype); - needed_len += packed_ddt_len; - if (OMPI_OSC_RDMA_GET != sendreq->req_type) { - needed_len += sendreq->req_origin_bytes_packed; - } - - /* Reuse the buffer if: - * - The whole message will fit - * - The header and datatype will fit AND the payload would be long anyway - * Note that if the datatype is too big for an eager, we'll fall - * through and return an error out of the new buffer case */ - if ((module->m_pending_buffers[sendreq->req_target_rank].remain_len >= needed_len) || - ((sizeof(ompi_osc_rdma_send_header_t) + packed_ddt_len < - module->m_pending_buffers[sendreq->req_target_rank].remain_len) && - (needed_len > module->m_pending_buffers[sendreq->req_target_rank].bml_btl->btl->btl_eager_limit))) { - bml_btl = module->m_pending_buffers[sendreq->req_target_rank].bml_btl; - descriptor = module->m_pending_buffers[sendreq->req_target_rank].descriptor; - remain = module->m_pending_buffers[sendreq->req_target_rank].remain_len; - } else { - /* send the existing buffer */ - if (module->m_pending_buffers[sendreq->req_target_rank].descriptor) { - send_multi_buffer(module, sendreq->req_target_rank); - } - - /* get a buffer... 
*/ - endpoint = (mca_bml_base_endpoint_t*) sendreq->req_target_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); - btl = bml_btl->btl; - mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, - module->m_use_buffers ? btl->btl_eager_limit : - needed_len < btl->btl_eager_limit ? needed_len : - btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); - if (NULL == descriptor) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - /* verify at least enough space for header */ - if (descriptor->des_src[0].seg_len < sizeof(ompi_osc_rdma_send_header_t) + packed_ddt_len) { - ret = MPI_ERR_TRUNCATE; - goto cleanup; - } - - /* setup descriptor */ - descriptor->des_cbfunc = ompi_osc_rdma_sendreq_send_cb; - - module->m_pending_buffers[sendreq->req_target_rank].bml_btl = bml_btl; - module->m_pending_buffers[sendreq->req_target_rank].descriptor = descriptor; - module->m_pending_buffers[sendreq->req_target_rank].remain_len = descriptor->des_src[0].seg_len - sizeof(ompi_osc_rdma_base_header_t); - remain = module->m_pending_buffers[sendreq->req_target_rank].remain_len; - descriptor->des_src[0].seg_len = 0; - } - - /* pack header */ - header = (ompi_osc_rdma_send_header_t*) - ((char*) descriptor->des_src[0].seg_addr.pval + descriptor->des_src[0].seg_len); - written_data += sizeof(ompi_osc_rdma_send_header_t); - header->hdr_base.hdr_flags = 0; - header->hdr_windx = ompi_comm_get_cid(sendreq->req_module->m_comm); - header->hdr_origin = ompi_comm_rank(sendreq->req_module->m_comm); - header->hdr_origin_sendreq.pval = (void*) sendreq; - header->hdr_origin_tag = 0; - header->hdr_target_disp = sendreq->req_target_disp; - header->hdr_target_count = sendreq->req_target_count; - - switch (sendreq->req_type) { - case OMPI_OSC_RDMA_PUT: - header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_PUT; -#if OPAL_ENABLE_MEM_DEBUG - header->hdr_target_op = 0; -#endif - break; - - case OMPI_OSC_RDMA_ACC: - header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_ACC; - header->hdr_target_op = sendreq->req_op_id; - break; - - case OMPI_OSC_RDMA_GET: - header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_GET; -#if OPAL_ENABLE_MEM_DEBUG - header->hdr_target_op = 0; -#endif - sendreq->req_refcount++; - break; - } - - /* Set datatype id and / or pack datatype */ - ret = ompi_datatype_get_pack_description(sendreq->req_target_datatype, &packed_ddt); - if (OMPI_SUCCESS != ret) goto cleanup; - memcpy((unsigned char*) descriptor->des_src[0].seg_addr.pval + descriptor->des_src[0].seg_len + written_data, - packed_ddt, packed_ddt_len); - written_data += packed_ddt_len; - - if (OMPI_OSC_RDMA_GET != sendreq->req_type) { - /* if sending data and it fits, pack payload */ - if (remain >= written_data + sendreq->req_origin_bytes_packed) { - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = sendreq->req_origin_bytes_packed; - - iov.iov_len = max_data; - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) descriptor->des_src[0].seg_addr.pval + descriptor->des_src[0].seg_len + written_data); - - ret = opal_convertor_pack(&sendreq->req_origin_convertor, &iov, &iov_count, - &max_data ); - if (ret < 0) { - ret = OMPI_ERR_FATAL; - goto cleanup; - } - - written_data += max_data; - descriptor->des_src[0].seg_len += written_data; - - header->hdr_msg_length = sendreq->req_origin_bytes_packed; - } else { - descriptor->des_src[0].seg_len += written_data; - - header->hdr_msg_length = 0; - header->hdr_origin_tag = create_send_tag(module); - } - } else { - 
descriptor->des_src[0].seg_len += written_data; - header->hdr_msg_length = 0; - } - module->m_pending_buffers[sendreq->req_target_rank].remain_len -= written_data; - - if (module->m_use_buffers) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_MULTI; - - /* When putting multiple messages in a single buffer, the - * starting point for the next message needs to be aligned with - * pointer addresses. Therefore, the pointer, amount written - * and space remaining are adjusted forward so that the - * starting position for the next message is aligned properly. - * The amount of this alignment is embedded in the hdr_flags - * field so the callback completion and receiving side can - * also know how much to move the pointer to find the starting - * point of the next header. This strict alignment is - * required by certain platforms like SPARC. Without it, - * bus errors can occur. Keeping things aligned also may - * offer some performance improvements on other platforms. - */ - offset = OPAL_ALIGN_PAD_AMOUNT(descriptor->des_src[0].seg_len, sizeof(uint64_t)); - if (0 != offset) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_ALIGN_MASK & offset; - descriptor->des_src[0].seg_len += offset; - written_data += offset; - module->m_pending_buffers[sendreq->req_target_rank].remain_len -= offset; - } - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (sendreq->req_target_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; - OMPI_OSC_RDMA_SEND_HDR_HTON(*header); - } -#endif - - if (module->m_pending_buffers[sendreq->req_target_rank].remain_len < - sizeof(ompi_osc_rdma_send_header_t) + 128) { - /* not enough space left - send now */ - ret = send_multi_buffer(module, sendreq->req_target_rank); - } else { - ret = OMPI_SUCCESS; - } - - goto done; - } else { -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (sendreq->req_target_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; - OMPI_OSC_RDMA_SEND_HDR_HTON(*header); - } -#endif - - /* send fragment */ - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d sending sendreq to %d", - ompi_comm_rank(sendreq->req_module->m_comm), - sendreq->req_target_rank)); - - module->m_pending_buffers[sendreq->req_target_rank].bml_btl = NULL; - module->m_pending_buffers[sendreq->req_target_rank].descriptor = NULL; - module->m_pending_buffers[sendreq->req_target_rank].remain_len = 0; - - ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA); - if (1 == ret) ret = OMPI_SUCCESS; - goto done; - } - - cleanup: - if (descriptor != NULL) { - mca_bml_base_free(bml_btl, descriptor); - } - - done: - return ret; -} - - -/********************************************************************** - * - * Sending a replyreq back to origin - * - **********************************************************************/ -static int -ompi_osc_rdma_replyreq_send_long_cb(ompi_request_t *request) -{ - ompi_osc_rdma_longreq_t *longreq = - (ompi_osc_rdma_longreq_t*) request->req_complete_cb_data; - ompi_osc_rdma_replyreq_t *replyreq = longreq->req_basereq.req_replyreq; - - inmsg_mark_complete(replyreq->rep_module); - - ompi_osc_rdma_longreq_free(longreq); - ompi_osc_rdma_replyreq_free(replyreq); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -static void 
-ompi_osc_rdma_replyreq_send_cb(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - ompi_osc_rdma_replyreq_t *replyreq = - (ompi_osc_rdma_replyreq_t*) descriptor->des_cbdata; - ompi_osc_rdma_reply_header_t *header = - (ompi_osc_rdma_reply_header_t*) descriptor->des_src[0].seg_addr.pval; - - if (OMPI_SUCCESS != status) { - /* requeue and return */ - /* BWB - FIX ME - figure out where to put this bad boy */ - abort(); - return; - } - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (header->hdr_base.hdr_flags & OMPI_OSC_RDMA_HDR_FLAG_NBO) { - OMPI_OSC_RDMA_REPLY_HDR_NTOH(*header); - } -#endif - - /* do we need to post a send? */ - if (header->hdr_msg_length != 0) { - /* sendreq is done. Mark it as so and get out of here */ - inmsg_mark_complete(replyreq->rep_module); - ompi_osc_rdma_replyreq_free(replyreq); - } else { - ompi_osc_rdma_longreq_t *longreq; - ompi_osc_rdma_longreq_alloc(&longreq); - longreq->req_basereq.req_replyreq = replyreq; - - ompi_osc_rdma_component_isend(replyreq->rep_target_convertor.pBaseBuf, - replyreq->rep_target_convertor.count, - replyreq->rep_target_datatype, - replyreq->rep_origin_rank, - header->hdr_target_tag, - replyreq->rep_module->m_comm, - &(longreq->request), - ompi_osc_rdma_replyreq_send_long_cb, - longreq); - } - - /* release the descriptor and replyreq */ - btl->btl_free(btl, descriptor); -} - - -int -ompi_osc_rdma_replyreq_send(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_replyreq_t *replyreq) -{ - int ret = OMPI_SUCCESS; - mca_bml_base_endpoint_t *endpoint = NULL; - mca_bml_base_btl_t *bml_btl = NULL; - mca_btl_base_descriptor_t *descriptor = NULL; - ompi_osc_rdma_reply_header_t *header = NULL; - size_t written_data = 0; - - /* Get a BTL and a fragment to go with it */ - endpoint = (mca_bml_base_endpoint_t*) replyreq->rep_origin_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); - mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, - bml_btl->btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); - if (NULL == descriptor) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - /* verify at least enough space for header */ - if (descriptor->des_src[0].seg_len < sizeof(ompi_osc_rdma_reply_header_t)) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - - /* setup descriptor */ - descriptor->des_cbfunc = ompi_osc_rdma_replyreq_send_cb; - descriptor->des_cbdata = (void*) replyreq; - - /* pack header */ - header = (ompi_osc_rdma_reply_header_t*) descriptor->des_src[0].seg_addr.pval; - written_data += sizeof(ompi_osc_rdma_reply_header_t); - header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_REPLY; - header->hdr_base.hdr_flags = 0; - header->hdr_origin_sendreq = replyreq->rep_origin_sendreq; - header->hdr_target_tag = 0; - - /* if sending data fits, pack payload */ - if (descriptor->des_src[0].seg_len >= - written_data + replyreq->rep_target_bytes_packed) { - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = replyreq->rep_target_bytes_packed; - - iov.iov_len = max_data; - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*) descriptor->des_src[0].seg_addr.pval + written_data); - - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &replyreq->rep_target_convertor); - ); - ret = opal_convertor_pack(&replyreq->rep_target_convertor, &iov, &iov_count, - &max_data ); - MEMCHECKER( - 
memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &replyreq->rep_target_convertor); - ); - - if (ret < 0) { - ret = OMPI_ERR_FATAL; - goto cleanup; - } - - assert(max_data == replyreq->rep_target_bytes_packed); - written_data += max_data; - descriptor->des_src[0].seg_len = written_data; - - header->hdr_msg_length = replyreq->rep_target_bytes_packed; - } else { - header->hdr_msg_length = 0; - header->hdr_target_tag = create_send_tag(module); - } - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (replyreq->rep_origin_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; - OMPI_OSC_RDMA_REPLY_HDR_HTON(*header); - } -#endif - - /* send fragment */ - ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA); - if (1 == ret) ret = OMPI_SUCCESS; - goto done; - - cleanup: - if (descriptor != NULL) { - mca_bml_base_free(bml_btl, descriptor); - } - - done: - return ret; -} - - -/********************************************************************** - * - * Receive a put on the target side - * - **********************************************************************/ -static int -ompi_osc_rdma_sendreq_recv_put_long_cb(ompi_request_t *request) -{ - ompi_osc_rdma_longreq_t *longreq = - (ompi_osc_rdma_longreq_t*) request->req_complete_cb_data; - - OBJ_RELEASE(longreq->req_datatype); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d finished receiving long put message", - ompi_comm_rank(longreq->req_module->m_comm))); + "%d: process_put_long: received message from %d", + ompi_comm_rank(module->comm), + source)); - inmsg_mark_complete(longreq->req_module); - ompi_osc_rdma_longreq_free(longreq); + ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } - ompi_request_free(&request); + ret = ompi_osc_rdma_component_irecv (module, target, + put_header->count, + datatype, source, + put_header->tag, + module->comm); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, + "%d: process_put_long: irecv error: %d", + ompi_comm_rank(module->comm), + ret)); + return OMPI_ERROR; + } + + OBJ_RELEASE(datatype); + + return put_header->len; +} + +/** + * osc_rdma_incomming_req_omplete: + * + * @short Completion callback for a send/receive associated with an access + * epoch. + * + * @param[in] request - PML request with an OSC RDMA module as the callback data. + * + * @long This function is called when a send or receive associated with an + * access epoch completes. When fired this function will increment the + * passive or active incoming count. + */ +static int osc_rdma_incomming_req_omplete (ompi_request_t *request) +{ + ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data; + /* we need the peer rank;
get it from the pml request */ + mca_pml_base_request_t *pml_request = (mca_pml_base_request_t *) request; + int rank = MPI_PROC_NULL; + + if (request->req_status.MPI_TAG & 0x01) { + rank = pml_request->req_peer; + } + + mark_incoming_completion (module, rank); + + /* put this request on the garbage collection list */ + OPAL_THREAD_LOCK(&module->lock); + opal_list_append (&module->request_gc, (opal_list_item_t *) request); + OPAL_THREAD_UNLOCK(&module->lock); return OMPI_SUCCESS; } - -int -ompi_osc_rdma_sendreq_recv_put(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_send_header_t *header, - void **inbuf) +/** + * @short Post a send to match the remote receive for a get operation. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source buffer + * @param[in] count - Number of elements in the source buffer + * @param[in] datatype - Type of source elements. + * @param[in] peer - Remote process that has the receive posted + * @param[in] tag - Tag for the send + * + * @long This function posts a send to match the receive posted as part + * of a get operation. When this send is complete the get is considered + * complete at the target (this process). + */ +static int osc_rdma_get_post_send (ompi_osc_rdma_module_t *module, void *source, int count, + ompi_datatype_t *datatype, int peer, int tag) { - int ret = OMPI_SUCCESS; - void *target = (unsigned char*) module->m_win->w_baseptr + - ((unsigned long)header->hdr_target_disp * module->m_win->w_disp_unit); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, header->hdr_origin ); - struct ompi_datatype_t *datatype = - ompi_osc_base_datatype_create(proc, inbuf); - - if (NULL == datatype) { - opal_output(ompi_osc_base_framework.framework_output, - "Error recreating datatype. Aborting."); - ompi_mpi_abort(module->m_comm, 1, false); - } - - if (header->hdr_msg_length > 0) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - ompi_proc_t *proc; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - /* initialize convertor */ - proc = ompi_comm_peer_lookup(module->m_comm, header->hdr_origin); - opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, - &(datatype->super), - header->hdr_target_count, - target, - 0, - &convertor); - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)*inbuf; - max_data = iov.iov_len; - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, &convertor); - ); - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data ); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, &convertor); - ); - OBJ_DESTRUCT(&convertor); - OBJ_RELEASE(datatype); - inmsg_mark_complete(module); - *inbuf = ((char*) *inbuf) + header->hdr_msg_length; - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d received put message from %d", - ompi_comm_rank(module->m_comm), - header->hdr_origin)); - - } else { - ompi_osc_rdma_longreq_t *longreq; - ompi_osc_rdma_longreq_alloc(&longreq); - longreq->req_datatype = datatype; - longreq->req_module = module; - - ompi_osc_rdma_component_irecv(target, - header->hdr_target_count, - datatype, - header->hdr_origin, - header->hdr_origin_tag, - module->m_comm, - &(longreq->request), - ompi_osc_rdma_sendreq_recv_put_long_cb, - longreq); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d started long recv put message from %d (%d)", - ompi_comm_rank(module->m_comm), - header->hdr_origin, -
header->hdr_origin_tag)); - } - - return ret; + return ompi_osc_rdma_isend_w_cb (source, count, datatype, peer, tag, module->comm, + osc_rdma_incomming_req_omplete, module); } - - - -/********************************************************************** +/** + * process_get: * - * Receive an accumulate on the target side + * @short Process a get message from a remote peer * - **********************************************************************/ - - -static int -ompi_osc_rdma_sendreq_recv_accum_long_cb(ompi_request_t *request) + * @param[in] module - OSC RDMA module + * @param[in] target - Peer process + * @param[in] get_header - Incoming message header + */ +static inline int process_get (ompi_osc_rdma_module_t* module, int target, + ompi_osc_rdma_header_get_t* get_header) { - ompi_osc_rdma_longreq_t *longreq = - (ompi_osc_rdma_longreq_t*) request->req_complete_cb_data; - ompi_osc_rdma_send_header_t *header = longreq->req_basereq.req_sendhdr; - void *payload = (void*) (header + 1); - ompi_osc_rdma_module_t *module = longreq->req_module; - unsigned char *target_buffer = - (unsigned char*) module->m_win->w_baseptr + - ((unsigned long)header->hdr_target_disp * module->m_win->w_disp_unit); + char *data = (char *) (get_header + 1); + struct ompi_datatype_t *datatype; + void *source = (unsigned char*) module->baseptr + + ((unsigned long) get_header->displacement * module->disp_unit); + int ret; - /* lock the window for accumulates */ - OPAL_THREAD_LOCK(&longreq->req_module->m_acc_lock); - - if (longreq->req_op == &ompi_mpi_op_replace.op) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - /* initialize convertor */ - opal_convertor_copy_and_prepare_for_recv(ompi_proc_local()->proc_convertor, - &(longreq->req_datatype->super), - header->hdr_target_count, - target_buffer, - 0, - &convertor); - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*) payload; - max_data = iov.iov_len; - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &convertor); - ); - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &convertor); - ); - OBJ_DESTRUCT(&convertor); - } else { - /* copy the data from the temporary buffer into the user window */ - (void)ompi_osc_base_process_op(target_buffer, - payload, - header->hdr_msg_length, - longreq->req_datatype, - header->hdr_target_count, - longreq->req_op); - } - - /* unlock the window for accumulates */ - OPAL_THREAD_UNLOCK(&longreq->req_module->m_acc_lock); - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d finished receiving long accum message from %d", - ompi_comm_rank(longreq->req_module->m_comm), - header->hdr_origin)); + "%d: process_get: received message from %d", + ompi_comm_rank(module->comm), + target)); - /* free the temp buffer */ - free(longreq->req_basereq.req_sendhdr); - - /* Release datatype & op */ - OBJ_RELEASE(longreq->req_datatype); - OBJ_RELEASE(longreq->req_op); - - inmsg_mark_complete(longreq->req_module); - - ompi_osc_rdma_longreq_free(longreq); - - ompi_request_free(&request); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_sendreq_recv_accum(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_send_header_t *header, - void **payload) -{ - int ret = OMPI_SUCCESS; - struct ompi_op_t *op = ompi_osc_base_op_create(header->hdr_target_op); - ompi_proc_t *proc = 
ompi_comm_peer_lookup( module->m_comm, header->hdr_origin ); - struct ompi_datatype_t *datatype = - ompi_osc_base_datatype_create(proc, payload); - - if (NULL == datatype) { - opal_output(ompi_osc_base_framework.framework_output, - "Error recreating datatype. Aborting."); - ompi_mpi_abort(module->m_comm, 1, false); + ret = datatype_create (module, target, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; } - if (header->hdr_msg_length > 0) { - unsigned char *target_buffer; + /* send get data */ + ret = osc_rdma_get_post_send (module, source, get_header->count, datatype, target, get_header->tag); - target_buffer = (unsigned char*) module->m_win->w_baseptr + - ((unsigned long)header->hdr_target_disp * module->m_win->w_disp_unit); + OBJ_RELEASE(datatype); - /* lock the window for accumulates */ - OPAL_THREAD_LOCK(&module->m_acc_lock); + return OMPI_SUCCESS == ret ? (int) get_header->len : ret; +} - if (op == &ompi_mpi_op_replace.op) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; +/** + * osc_rdma_accumulate_buffer: + * + * @short Accumulate data into the target buffer. + * + * @param[in] target - Target buffer + * @param[in] source - Source buffer + * @param[in] source_len - Length of source buffer in bytes + * @param[in] proc - Source proc + * @param[in] count - Number of elements in target buffer + * @param[in] datatype - Type of elements in target buffer + * @param[in] op - Operation to be performed + */ +static inline int osc_rdma_accumulate_buffer (void *target, void *source, size_t source_len, ompi_proc_t *proc, + int count, ompi_datatype_t *datatype, ompi_op_t *op) +{ + void *buffer = source; + int ret; - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); + assert (NULL != target && NULL != source); - /* initialize convertor */ - opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, - &(datatype->super), - header->hdr_target_count, - target_buffer, - 0, - &convertor); - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)*payload; - max_data = iov.iov_len; - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, &convertor); - ); - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, &convertor); - ); - OBJ_DESTRUCT(&convertor); - } else { - void *buffer = NULL; + if (op == &ompi_mpi_op_replace.op) { + osc_rdma_copy_on_recv (target, source, source_len, proc, count, datatype); + return OMPI_SUCCESS; + } #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch != ompi_proc_local()->proc_arch) { - opal_convertor_t convertor; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - ompi_datatype_t *primitive_datatype = NULL; - uint32_t primitive_count; - size_t buflen; - - ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); - primitive_count *= header->hdr_target_count; - - /* figure out how big a buffer we need */ - ompi_datatype_type_size(primitive_datatype, &buflen); - buflen *= primitive_count; - - /* create convertor */ - OBJ_CONSTRUCT(&convertor, opal_convertor_t); - - payload = (void*) malloc(buflen); - - /* initialize convertor */ - opal_convertor_copy_and_prepare_for_recv(proc->proc_convertor, - &(primitive_datatype->super), - primitive_count, - buffer, - 0, - &convertor); - - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)*payload; - max_data = iov.iov_len; 
- MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, &convertor); - ); - opal_convertor_unpack(&convertor, - &iov, - &iov_count, - &max_data); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, &convertor); - ); - OBJ_DESTRUCT(&convertor); - } else { - buffer = *payload; - } -#else - buffer = *payload; -#endif - /* copy the data from the temporary buffer into the user window */ - ret = ompi_osc_base_process_op(target_buffer, - buffer, - header->hdr_msg_length, - datatype, - header->hdr_target_count, - op); - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch != ompi_proc_local()->proc_arch) { - if (NULL == buffer) free(buffer); - } -#endif - } - - /* unlock the window for accumulates */ - OPAL_THREAD_UNLOCK(&module->m_acc_lock); - - /* Release datatype & op */ - OBJ_RELEASE(datatype); - OBJ_RELEASE(op); - - inmsg_mark_complete(module); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d received accum message from %d", - ompi_comm_rank(module->m_comm), - header->hdr_origin)); - *payload = ((char*) *payload) + header->hdr_msg_length; - - } else { - ompi_osc_rdma_longreq_t *longreq; - size_t buflen; - struct ompi_datatype_t *primitive_datatype = NULL; + if (proc->proc_arch != ompi_proc_local()->proc_arch) { + ompi_datatype_t *primitive_datatype = NULL; uint32_t primitive_count; + size_t buflen; - /* get underlying type... */ ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count); primitive_count *= header->hdr_target_count; @@ -1172,272 +483,1047 @@ ompi_osc_rdma_sendreq_recv_accum(ompi_osc_rdma_module_t *module, ompi_datatype_type_size(primitive_datatype, &buflen); buflen *= primitive_count; - /* get a longreq and fill it in */ - ompi_osc_rdma_longreq_alloc(&longreq); + buffer = malloc (buflen); + if (OPAL_UNLIKELY(NULL == buffer)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } - longreq->req_datatype = datatype; - longreq->req_op = op; - longreq->req_module = module; - - /* allocate a buffer to receive into ... */ - longreq->req_basereq.req_sendhdr = (ompi_osc_rdma_send_header_t *) malloc(buflen + sizeof(ompi_osc_rdma_send_header_t)); - - if (NULL == longreq->req_basereq.req_sendhdr) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - /* fill in tmp header */ - memcpy(longreq->req_basereq.req_sendhdr, header, - sizeof(ompi_osc_rdma_send_header_t)); - longreq->req_basereq.req_sendhdr->hdr_msg_length = buflen; - - ompi_osc_rdma_component_irecv(longreq->req_basereq.req_sendhdr + 1, - primitive_count, - primitive_datatype, - header->hdr_origin, - header->hdr_origin_tag, - module->m_comm, - &(longreq->request), - ompi_osc_rdma_sendreq_recv_accum_long_cb, - longreq); - - OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, - "%d started long recv accum message from %d (%d)", - ompi_comm_rank(module->m_comm), - header->hdr_origin, - header->hdr_origin_tag)); + osc_rdma_copy_on_recv (buffer, source, source_len, proc, count, datatype); } +#endif + + /* copy the data from the temporary buffer into the user window */ + ret = ompi_osc_base_process_op(target, buffer, source_len, datatype, + count, op); + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if (proc->proc_arch != ompi_proc_local()->proc_arch) { + free(buffer); + } +#endif + + return ret; +} + +/** + * @short Create an accumulate data object. + * + * @param[in] module - RDMA OSC module + * @param[in] target - Target for the accumulation + * @param[in] source - Source of accumulate data. 
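/*
 * Editor's aside, not part of the patch: osc_rdma_accumulate_buffer above treats
 * MPI_REPLACE as a plain copy into the window and hands every other op to the
 * reduction code.  A self-contained sketch of that shape for int buffers; the
 * enum and function names are hypothetical, not the OMPI op interface.
 */
#include <assert.h>
#include <string.h>

enum sketch_op { SKETCH_OP_REPLACE, SKETCH_OP_SUM };

static void sketch_accumulate (int *target, const int *source, int count, enum sketch_op op)
{
    if (SKETCH_OP_REPLACE == op) {
        /* replace is just a copy into the window memory */
        memcpy (target, source, count * sizeof (*target));
        return;
    }

    /* any other op is applied element by element into the window */
    for (int i = 0 ; i < count ; ++i) {
        target[i] += source[i];
    }
}

int main (void)
{
    int window[3]   = {1, 2, 3};
    int incoming[3] = {10, 20, 30};

    sketch_accumulate (window, incoming, 3, SKETCH_OP_SUM);
    assert (11 == window[0] && 22 == window[1] && 33 == window[2]);

    sketch_accumulate (window, incoming, 3, SKETCH_OP_REPLACE);
    assert (10 == window[0] && 20 == window[1] && 30 == window[2]);
    return 0;
}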
Must be allocated with malloc/calloc/etc + * @param[in] source_len - Length of the source buffer in bytes + * @param[in] proc - Source proc + * @param[in] count - Number of elements to accumulate + * @param[in] datatype - Datatype to accumulate + * @oaram[in] op - Operator + * @param[in] request_count - Number of prerequisite requests + * @param[out] acc_data_out - New accumulation data + * + * @long This function is used to create a copy of the data needed to perform an accumulation. + * This data should be provided to ompi_osc_rdma_isend_w_cb or ompi_osc_rdma_irecv_w_cb + * as the ctx parameter with accumulate_cb as the cb parameter. + */ +static int osc_rdma_accumulate_allocate (ompi_osc_rdma_module_t *module, void *target, void *source, size_t source_len, + ompi_proc_t *proc, int count, ompi_datatype_t *datatype, ompi_op_t *op, + int request_count, struct osc_rdma_accumulate_data_t **acc_data_out) +{ + struct osc_rdma_accumulate_data_t *acc_data; + + acc_data = malloc (sizeof (*acc_data)); + if (OPAL_UNLIKELY(NULL == acc_data)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + acc_data->module = module; + acc_data->target = target; + acc_data->source = source; + acc_data->source_len = source_len; + acc_data->proc = proc; + acc_data->count = count; + acc_data->datatype = datatype; + OBJ_RETAIN(datatype); + acc_data->op = op; + OBJ_RETAIN(op); + acc_data->request_count = request_count; + + *acc_data_out = acc_data; + + return OMPI_SUCCESS; +} + +static void osc_rdma_accumulate_free (struct osc_rdma_accumulate_data_t *acc_data) +{ + /* the source is always a temporary buffer */ + free (acc_data->source); + + OBJ_RELEASE(acc_data->datatype); + OBJ_RELEASE(acc_data->op); + + free (acc_data); +} + +/** + * @short Execute the accumulate once the request counter reaches 0. + * + * @param[in] request - request + * + * The request should be created with ompi_osc_rdma_isend_w_cb or ompi_osc_rdma_irecv_w_cb + * with ctx allocated by osc_rdma_accumulate_allocate. This callback will free the accumulate + * data once the accumulation operation is complete. 
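/*
 * Editor's aside, not part of the patch: the accumulate data object above
 * carries a prerequisite-request count, and the callback that follows only runs
 * the accumulation once every prerequisite request has completed.  A standalone
 * sketch of that countdown pattern; all names here are hypothetical, not OMPI
 * code (the real callback also handles locking and garbage collection).
 */
#include <assert.h>
#include <stdio.h>

struct countdown {
    int remaining;              /* prerequisite requests still outstanding */
    void (*action)(void *ctx);  /* runs when the last prerequisite completes */
    void *ctx;
};

static void countdown_one_complete (struct countdown *c)
{
    if (0 == --c->remaining) {
        c->action (c->ctx);
    }
}

static void do_accumulate (void *ctx)
{
    *(int *) ctx = 1;
    printf ("all prerequisites complete, accumulating now\n");
}

int main (void)
{
    int done = 0;
    struct countdown c = { .remaining = 2, .action = do_accumulate, .ctx = &done };

    countdown_one_complete (&c);   /* e.g. the receive of the origin data */
    assert (0 == done);
    countdown_one_complete (&c);   /* e.g. the send of the fetched target data */
    assert (1 == done);
    return 0;
}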
+ */ +static int accumulate_cb (ompi_request_t *request) +{ + struct osc_rdma_accumulate_data_t *acc_data = (struct osc_rdma_accumulate_data_t *) request->req_complete_cb_data; + int ret = OMPI_SUCCESS; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "accumulate_cb, request_count = %d", acc_data->request_count)); + + request->req_complete_cb_data = acc_data->module; + osc_rdma_incomming_req_omplete (request); + + --acc_data->request_count; + + if (0 != acc_data->request_count) { + /* more requests needed before the buffer can be accumulated */ + return OMPI_SUCCESS; + } + + if (acc_data->source) { + ret = osc_rdma_accumulate_buffer (acc_data->target, acc_data->source, acc_data->source_len, + acc_data->proc, acc_data->count, acc_data->datatype, acc_data->op); + } + + /* drop the accumulate lock */ + ompi_osc_rdma_accumulate_unlock (acc_data->module); + + osc_rdma_accumulate_free (acc_data); return ret; } -/********************************************************************** - * - * Recveive a get on the origin side - * - **********************************************************************/ -static int -ompi_osc_rdma_replyreq_recv_long_cb(ompi_request_t *request) +static int ompi_osc_rdma_acc_op_queue (ompi_osc_rdma_module_t *module, ompi_osc_rdma_header_t *header, int source, + char *data, size_t data_len, ompi_datatype_t *datatype) { - ompi_osc_rdma_longreq_t *longreq = - (ompi_osc_rdma_longreq_t*) request->req_complete_cb_data; - ompi_osc_rdma_sendreq_t *sendreq = - (ompi_osc_rdma_sendreq_t*) longreq->req_basereq.req_sendreq; - int32_t count; + osc_rdma_pending_acc_t *pending_acc; - OPAL_THREAD_LOCK(&sendreq->req_module->m_lock); - count = (sendreq->req_module->m_num_pending_out -= 1); - OPAL_THREAD_UNLOCK(&sendreq->req_module->m_lock); + pending_acc = OBJ_NEW(osc_rdma_pending_acc_t); + if (OPAL_UNLIKELY(NULL == pending_acc)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } - ompi_osc_rdma_longreq_free(longreq); - ompi_osc_rdma_sendreq_free(sendreq); + pending_acc->source = source; - if (0 == count) opal_condition_broadcast(&sendreq->req_module->m_cond); + /* save any inline data (eager acc, gacc only) */ + pending_acc->data_len = data_len; - ompi_request_free(&request); + if (data_len) { + pending_acc->data = malloc (data_len); + memcpy (pending_acc->data, data, data_len); + } + + /* save the datatype */ + pending_acc->datatype = datatype; + OBJ_RETAIN(datatype); + + /* save the header */ + switch (header->base.type) { + case OMPI_OSC_RDMA_HDR_TYPE_ACC: + case OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG: + pending_acc->header.acc = header->acc; + break; + case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC: + case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG: + pending_acc->header.get_acc = header->get_acc; + break; + case OMPI_OSC_RDMA_HDR_TYPE_CSWAP: + pending_acc->header.cswap = header->cswap; + break; + default: + /* it is a coding error if any other header types are queued this way */ + assert (0); + } + + /* add to the pending acc queue */ + OPAL_THREAD_LOCK(&module->lock); + opal_list_append (&module->pending_acc, &pending_acc->super); + OPAL_THREAD_UNLOCK(&module->lock); + + return OMPI_SUCCESS; +} + +static int replace_cb (ompi_request_t *request) +{ + ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data; + + /* unlock the accumulate lock */ + ompi_osc_rdma_accumulate_unlock (module); + + return OMPI_SUCCESS; +} + +/** + * ompi_osc_rdma_acc_start: + * + * @short Start an accumulate with data operation. 
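/*
 * Editor's aside, not part of the patch: incoming accumulate-style operations
 * either run immediately under the module's accumulate lock or are queued (via
 * ompi_osc_rdma_acc_op_queue above) and replayed later from the progress path.
 * A standalone sketch of that try-lock-or-queue shape; the types and the
 * trylock convention here are hypothetical, not OMPI code.
 */
#include <assert.h>
#include <stdbool.h>

#define SKETCH_MAX_PENDING 8

struct sketch_module {
    bool acc_lock_held;
    int  pending[SKETCH_MAX_PENDING];
    int  npending;
    int  applied;
};

static bool acc_trylock (struct sketch_module *m)
{
    if (m->acc_lock_held) {
        return false;
    }
    m->acc_lock_held = true;
    return true;
}

static void handle_acc (struct sketch_module *m, int value)
{
    if (acc_trylock (m)) {
        m->applied += value;        /* run the operation right away */
        m->acc_lock_held = false;   /* the real code unlocks when it completes */
    } else {
        m->pending[m->npending++] = value;   /* replay later from progress */
    }
}

int main (void)
{
    struct sketch_module m = { .acc_lock_held = false };

    handle_acc (&m, 5);             /* lock free: applied immediately */
    m.acc_lock_held = true;         /* pretend another accumulate is in flight */
    handle_acc (&m, 7);             /* lock busy: queued for later */

    assert (5 == m.applied && 1 == m.npending && 7 == m.pending[0]);
    return 0;
}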
+ * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] data - Accumulate data + * @param[in] data_len - Length of the accumulate data + * @param[in] datatype - Accumulation datatype + * @param[in] acc_header - Accumulate header + * + * The module's accumulation lock must be held before calling this + * function. It will release the lock when the operation is complete. + */ +static int ompi_osc_rdma_acc_start (ompi_osc_rdma_module_t *module, int source, void *data, size_t data_len, + ompi_datatype_t *datatype, ompi_osc_rdma_header_acc_t *acc_header) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) acc_header->displacement * module->disp_unit); + struct ompi_op_t *op = ompi_osc_base_op_create(acc_header->op); + ompi_proc_t *proc; + int ret; + + proc = ompi_comm_peer_lookup(module->comm, source); + assert (NULL != proc); + + ret = osc_rdma_accumulate_buffer (target, data, data_len, proc, acc_header->count, + datatype, op); + + OBJ_RELEASE(op); + + ompi_osc_rdma_accumulate_unlock (module); + + return ret; +} + +/** + * ompi_osc_rdma_acc_start: + * + * @short Start a long accumulate operation. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] datatype - Accumulation datatype + * @param[in] acc_header - Accumulate header + * + * The module's accumulation lock must be held before calling this + * function. It will release the lock when the operation is complete. + */ +static int ompi_osc_rdma_acc_long_start (ompi_osc_rdma_module_t *module, int source, ompi_datatype_t *datatype, + ompi_osc_rdma_header_acc_t *acc_header) { + struct osc_rdma_accumulate_data_t *acc_data; + size_t buflen; + void *buffer; + ompi_proc_t *proc; + void *target = (unsigned char*) module->baseptr + + ((unsigned long) acc_header->displacement * module->disp_unit); + struct ompi_op_t *op = ompi_osc_base_op_create(acc_header->op); + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_acc_long_start starting...")); + + proc = ompi_comm_peer_lookup(module->comm, source); + assert (NULL != proc); + + do { + if (op == &ompi_mpi_op_replace.op) { + ret = ompi_osc_rdma_irecv_w_cb (target, acc_header->count, datatype, source, + acc_header->tag, module->comm, NULL, + replace_cb, module); + break; + } + + buflen = datatype_buffer_length (datatype, acc_header->count); + + /* allocate a temporary buffer to receive the accumulate data */ + buffer = malloc (buflen); + if (OPAL_UNLIKELY(NULL == buffer)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + break; + } + + ret = osc_rdma_accumulate_allocate (module, target, buffer, buflen, proc, acc_header->count, + datatype, op, 1, &acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + free (buffer); + break; + } + + ret = ompi_osc_rdma_irecv_w_cb (buffer, acc_header->count, datatype, source, acc_header->tag, + module->comm, NULL, accumulate_cb, acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + osc_rdma_accumulate_free (acc_data); + } + } while (0); + + OBJ_RELEASE(op); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ompi_osc_rdma_accumulate_unlock (module); + } + + return ret; +} + +/** + * ompi_osc_rdma_gacc_start: + * + * @short Start a accumulate with data + get operation. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] data - Accumulate data. Must be allocated on the heap. 
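/*
 * Editor's aside, not part of the patch: every target address in these handlers
 * is computed as window base + displacement * displacement unit, as in
 * ompi_osc_rdma_acc_start above.  A standalone sketch of that arithmetic; the
 * helper name is hypothetical.
 */
#include <assert.h>
#include <stdint.h>

static void *window_target_addr (void *baseptr, uint64_t displacement, int disp_unit)
{
    return (unsigned char *) baseptr + (unsigned long) displacement * disp_unit;
}

int main (void)
{
    double window[16] = {0};

    /* displacement 3 with disp_unit == sizeof(double) lands on window[3] */
    void *target = window_target_addr (window, 3, (int) sizeof (double));
    assert (target == (void *) &window[3]);
    return 0;
}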
+ * @param[in] data_len - Length of the accumulate data + * @param[in] datatype - Accumulation datatype + * @param[in] get_acc_header - Accumulate header + * + * The module's accumulation lock must be held before calling this + * function. It will release the lock when the operation is complete. + */ +static int ompi_osc_rdma_gacc_start (ompi_osc_rdma_module_t *module, int source, void *data, size_t data_len, + ompi_datatype_t *datatype, ompi_osc_rdma_header_get_acc_t *get_acc_header) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) get_acc_header->displacement * module->disp_unit); + struct ompi_op_t *op = ompi_osc_base_op_create(get_acc_header->op); + struct osc_rdma_accumulate_data_t *acc_data; + ompi_proc_t *proc; + int ret; + + proc = ompi_comm_peer_lookup(module->comm, source); + assert (NULL != proc); + + do { + ret = osc_rdma_accumulate_allocate (module, target, data, data_len, proc, get_acc_header->count, + datatype, op, 1, &acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; + } + + ret = ompi_osc_rdma_isend_w_cb (target, get_acc_header->count, datatype, source, get_acc_header->tag, + module->comm, accumulate_cb, acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + osc_rdma_accumulate_free (acc_data); + } + } while (0); + + OBJ_RELEASE(op); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ompi_osc_rdma_accumulate_unlock (module); + } + + return ret; +} + +/** + * ompi_osc_rdma_gacc_long_start: + * + * @short Start a long accumulate + get operation. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] datatype - Accumulation datatype + * @param[in] get_acc_header - Accumulate header + * + * The module's accumulation lock must be held before calling this + * function. It will release the lock when the operation is complete. 
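/*
 * Editor's aside, not part of the patch: the *_start helpers above use a
 * do { ... } while (0) block so any failing step can break straight to shared
 * cleanup (releasing the op and, on error, dropping the accumulate lock)
 * without goto labels.  A standalone sketch of that control-flow shape; the
 * step functions and error codes are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

static int step_one (void)      { return 0; }                 /* 0 == success */
static int step_two (void *buf) { (void) buf; return -1; }    /* pretend failure */

static int start_operation (void)
{
    void *buffer = NULL;
    int ret;

    do {
        ret = step_one ();
        if (0 != ret) break;

        buffer = malloc (64);
        if (NULL == buffer) {
            ret = -2;
            break;
        }

        ret = step_two (buffer);
    } while (0);

    /* shared cleanup runs on success and failure alike */
    free (buffer);
    if (0 != ret) {
        fprintf (stderr, "operation failed: %d\n", ret);
    }

    return ret;
}

int main (void)
{
    (void) start_operation ();
    return 0;
}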
+ */ +static int ompi_osc_gacc_long_start (ompi_osc_rdma_module_t *module, int source, ompi_datatype_t *datatype, + ompi_osc_rdma_header_get_acc_t *get_acc_header) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) get_acc_header->displacement * module->disp_unit); + struct ompi_op_t *op = ompi_osc_base_op_create(get_acc_header->op); + struct osc_rdma_accumulate_data_t *acc_data; + ompi_request_t *recv_request; + ompi_proc_t *proc; + size_t buflen; + void *buffer; + int ret; + + proc = ompi_comm_peer_lookup(module->comm, source); + assert (NULL != proc); + + /* allocate a temporary buffer to receive the accumulate data */ + buflen = datatype_buffer_length (datatype, get_acc_header->count); + + do { + buffer = malloc (buflen); + if (OPAL_UNLIKELY(NULL == buffer)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + break; + } + + ret = osc_rdma_accumulate_allocate (module, target, buffer, buflen, proc, get_acc_header->count, + datatype, op, 2, &acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; + } + + ret = ompi_osc_rdma_irecv_w_cb (buffer, get_acc_header->count, datatype, source, get_acc_header->tag, + module->comm, &recv_request, accumulate_cb, acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + osc_rdma_accumulate_free (acc_data); + break; + } + + ret = ompi_osc_rdma_isend_w_cb (target, get_acc_header->count, datatype, source, get_acc_header->tag, + module->comm, accumulate_cb, acc_data); + if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + /* cancel the receive and free the accumulate data */ + ompi_request_cancel (recv_request); + osc_rdma_accumulate_free (acc_data); + break; + } + } while (0); + + OBJ_RELEASE(op); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + ompi_osc_rdma_accumulate_unlock (module); + } + + return ret; +} + +/** + * ompi_osc_rdma_cswap_start: + * + * @short Start a compare and swap operation + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] data - Compare and swap data + * @param[in] data_len - Length of the compare and swap data. Must be exactly + * twice the size of the datatype. + * @param[in] datatype - Compare and swap datatype + * @param[in] cswap_header - Compare and swap header + * + * The module's accumulation lock must be held before calling this + * function. It will release the lock when the operation is complete. + */ +static int ompi_osc_rdma_cswap_start (ompi_osc_rdma_module_t *module, int source, void *data, ompi_datatype_t *datatype, + ompi_osc_rdma_header_cswap_t *cswap_header) +{ + void *target = (unsigned char*) module->baseptr + + ((unsigned long) cswap_header->displacement * module->disp_unit); + void *compare_addr, *origin_addr; + size_t datatype_size; + ompi_proc_t *proc; + int ret; + + proc = ompi_comm_peer_lookup(module->comm, source); + assert (NULL != proc); + + datatype_size = datatype->super.size; + + origin_addr = data; + compare_addr = (void *)((uintptr_t) data + datatype_size); + + do { + /* no reason to do a non-blocking send here */ + ret = MCA_PML_CALL(send(target, 1, datatype, source, cswap_header->tag, MCA_PML_BASE_SEND_STANDARD, + module->comm)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; + } + + /* increment the incomming fragment count so it matches what is expected */ + mark_incoming_completion (module, (cswap_header->tag & 0x1) ? 
source : MPI_PROC_NULL); + + if (0 == memcmp (target, compare_addr, datatype_size)) { + osc_rdma_copy_on_recv (target, origin_addr, datatype_size, proc, 1, datatype); + } + } while (0); + + ompi_osc_rdma_accumulate_unlock (module); + + return ret; +} + +/** + * ompi_osc_rdma_progress_pending_acc: + * + * @short Progress one pending accumulation or compare and swap operation. + * + * @param[in] module - OSC RDMA module + * + * @long If the accumulation lock can be aquired progress one pending + * accumulate or compare and swap operation. + */ +int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module) +{ + osc_rdma_pending_acc_t *pending_acc; + int ret; + + /* try to aquire the lock. it will be unlocked when the accumulate or cswap + * operation completes */ + if (ompi_osc_rdma_accumulate_trylock (module)) { + return OMPI_SUCCESS; + } + + pending_acc = (osc_rdma_pending_acc_t *) opal_list_remove_first (&module->pending_acc); + if (OPAL_UNLIKELY(NULL == pending_acc)) { + /* called without any pending accumulation operations */ + ompi_osc_rdma_accumulate_unlock (module); + return OMPI_SUCCESS; + } + + switch (pending_acc->header.base.type) { + case OMPI_OSC_RDMA_HDR_TYPE_ACC: + ret = ompi_osc_rdma_acc_start (module, pending_acc->source, pending_acc->data, pending_acc->data_len, + pending_acc->datatype, &pending_acc->header.acc); + free (pending_acc->data); + break; + case OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG: + ret = ompi_osc_rdma_acc_long_start (module, pending_acc->source, pending_acc->datatype, + &pending_acc->header.acc); + break; + case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC: + ret = ompi_osc_rdma_gacc_start (module, pending_acc->source, pending_acc->data, + pending_acc->data_len, pending_acc->datatype, + &pending_acc->header.get_acc); + break; + case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG: + ret = ompi_osc_gacc_long_start (module, pending_acc->source, pending_acc->datatype, + &pending_acc->header.get_acc); + break; + case OMPI_OSC_RDMA_HDR_TYPE_CSWAP: + ret = ompi_osc_rdma_cswap_start (module, pending_acc->source, pending_acc->data, + pending_acc->datatype, &pending_acc->header.cswap); + break; + default: + ret = OMPI_ERROR; + /* it is a coding error if this point is reached */ + assert (0); + } + + pending_acc->data = NULL; + OBJ_RELEASE(pending_acc); + + return ret; +} + +static inline int process_acc (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_acc_t *acc_header) +{ + char *data = (char *) (acc_header + 1); + struct ompi_datatype_t *datatype; + uint64_t data_len; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "%d: process_acc: received message from %d", + ompi_comm_rank(module->comm), + source)); + + ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + data_len = acc_header->len - ((char*) data - (char*) acc_header); + + /* try to aquire the accumulate lock */ + if (0 == ompi_osc_rdma_accumulate_trylock (module)) { + ret = ompi_osc_rdma_acc_start (module, source, data, data_len, datatype, + acc_header); + } else { + /* couldn't aquire the accumulate lock so queue up the accumulate operation */ + ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) acc_header, + source, data, data_len, datatype); + } + + /* Release datatype & op */ + OBJ_RELEASE(datatype); + + return (OMPI_SUCCESS == ret) ? 
(int) acc_header->len : ret; +} + +static inline int process_acc_long (ompi_osc_rdma_module_t* module, int source, + ompi_osc_rdma_header_acc_t* acc_header) +{ + char *data = (char *) (acc_header + 1); + struct ompi_datatype_t *datatype; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "%d: process_acc_long: received message from %d", + ompi_comm_rank(module->comm), + source)); + + ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + if (0 == ompi_osc_rdma_accumulate_trylock (module)) { + ret = ompi_osc_rdma_acc_long_start (module, source, datatype, acc_header); + } else { + /* queue the operation */ + ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) acc_header, source, + NULL, 0, datatype); + } + + /* Release datatype & op */ + OBJ_RELEASE(datatype); + + return (OMPI_SUCCESS == ret) ? (int) acc_header->len : ret; +} + +static inline int process_get_acc(ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_get_acc_t *get_acc_header) +{ + char *data = (char *) (get_acc_header + 1); + struct ompi_datatype_t *datatype; + void *buffer = NULL; + uint64_t data_len; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "%d: process_get_acc: received message from %d", + ompi_comm_rank(module->comm), + source)); + + ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + data_len = get_acc_header->len - ((char*) data - (char*) get_acc_header); + + if (0 == ompi_osc_rdma_accumulate_trylock (module)) { + /* make a copy of the data since the buffer needs to be returned */ + if (data_len) { + buffer = malloc (data_len); + if (OPAL_UNLIKELY(NULL == buffer)) { + OBJ_RELEASE(datatype); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + memcpy (buffer, data, data_len); + } + + ret = ompi_osc_rdma_gacc_start (module, source, buffer, data_len, datatype, + get_acc_header); + } else { + /* queue the operation */ + ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) get_acc_header, + source, data, data_len, datatype); + } + + /* Release datatype & op */ + OBJ_RELEASE(datatype); + + return (OMPI_SUCCESS == ret) ? (int) get_acc_header->len : ret; +} + +static inline int process_get_acc_long(ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_get_acc_t *get_acc_header) +{ + char *data = (char *) (get_acc_header + 1); + struct ompi_datatype_t *datatype; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "%d: process_acc: received message from %d", + ompi_comm_rank(module->comm), + source)); + + ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + if (0 == ompi_osc_rdma_accumulate_trylock (module)) { + ret = ompi_osc_gacc_long_start (module, source, datatype, get_acc_header); + } else { + /* queue the operation */ + ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) get_acc_header, + source, NULL, 0, datatype); + } + + /* Release datatype & op */ + OBJ_RELEASE(datatype); + + return OMPI_SUCCESS == ret ? 
(int) get_acc_header->len : ret; +} + + +static inline int process_cswap (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_cswap_t *cswap_header) +{ + char *data = (char*) (cswap_header + 1); + struct ompi_datatype_t *datatype; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "%d: process_cswap: received message from %d", + ompi_comm_rank(module->comm), + source)); + + ret = datatype_create (module, source, NULL, &datatype, (void **) &data); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + if (0 == ompi_osc_rdma_accumulate_trylock (module)) { + ret = ompi_osc_rdma_cswap_start (module, source, data, datatype, cswap_header); + } else { + /* queue the operation */ + ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) cswap_header, source, + data, 2 * datatype->super.size, datatype); + } + + /* Release datatype */ + OBJ_RELEASE(datatype); + + return (OMPI_SUCCESS == ret) ? (int) cswap_header->len : ret; +} + +static inline int process_complete (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_complete_t *complete_header) +{ + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: process_complete got complete message from %d", source)); + module->num_complete_msgs++; + + return sizeof (*complete_header); +} + +/* flush and unlock headers cannot be processed from the request callback + * because some btls do not provide re-entrant progress functions. these + * fragment will be progressed by the rdma component's progress function */ +static inline int process_flush (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_flush_t *flush_header) +{ + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "process_flush header = {.frag_count = %d}", flush_header->frag_count)); + + /* increase signal count by incoming frags */ + module->passive_incoming_frag_signal_count[source] += flush_header->frag_count; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "%d: process_flush: received message from %d. 
passive_incoming_frag_signal_count = %d, passive_incoming_frag_count = %d", + ompi_comm_rank(module->comm), source, module->passive_incoming_frag_signal_count[source], module->passive_incoming_frag_count[source])); + + ret = ompi_osc_rdma_process_flush (module, source, flush_header); + if (OMPI_SUCCESS != ret) { + ompi_osc_rdma_pending_t *pending; + + pending = OBJ_NEW(ompi_osc_rdma_pending_t); + pending->module = module; + pending->source = source; + pending->header.flush = *flush_header; + + OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); + opal_list_append (&mca_osc_rdma_component.pending_operations, &pending->super); + OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); + + /* we now have to count the current fragment */ + module->passive_incoming_frag_signal_count[source]++; + } else { + /* need to account for the current fragment */ + module->passive_incoming_frag_count[source] = -1; + } + + return sizeof (*flush_header); +} + +static inline int process_unlock (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_unlock_t *unlock_header) +{ + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "process_unlock header = {.frag_count = %d}", unlock_header->frag_count)); + + /* increase signal count by incoming frags */ + module->passive_incoming_frag_signal_count[source] += unlock_header->frag_count; + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: processing unlock request from %d. frag count = %d, signal_count = %d, processed_count = %d", + source, unlock_header->frag_count, (int) module->passive_incoming_frag_signal_count[source], + (int) module->passive_incoming_frag_count[source])); + + ret = ompi_osc_rdma_process_unlock (module, source, unlock_header); + if (OMPI_SUCCESS != ret) { + ompi_osc_rdma_pending_t *pending; + + pending = OBJ_NEW(ompi_osc_rdma_pending_t); + pending->module = module; + pending->source = source; + pending->header.unlock = *unlock_header; + + OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); + opal_list_append (&mca_osc_rdma_component.pending_operations, &pending->super); + OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); + + /* we now have to count the current fragment */ + module->passive_incoming_frag_signal_count[source]++; + } else { + /* need to account for the current fragment */ + module->passive_incoming_frag_count[source] = -1; + } + + return sizeof (*unlock_header); +} + +/* + * Do all the data movement associated with a fragment + */ +static inline int process_frag (ompi_osc_rdma_module_t *module, + ompi_osc_rdma_frag_header_t *frag) +{ + ompi_osc_rdma_header_t *header; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: process_frag: from %d, ops %d", + (int) frag->source, (int) frag->num_ops)); + + header = (ompi_osc_rdma_header_t *) (frag + 1); + + for (int i = 0 ; i < frag->num_ops ; ++i) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: process_frag: type 0x%x. 
offset = %u", header->base.type, + (unsigned) ((uintptr_t)header - (uintptr_t)frag))); + + switch (header->base.type) { + case OMPI_OSC_RDMA_HDR_TYPE_PUT: + ret = process_put(module, frag->source, &header->put); + break; + case OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG: + ret = process_put_long(module, frag->source, &header->put); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_ACC: + ret = process_acc(module, frag->source, &header->acc); + break; + case OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG: + ret = process_acc_long (module, frag->source, &header->acc); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ: + ret = ompi_osc_rdma_process_lock(module, frag->source, &header->lock); + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + ret = sizeof (header->lock); + } + break; + case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ: + ret = process_unlock(module, frag->source, &header->unlock); + + break; + case OMPI_OSC_RDMA_HDR_TYPE_GET: + ret = process_get (module, frag->source, &header->get); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_CSWAP: + ret = process_cswap (module, frag->source, &header->cswap); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC: + ret = process_get_acc (module, frag->source, &header->get_acc); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG: + ret = process_get_acc_long (module, frag->source, &header->get_acc); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ: + ret = process_flush (module, frag->source, &header->flush); + break; + + case OMPI_OSC_RDMA_HDR_TYPE_COMPLETE: + ret = process_complete (module, frag->source, &header->complete); + break; + + default: + opal_output(0, "Unsupported fragment type 0x%x\n", header->base.type); + abort(); /* FIX ME */ + } + if (ret <= 0) { + opal_output(0, "Error processing fragment: %d", ret); + abort(); /* FIX ME */ + } + + header = (ompi_osc_rdma_header_t *) ((uintptr_t) header + ret); + } + + return OMPI_SUCCESS; +} + + +/* dispatch for callback on message completion */ +static int ompi_osc_rdma_callback (ompi_request_t *request) +{ + ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data; + ompi_osc_rdma_header_base_t *base_header = + (ompi_osc_rdma_header_base_t *) module->incomming_buffer; + size_t incomming_length = request->req_status._ucount; + int source = request->req_status.MPI_SOURCE; + + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + assert(incomming_length >= sizeof(ompi_osc_rdma_header_base_t)); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "received rdma callback for fragment. 
source = %d, count = %u, type = 0x%x", + source, (unsigned) incomming_length, base_header->type)); + + switch (base_header->type) { + case OMPI_OSC_RDMA_HDR_TYPE_FRAG: + process_frag(module, (ompi_osc_rdma_frag_header_t *) base_header); + break; + case OMPI_OSC_RDMA_HDR_TYPE_POST: + (void) osc_rdma_incomming_post (module); + break; + case OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK: + ompi_osc_rdma_process_lock_ack(module, (ompi_osc_rdma_header_lock_ack_t *) base_header); + break; + case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK: + ompi_osc_rdma_process_flush_ack (module, source, (ompi_osc_rdma_header_flush_ack_t *) base_header); + break; + case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK: + ompi_osc_rdma_process_unlock_ack (module, source, (ompi_osc_rdma_header_unlock_ack_t *) base_header); + break; + default: + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "received unexpected message of type %x", + (int) base_header->type)); + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "finished processing incomming messages")); + + /* restart the receive request */ + OPAL_THREAD_LOCK(&module->lock); + + mark_incoming_completion (module, (base_header->flags & OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET) ? + source : MPI_PROC_NULL); + + osc_rdma_request_gc_clean (module); + opal_list_append (&module->request_gc, (opal_list_item_t *) request); + ompi_osc_rdma_frag_start_receive (module); + + OPAL_THREAD_UNLOCK(&module->lock); + + OPAL_THREAD_LOCK(&ompi_request_lock); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "finished posting receive request")); + + return OMPI_SUCCESS; +} + +int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module) +{ + return ompi_osc_rdma_irecv_w_cb (module->incomming_buffer, mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t), + MPI_BYTE, OMPI_ANY_SOURCE, OSC_RDMA_FRAG_TAG, module->comm, &module->frag_request, + ompi_osc_rdma_callback, module); +} + +int ompi_osc_rdma_component_irecv (ompi_osc_rdma_module_t *module, void *buf, + size_t count, struct ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm) +{ + return ompi_osc_rdma_irecv_w_cb (buf, count, datatype, src, tag, comm, NULL, + osc_rdma_incomming_req_omplete, module); +} + + +static int +isend_completion_cb(ompi_request_t *request) +{ + ompi_osc_rdma_module_t *module = + (ompi_osc_rdma_module_t*) request->req_complete_cb_data; + + OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, + "isend_completion_cb called")); + + mark_outgoing_completion(module); + + /* put this request on the garbage colletion list */ + OPAL_THREAD_LOCK(&module->lock); + opal_list_append (&module->request_gc, (opal_list_item_t *) request); + OPAL_THREAD_UNLOCK(&module->lock); return OMPI_SUCCESS; } int -ompi_osc_rdma_replyreq_recv(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t *sendreq, - ompi_osc_rdma_reply_header_t *header, - void **payload) +ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module, + void *buf, + size_t count, + struct ompi_datatype_t *datatype, + int dest, + int tag, + struct ompi_communicator_t *comm) { - int ret = OMPI_SUCCESS; + return ompi_osc_rdma_isend_w_cb (buf, count, datatype, dest, tag, comm, + isend_completion_cb, module); +} - /* receive into user buffer */ - if (header->hdr_msg_length > 0) { - /* short message. woo! 
*/ +int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag, + ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx) +{ + ompi_request_t *request; + int ret; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data; - int32_t count; + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: ompi_osc_rdma_isend_w_cb sending %d bytes to %d with tag %d", + count, target, tag)); - iov.iov_len = header->hdr_msg_length; - iov.iov_base = (IOVBASE_TYPE*)*payload; - max_data = iov.iov_len; - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &sendreq->req_origin_convertor); - ); - opal_convertor_unpack(&sendreq->req_origin_convertor, - &iov, - &iov_count, - &max_data ); - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_noaccess, - &sendreq->req_origin_convertor); - ); - - count = sendreq->req_module->m_num_pending_out -= 1; - ompi_osc_rdma_sendreq_free(sendreq); - *payload = ((char*) *payload) + header->hdr_msg_length; - - if (0 == count) opal_condition_broadcast(&sendreq->req_module->m_cond); - } else { - ompi_osc_rdma_longreq_t *longreq; - ompi_osc_rdma_longreq_alloc(&longreq); - - longreq->req_basereq.req_sendreq = sendreq; - longreq->req_module = module; - - ret = ompi_osc_rdma_component_irecv(sendreq->req_origin_convertor.pBaseBuf, - sendreq->req_origin_convertor.count, - sendreq->req_origin_datatype, - sendreq->req_target_rank, - header->hdr_target_tag, - module->m_comm, - &(longreq->request), - ompi_osc_rdma_replyreq_recv_long_cb, - longreq); + ret = MCA_PML_CALL(isend_init(ptr, count, datatype, target, tag, + MCA_PML_BASE_SEND_STANDARD, comm, &request)); + if (OMPI_SUCCESS != ret) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "error sending fragment. 
ret = %d", ret)); + return ret; } + request->req_complete_cb = cb; + request->req_complete_cb_data = ctx; + + ret = MCA_PML_CALL(start(1, &request)); + return ret; } - -/********************************************************************** - * - * Control message communication - * - **********************************************************************/ -static void -ompi_osc_rdma_control_send_cb(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) +int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag, + ompi_communicator_t *comm, ompi_request_t **request_out, + ompi_request_complete_fn_t cb, void *ctx) { - /* release the descriptor and sendreq */ - btl->btl_free(btl, descriptor); -} + ompi_request_t *request; + int ret; + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: ompi_osc_rdma_irecv_w_cb receiving %d bytes from %d with tag %d", + count, target, tag)); -int -ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module, - ompi_proc_t *proc, - uint8_t type, int32_t value0, int32_t value1) -{ - int ret = OMPI_SUCCESS; - mca_bml_base_endpoint_t *endpoint = NULL; - mca_bml_base_btl_t *bml_btl = NULL; - mca_btl_base_descriptor_t *descriptor = NULL; - ompi_osc_rdma_control_header_t *header = NULL; - - /* Get a BTL and a fragment to go with it */ - endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; - bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); - mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, - sizeof(ompi_osc_rdma_control_header_t), - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); - if (NULL == descriptor) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; + ret = MCA_PML_CALL(irecv_init(ptr, count, datatype, target, tag, comm, &request)); + if (OMPI_SUCCESS != ret) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "error posting receive. 
ret = %d", ret)); + return ret; } - /* verify at least enough space for header */ - if (descriptor->des_src[0].seg_len < sizeof(ompi_osc_rdma_control_header_t)) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; + request->req_complete_cb = cb; + request->req_complete_cb_data = ctx; + if (request_out) { + *request_out = request; } - /* setup descriptor */ - descriptor->des_cbfunc = ompi_osc_rdma_control_send_cb; - descriptor->des_cbdata = NULL; - descriptor->des_src[0].seg_len = sizeof(ompi_osc_rdma_control_header_t); + ret = MCA_PML_CALL(start(1, &request)); - /* pack header */ - header = (ompi_osc_rdma_control_header_t*) descriptor->des_src[0].seg_addr.pval; - header->hdr_base.hdr_type = type; - header->hdr_base.hdr_flags = 0; - header->hdr_value[0] = value0; - header->hdr_value[1] = value1; - header->hdr_windx = ompi_comm_get_cid(module->m_comm); - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; - OMPI_OSC_RDMA_CONTROL_HDR_HTON(*header); - } -#endif - - /* send fragment */ - ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA); - if (1 == ret) ret = OMPI_SUCCESS; - goto done; - - cleanup: - if (descriptor != NULL) { - mca_bml_base_free(bml_btl, descriptor); - } - - done: - return ret; -} - - -int -ompi_osc_rdma_rdma_ack_send(ompi_osc_rdma_module_t *module, - ompi_proc_t *proc, - ompi_osc_rdma_btl_t *rdma_btl) -{ - int ret = OMPI_SUCCESS; - mca_bml_base_btl_t *bml_btl = rdma_btl->bml_btl; - mca_btl_base_descriptor_t *descriptor = NULL; - ompi_osc_rdma_control_header_t *header = NULL; - - /* Get a BTL and a fragment to go with it */ - mca_bml_base_alloc(bml_btl, &descriptor, rdma_btl->rdma_order, - sizeof(ompi_osc_rdma_control_header_t), - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); - if (NULL == descriptor) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto cleanup; - } - - /* verify at least enough space for header */ - if (descriptor->des_src[0].seg_len < sizeof(ompi_osc_rdma_control_header_t)) { - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - - /* setup descriptor */ - descriptor->des_cbfunc = ompi_osc_rdma_control_send_cb; - descriptor->des_cbdata = NULL; - descriptor->des_src[0].seg_len = sizeof(ompi_osc_rdma_control_header_t); - - /* pack header */ - header = (ompi_osc_rdma_control_header_t*) descriptor->des_src[0].seg_addr.pval; - header->hdr_base.hdr_type = OMPI_OSC_RDMA_HDR_RDMA_COMPLETE; - header->hdr_base.hdr_flags = 0; - header->hdr_value[0] = rdma_btl->num_sent; - header->hdr_value[1] = 0; - header->hdr_windx = ompi_comm_get_cid(module->m_comm); - -#ifdef WORDS_BIGENDIAN - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; -#elif OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if (proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - header->hdr_base.hdr_flags |= OMPI_OSC_RDMA_HDR_FLAG_NBO; - OMPI_OSC_RDMA_CONTROL_HDR_HTON(*header); - } -#endif - - assert(header->hdr_base.hdr_flags == 0); - - /* send fragment */ - ret = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_OSC_RDMA); - if (1 == ret) ret = OMPI_SUCCESS; - goto done; - - cleanup: - if (descriptor != NULL) { - mca_bml_base_free(bml_btl, descriptor); - } - - done: return ret; } diff --git a/ompi/mca/osc/rdma/osc_rdma_data_move.h b/ompi/mca/osc/rdma/osc_rdma_data_move.h index 2e82cc5b97..4199526829 100644 --- a/ompi/mca/osc/rdma/osc_rdma_data_move.h +++ b/ompi/mca/osc/rdma/osc_rdma_data_move.h @@ -1,3 +1,4 
@@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. @@ -7,6 +8,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,46 +21,125 @@ #ifndef OMPI_MCA_OSC_RDMA_DATA_MOVE_H #define OMPI_MCA_OSC_RDMA_DATA_MOVE_H -#include "osc_rdma_sendreq.h" -#include "osc_rdma_replyreq.h" - -/* send a sendreq (the request from the origin for a Put, Get, or - Accumulate, including the payload for Put and Accumulate) */ -int ompi_osc_rdma_sendreq_send(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t *sendreq); - -/* send a replyreq (the request from the target of a Get, with the - payload for the origin */ -int ompi_osc_rdma_replyreq_send(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_replyreq_t *replyreq); - -/* receive the target side of a sendreq for a put, directly into the user's window */ -int ompi_osc_rdma_sendreq_recv_put(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_send_header_t *header, - void **payload); - -/* receive the target side of a sendreq for an accumulate, possibly - using a temproart buffer, then calling the reduction functions */ -int ompi_osc_rdma_sendreq_recv_accum(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_send_header_t *header, - void **payload); - -/* receive the origin side of a replyreq (the reply part of an - MPI_Get), directly into the user's window */ -int ompi_osc_rdma_replyreq_recv(ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t *sendreq, - ompi_osc_rdma_reply_header_t *header, - void **payload); +#include "osc_rdma_header.h" int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module, - ompi_proc_t *proc, - uint8_t type, - int32_t value0, int32_t value1); + int target, + void *data, + size_t len); -int ompi_osc_rdma_rdma_ack_send(ompi_osc_rdma_module_t *module, - ompi_proc_t *proc, - ompi_osc_rdma_btl_t *rdma_btl); +/** + * ompi_osc_rdma_control_send_unbuffered: + * + * @short Send an unbuffered control message to a peer. + * + * @param[in] module - OSC RDMA module + * @param[in] target - Target rank + * @param[in] data - Data to send + * @param[in] len - Length of data + * + * @long Directly send a control message. This does not allocate a + * fragment, so should only be used when sending other messages would + * be erroneous (such as complete messages, when there may be queued + * transactions from an overlapping post that has already heard back + * from its peer). The buffer specified by data will be available + * when this call returns. + */ +int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module, + int target, void *data, size_t len); -int ompi_osc_rdma_flush(ompi_osc_rdma_module_t *module); +/** + * ompi_osc_rdma_isend_w_cb: + * + * @short Post a non-blocking send with a specified callback. + * + * @param[in] ptr - Source buffer. 
Will be available when the callback fires + * @param[in] count - Number of elements to send + * @param[in] datatype - Datatype of elements + * @param[in] source - Ranks to send data to + * @param[in] tag - Tag to use + * @param[in] comm - Communicator for communicating with rank + * @param[in] cb - Function to call when the request is complete + * @param[in] ctx - Context to store in new request for callback + * + * @long This function posts a new send request. Upon completion the function cb will + * be called with the associated request. The context specified in ctx will be stored in + * the req_completion_cb_data member of the ompi_request_t for use by the callback. + */ +int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag, + ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx); + +/** + * ompi_osc_rdma_irecv_w_cb: + * + * @short Post a non-blocking receive with a specified callback. + * + * @param[inout] ptr - Destination for incoming data + * @param[in] count - Number of elements to receive + * @param[in] datatype - Datatype of elements + * @param[in] source - Ranks to receive data from + * @param[in] tag - Tag to use + * @param[in] comm - Communicator for communicating with rank + * @param[in] request_out - Location to store new receive request (may be NULL) + * @param[in] cb - Function to call when the request is complete + * @param[in] ctx - Context to store in new request for callback + * + * @long This function posts a new request and stores the request in request_out if + * provided. Upon completion the function cb will be called with the associated + * request. The context specified in ctx will be stored in the req_completion_cb_data + * member of the ompi_request_t for use by the callback. + */ +int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag, + ompi_communicator_t *comm, ompi_request_t **request_out, + ompi_request_complete_fn_t cb, void *ctx); + +int ompi_osc_rdma_process_lock(ompi_osc_rdma_module_t* module, + int source, + struct ompi_osc_rdma_header_lock_t* lock_header); + +void ompi_osc_rdma_process_lock_ack(ompi_osc_rdma_module_t* module, + struct ompi_osc_rdma_header_lock_ack_t* lock_header); + +int ompi_osc_rdma_process_unlock(ompi_osc_rdma_module_t* module, + int source, + struct ompi_osc_rdma_header_unlock_t* lock_header); +int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_flush_t *flush_header); + +/** + * ompi_osc_rdma_process_unlock_ack: + * + * @short Process an incomming unlock acknowledgement. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] unlock_ack_header - Incoming unlock ack header + */ +void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header); + +/** + * ompi_osc_rdma_process_flush_ack: + * + * @short Process an incomming flush acknowledgement. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] flush_ack_header - Incoming flush ack header + */ +void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_flush_ack_t *flush_ack_header); + +/** + * ompi_osc_rdma_frag_start_receive: + * + * @short Start receiving fragments on the OSC module. + * + * @param[in] module - OSC module + * + * @long This function starts receiving eager fragments on the module. 
The current + * implementation uses the pml to transfer eager fragments. + */ +int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module); #endif diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.c b/ompi/mca/osc/rdma/osc_rdma_frag.c new file mode 100644 index 0000000000..dc2719d35b --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_frag.c @@ -0,0 +1,213 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/class/opal_list.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/pml/pml.h" + +#include "osc_rdma.h" +#include "osc_rdma_frag.h" +#include "osc_rdma_data_move.h" + +static void ompi_osc_rdma_frag_constructor (ompi_osc_rdma_frag_t *frag){ + frag->buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t)); + assert (frag->buffer); +} + +static void ompi_osc_rdma_frag_destructor (ompi_osc_rdma_frag_t *frag) { + if (NULL != frag->buffer) { + free (frag->buffer); + } +} + +OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_list_item_t, + ompi_osc_rdma_frag_constructor, ompi_osc_rdma_frag_destructor); + +static int frag_send_cb (ompi_request_t *request) +{ + ompi_osc_rdma_frag_t *frag = + (ompi_osc_rdma_frag_t*) request->req_complete_cb_data; + ompi_osc_rdma_module_t *module = frag->module; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag_send complete to %d, frag = %p, request = %p", + frag->target, (void *) frag, (void *) request)); + + mark_outgoing_completion(module); + OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.frags, &frag->super); + + /* put this request on the garbage colletion list */ + opal_list_append (&module->request_gc, (opal_list_item_t *) request); + + return OMPI_SUCCESS; +} + + +static int +frag_send(ompi_osc_rdma_module_t *module, + ompi_osc_rdma_frag_t *frag) +{ + int count; + + count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag_send called to %d, frag = %p, count = %d", + frag->target, (void *) frag, count)); + + return ompi_osc_rdma_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_RDMA_FRAG_TAG, + module->comm, frag_send_cb, frag); +} + + +int +ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, + ompi_osc_rdma_frag_t *frag) +{ + int ret; + + assert(0 == frag->pending); + assert(module->peers[frag->target].active_frag != frag); + + /* we need to signal now that a frag is outgoing to ensure the count sent + * with the unlock message is correct */ + ompi_osc_signal_outgoing (module, frag->target, 1); + + /* if eager sends are not active, can't send yet, so buffer and + get out... 
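/*
 * Editor's aside, not part of the patch: frag_send above derives the number of
 * bytes to send from how far the pack pointer (top) has advanced past the start
 * of the fragment buffer.  A standalone sketch of that bump-pointer packing;
 * the struct and helper here are hypothetical, not the OSC rdma fragment type.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct sketch_frag {
    unsigned char  buffer[256];
    unsigned char *top;      /* start of unused space */
    size_t         remain;   /* bytes still free in the buffer */
};

static void *frag_reserve (struct sketch_frag *frag, size_t len)
{
    if (frag->remain < len) {
        return NULL;         /* caller must flush and start a new fragment */
    }

    void *ptr = frag->top;
    frag->top    += len;
    frag->remain -= len;
    return ptr;
}

int main (void)
{
    struct sketch_frag frag;
    frag.top    = frag.buffer;
    frag.remain = sizeof (frag.buffer);

    void *h1 = frag_reserve (&frag, 32);   /* e.g. a put header plus payload */
    void *h2 = frag_reserve (&frag, 16);   /* e.g. a small accumulate header */
    assert (NULL != h1 && NULL != h2);

    /* this difference is the count a frag_send-style routine would transmit */
    assert (48 == (size_t) ((uintptr_t) frag.top - (uintptr_t) frag.buffer));
    return 0;
}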
*/ + if (module->passive_target_access_epoch) { + if (!module->passive_eager_send_active[frag->target]) { + opal_list_append(&module->queued_frags, &frag->super); + return OMPI_SUCCESS; + } + } else { + if (!module->active_eager_send_active) { + opal_list_append(&module->queued_frags, &frag->super); + return OMPI_SUCCESS; + } + } + + ret = frag_send(module, frag); + + opal_condition_broadcast(&module->cond); + + return ret; +} + + +int +ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target) +{ + int ret = OMPI_SUCCESS; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag flush target begin")); + + /* flush the active frag */ + if (NULL != module->peers[target].active_frag) { + ompi_osc_rdma_frag_t *frag = module->peers[target].active_frag; + + if (0 != frag->pending) { + /* communication going on while synchronizing; this is a bug */ + return MPI_ERR_RMA_SYNC; + } + + module->peers[target].active_frag = NULL; + + ret = ompi_osc_rdma_frag_start(module, frag); + if (OMPI_SUCCESS != ret) return ret; + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag flush target finished active frag")); + + /* walk through the pending list and send */ + ompi_osc_rdma_frag_t *frag, *next; + OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) { + if (frag->target == target) { + opal_list_remove_item(&module->queued_frags, &frag->super); + ret = frag_send(module, frag); + if (OMPI_SUCCESS != ret) return ret; + } + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag flush target finished")); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module) +{ + int ret = OMPI_SUCCESS; + int i; + ompi_osc_rdma_frag_t *frag, *next; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag flush all begin")); + + /* flush the active frag */ + for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + if (NULL != module->peers[i].active_frag) { + ompi_osc_rdma_frag_t *frag = module->peers[i].active_frag; + + if (0 != frag->pending) { + /* communication going on while synchronizing; this is a bug */ + return MPI_ERR_RMA_SYNC; + } + + module->peers[i].active_frag = NULL; + + ret = ompi_osc_rdma_frag_start(module, frag); + if (OMPI_SUCCESS != ret) return ret; + } + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag flush all finished active frag")); + + /* try to start all the queued frags */ + OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) { + opal_list_remove_item(&module->queued_frags, &frag->super); + ret = frag_send(module, frag); + if (OMPI_SUCCESS != ret) { + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: failure for frag send: %d", ret)); + return ret; + } + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: frag flush all done")); + + return OMPI_SUCCESS; +} + +int osc_rdma_incomming_post (ompi_osc_rdma_module_t *module) +{ + OPAL_THREAD_LOCK(&module->lock); + module->num_post_msgs++; + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "received post message. 
num_post_msgs = %d", module->num_post_msgs)); + + if (0 == module->num_post_msgs) { + module->active_eager_send_active = true; + } + opal_condition_broadcast (&module->cond); + OPAL_THREAD_UNLOCK(&module->lock); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.h b/ompi/mca/osc/rdma/osc_rdma_frag.h new file mode 100644 index 0000000000..3769a9dbbe --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_frag.h @@ -0,0 +1,131 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_RDMA_FRAG_H +#define OSC_RDMA_FRAG_H + +#include "ompi/communicator/communicator.h" + +#include "osc_rdma_header.h" +#include "osc_rdma_request.h" + +/** Communication buffer for packing messages */ +struct ompi_osc_rdma_frag_t { + opal_list_item_t super; + /* target rank of buffer */ + int target; + unsigned char *buffer; + + /* space remaining in buffer */ + size_t remain_len; + + /* start of unused space */ + char *top; + + /* Number of operations which have started writing into the frag, but not yet completed doing so */ + int pending; + ompi_osc_rdma_frag_header_t *header; + ompi_osc_rdma_module_t *module; +}; +typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t); + +extern int ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, ompi_osc_rdma_frag_t *buffer); +extern int ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target); +extern int ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module); + + +/* + * Note: module lock must be held during this operation + */ +static inline int ompi_osc_rdma_frag_alloc(ompi_osc_rdma_module_t *module, int target, + size_t request_len, ompi_osc_rdma_frag_t **buffer, + char **ptr) +{ + ompi_osc_rdma_frag_t *curr = module->peers[target].active_frag; + int ret; + + if (request_len > mca_osc_rdma_component.buffer_size) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + if (NULL == curr || curr->remain_len < request_len) { + opal_free_list_item_t *item; + + if (NULL != curr) { + curr->remain_len = 0; + /* If there's something pending, the pending finish will + start the buffer. Otherwise, we need to start it now. 
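+              (ompi_osc_rdma_frag_finish, later in this header, restarts the
+              fragment once its pending count reaches zero and no space remains)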
*/ + if (0 == curr->pending) { + module->peers[target].active_frag = NULL; + ret = ompi_osc_rdma_frag_start(module, curr); + } + } + + OPAL_FREE_LIST_GET(&mca_osc_rdma_component.frags, + item, ret); + if (OMPI_SUCCESS != ret) return ret; + curr = module->peers[target].active_frag = + (ompi_osc_rdma_frag_t*) item; + + curr->target = target; + + curr->header = (ompi_osc_rdma_frag_header_t*) curr->buffer; + curr->top = (char*) (curr->header + 1); + curr->remain_len = mca_osc_rdma_component.buffer_size; + curr->module = module; + curr->pending = 0; + + curr->header->base.type = OMPI_OSC_RDMA_HDR_TYPE_FRAG; + curr->header->base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + if (module->passive_target_access_epoch) { + curr->header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET; + } + curr->header->source = ompi_comm_rank(module->comm); + curr->header->num_ops = 0; + curr->header->windx = ompi_comm_get_cid(module->comm); + + if (curr->remain_len < request_len) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + } + + *ptr = curr->top; + *buffer = curr; + + curr->top += request_len; + curr->remain_len -= request_len; + curr->pending++; + curr->header->num_ops++; + + return OMPI_SUCCESS; +} + + +/* + * Note: module lock must be held for this operation + */ +static inline int +ompi_osc_rdma_frag_finish(ompi_osc_rdma_module_t *module, + ompi_osc_rdma_frag_t* buffer) +{ + int ret = OMPI_SUCCESS; + + buffer->pending--; + if (0 == buffer->pending && 0 == buffer->remain_len) { + ret = ompi_osc_rdma_frag_start(module, buffer); + } + + return ret; +} + +#endif diff --git a/ompi/mca/osc/rdma/osc_rdma_header.h b/ompi/mca/osc/rdma/osc_rdma_header.h index 47ed2a1ab9..dd8d2bbe0a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_header.h +++ b/ompi/mca/osc/rdma/osc_rdma_header.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. @@ -7,14 +8,15 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. 
* $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -27,144 +29,169 @@ #include "opal/types.h" -/* Note -- 0x05 to 0x0C are of control_hdr type */ -#define OMPI_OSC_RDMA_HDR_PUT 0x01 -#define OMPI_OSC_RDMA_HDR_ACC 0x02 -#define OMPI_OSC_RDMA_HDR_GET 0x03 -#define OMPI_OSC_RDMA_HDR_REPLY 0x04 -#define OMPI_OSC_RDMA_HDR_POST 0x05 -#define OMPI_OSC_RDMA_HDR_COMPLETE 0x06 -#define OMPI_OSC_RDMA_HDR_LOCK_REQ 0x07 -#define OMPI_OSC_RDMA_HDR_UNLOCK_REQ 0x08 -#define OMPI_OSC_RDMA_HDR_UNLOCK_REPLY 0x09 -#define OMPI_OSC_RDMA_HDR_RDMA_COMPLETE 0x0A -#define OMPI_OSC_RDMA_HDR_MULTI_END 0x0B -#define OMPI_OSC_RDMA_HDR_RDMA_INFO 0x0C - -#define OMPI_OSC_RDMA_HDR_FLAG_ALIGN_MASK 0x0F -#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x10 -#define OMPI_OSC_RDMA_HDR_FLAG_MULTI 0x20 - -struct ompi_osc_rdma_base_header_t { - uint8_t hdr_type; - /* eventually, this will include endian information */ - uint8_t hdr_flags; +enum ompi_osc_rdma_hdr_type_t { + OMPI_OSC_RDMA_HDR_TYPE_PUT = 0x01, + OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG = 0x02, + OMPI_OSC_RDMA_HDR_TYPE_ACC = 0x03, + OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG = 0x04, + OMPI_OSC_RDMA_HDR_TYPE_GET = 0x05, + OMPI_OSC_RDMA_HDR_TYPE_CSWAP = 0x06, + OMPI_OSC_RDMA_HDR_TYPE_CSWAP_LONG = 0x07, + OMPI_OSC_RDMA_HDR_TYPE_GET_ACC = 0x08, + OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG = 0x09, + OMPI_OSC_RDMA_HDR_TYPE_COMPLETE = 0x10, + OMPI_OSC_RDMA_HDR_TYPE_POST = 0x11, + OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ = 0x12, + OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK = 0x13, + OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ = 0x14, + OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK = 0x15, + OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ = 0x16, + OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK = 0x17, + OMPI_OSC_RDMA_HDR_TYPE_FRAG = 0x20, }; -typedef struct ompi_osc_rdma_base_header_t ompi_osc_rdma_base_header_t; +typedef enum ompi_osc_rdma_hdr_type_t ompi_osc_rdma_hdr_type_t; -#define OMPI_OSC_RDMA_BASE_HDR_NTOH(h) -#define OMPI_OSC_RDMA_BASE_HDR_HTON(h) +#define OMPI_OSC_RDMA_HDR_FLAG_NBO 0x01 +#define OMPI_OSC_RDMA_HDR_FLAG_VALID 0x02 +#define OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET 0x04 -struct ompi_osc_rdma_send_header_t { - ompi_osc_rdma_base_header_t hdr_base; - uint16_t hdr_windx; - - int32_t hdr_origin; - ompi_ptr_t hdr_origin_sendreq; - int32_t hdr_origin_tag; - - uint64_t hdr_target_disp; - int32_t hdr_target_count; - int32_t hdr_target_op; - - int32_t hdr_msg_length; /* 0 if payload is not included */ +struct ompi_osc_rdma_header_base_t { + /** fragment type. 8 bits */ + uint8_t type; + /** fragment flags. 
8 bits */ + uint8_t flags; }; -typedef struct ompi_osc_rdma_send_header_t ompi_osc_rdma_send_header_t; +typedef struct ompi_osc_rdma_header_base_t ompi_osc_rdma_header_base_t; -#define OMPI_OSC_RDMA_SEND_HDR_HTON(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base) \ - (hdr).hdr_windx = htons((hdr).hdr_windx); \ - (hdr).hdr_origin = htonl((hdr).hdr_origin); \ - (hdr).hdr_origin_tag = htonl((hdr).hdr_origin_tag); \ - (hdr).hdr_target_disp = hton64((hdr).hdr_target_disp); \ - (hdr).hdr_target_count = htonl((hdr).hdr_target_count); \ - (hdr).hdr_target_op = htonl((hdr).hdr_target_op); \ - (hdr).hdr_msg_length = htonl((hdr).hdr_msg_length); \ - } while (0) +struct ompi_osc_rdma_header_put_t { + ompi_osc_rdma_header_base_t base; -#define OMPI_OSC_RDMA_SEND_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base) \ - (hdr).hdr_windx = ntohs((hdr).hdr_windx); \ - (hdr).hdr_origin = ntohl((hdr).hdr_origin); \ - (hdr).hdr_origin_tag = ntohl((hdr).hdr_origin_tag); \ - (hdr).hdr_target_disp = ntoh64((hdr).hdr_target_disp); \ - (hdr).hdr_target_count = ntohl((hdr).hdr_target_count); \ - (hdr).hdr_target_op = ntohl((hdr).hdr_target_op); \ - (hdr).hdr_msg_length = ntohl((hdr).hdr_msg_length); \ - } while (0) - - -struct ompi_osc_rdma_reply_header_t { - ompi_osc_rdma_base_header_t hdr_base; - - ompi_ptr_t hdr_origin_sendreq; - - int32_t hdr_target_tag; - int32_t hdr_msg_length; + uint16_t tag; + uint32_t count; + uint64_t len; + uint64_t displacement; }; -typedef struct ompi_osc_rdma_reply_header_t ompi_osc_rdma_reply_header_t; +typedef struct ompi_osc_rdma_header_put_t ompi_osc_rdma_header_put_t; -#define OMPI_OSC_RDMA_REPLY_HDR_HTON(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base) \ - (hdr).hdr_target_tag = htonl((hdr).hdr_target_tag); \ - (hdr).hdr_msg_length = htonl((hdr).hdr_msg_length); \ - } while (0) +struct ompi_osc_rdma_header_acc_t { + ompi_osc_rdma_header_base_t base; -#define OMPI_OSC_RDMA_REPLY_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base) \ - (hdr).hdr_target_tag = ntohl((hdr).hdr_target_tag); \ - (hdr).hdr_msg_length = ntohl((hdr).hdr_msg_length); \ - } while (0) - - -struct ompi_osc_rdma_control_header_t { - ompi_osc_rdma_base_header_t hdr_base; - int16_t hdr_windx; - int32_t hdr_value[2]; + uint16_t tag; + uint32_t count; + uint64_t len; + uint64_t displacement; + uint32_t op; }; -typedef struct ompi_osc_rdma_control_header_t ompi_osc_rdma_control_header_t; +typedef struct ompi_osc_rdma_header_acc_t ompi_osc_rdma_header_acc_t; -#define OMPI_OSC_RDMA_CONTROL_HDR_HTON(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base); \ - (hdr).hdr_windx = htons((hdr).hdr_windx); \ - (hdr).hdr_value[0] = htonl((hdr).hdr_value[0]); \ - (hdr).hdr_value[1] = htonl((hdr).hdr_value[1]); \ - } while (0) +struct ompi_osc_rdma_header_get_t { + ompi_osc_rdma_header_base_t base; -#define OMPI_OSC_RDMA_CONTROL_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base); \ - (hdr).hdr_windx = ntohs((hdr).hdr_windx); \ - (hdr).hdr_value[0] = ntohl((hdr).hdr_value[0]); \ - (hdr).hdr_value[1] = ntohl((hdr).hdr_value[1]); \ - } while (0) - - -struct ompi_osc_rdma_rdma_info_header_t { - ompi_osc_rdma_base_header_t hdr_base; - int16_t hdr_windx; - int32_t hdr_origin; + uint16_t tag; + uint32_t count; + uint64_t len; + uint64_t displacement; }; -typedef struct ompi_osc_rdma_rdma_info_header_t ompi_osc_rdma_rdma_info_header_t; +typedef struct ompi_osc_rdma_header_get_t ompi_osc_rdma_header_get_t; -#define OMPI_OSC_RDMA_RDMA_INFO_HDR_HTON(hdr) 
\ - do { \ - OMPI_OSC_RDMA_BASE_HDR_HTON((hdr).hdr_base); \ - (hdr).hdr_windx = htons((hdr).hdr_windx); \ - (hdr).hdr_origin = htonl((hdr).hdr_origin); \ - } while (0) +struct ompi_osc_rdma_header_complete_t { + ompi_osc_rdma_header_base_t base; + int frag_count; +}; +typedef struct ompi_osc_rdma_header_complete_t ompi_osc_rdma_header_complete_t; -#define OMPI_OSC_RDMA_RDMA_INFO_HDR_NTOH(hdr) \ - do { \ - OMPI_OSC_RDMA_BASE_HDR_NTOH((hdr).hdr_base); \ - (hdr).hdr_windx = ntohs((hdr).hdr_windx); \ - (hdr).hdr_origin = ntohl((hdr).hdr_origin); \ - } while (0) +struct ompi_osc_rdma_header_get_acc_t { + ompi_osc_rdma_header_base_t base; + int16_t tag; + uint32_t count; + uint64_t len; + uint64_t displacement; + uint32_t op; +}; +typedef struct ompi_osc_rdma_header_get_acc_t ompi_osc_rdma_header_get_acc_t; + +struct ompi_osc_rdma_header_cswap_t { + ompi_osc_rdma_header_base_t base; + + int16_t tag; + + uint32_t len; + uint64_t displacement; +}; +typedef struct ompi_osc_rdma_header_cswap_t ompi_osc_rdma_header_cswap_t; + +struct ompi_osc_rdma_header_post_t { + ompi_osc_rdma_header_base_t base; + uint16_t windx; +}; +typedef struct ompi_osc_rdma_header_post_t ompi_osc_rdma_header_post_t; + +struct ompi_osc_rdma_header_lock_t { + ompi_osc_rdma_header_base_t base; + int32_t lock_type; + uint64_t serial_number; +}; +typedef struct ompi_osc_rdma_header_lock_t ompi_osc_rdma_header_lock_t; + +struct ompi_osc_rdma_header_lock_ack_t { + ompi_osc_rdma_header_base_t base; + uint16_t windx; + uint32_t source; + uint64_t serial_number; +}; +typedef struct ompi_osc_rdma_header_lock_ack_t ompi_osc_rdma_header_lock_ack_t; + +struct ompi_osc_rdma_header_unlock_t { + ompi_osc_rdma_header_base_t base; + int32_t lock_type; + uint32_t frag_count; +}; +typedef struct ompi_osc_rdma_header_unlock_t ompi_osc_rdma_header_unlock_t; + +struct ompi_osc_rdma_header_unlock_ack_t { + ompi_osc_rdma_header_base_t base; +}; +typedef struct ompi_osc_rdma_header_unlock_ack_t ompi_osc_rdma_header_unlock_ack_t; + +struct ompi_osc_rdma_header_flush_t { + ompi_osc_rdma_header_base_t base; + uint32_t frag_count; + uint64_t serial_number; +}; +typedef struct ompi_osc_rdma_header_flush_t ompi_osc_rdma_header_flush_t; + +struct ompi_osc_rdma_header_flush_ack_t { + ompi_osc_rdma_header_base_t base; + uint64_t serial_number; +}; +typedef struct ompi_osc_rdma_header_flush_ack_t ompi_osc_rdma_header_flush_ack_t; + +struct ompi_osc_rdma_frag_header_t { + ompi_osc_rdma_header_base_t base; + uint16_t windx; /* cid of communicator backing window (our window id) */ + uint32_t source; /* rank in window of source process */ + uint16_t num_ops; /* number of operations in this buffer */ +}; +typedef struct ompi_osc_rdma_frag_header_t ompi_osc_rdma_frag_header_t; + +union ompi_osc_rdma_header_t { + ompi_osc_rdma_header_base_t base; + ompi_osc_rdma_header_put_t put; + ompi_osc_rdma_header_acc_t acc; + ompi_osc_rdma_header_get_t get; + ompi_osc_rdma_header_complete_t complete; + ompi_osc_rdma_header_get_acc_t get_acc; + ompi_osc_rdma_header_cswap_t cswap; + ompi_osc_rdma_header_post_t post; + ompi_osc_rdma_header_lock_t lock; + ompi_osc_rdma_header_lock_ack_t lock_ack; + ompi_osc_rdma_header_unlock_t unlock; + ompi_osc_rdma_header_unlock_ack_t unlock_ack; + ompi_osc_rdma_header_flush_t flush; + ompi_osc_rdma_header_flush_ack_t flush_ack; + ompi_osc_rdma_frag_header_t frag; +}; +typedef union ompi_osc_rdma_header_t ompi_osc_rdma_header_t; #endif /* OMPI_MCA_OSC_RDMA_HDR_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_longreq.c 
b/ompi/mca/osc/rdma/osc_rdma_longreq.c deleted file mode 100644 index 2d183e7c2c..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_longreq.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_rdma_longreq.h" - - - -OBJ_CLASS_INSTANCE(ompi_osc_rdma_longreq_t, opal_free_list_item_t, - NULL, NULL); - diff --git a/ompi/mca/osc/rdma/osc_rdma_longreq.h b/ompi/mca/osc/rdma/osc_rdma_longreq.h deleted file mode 100644 index 8137e0e06a..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_longreq.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OSC_RDMA_LONGREQ_H -#define OSC_RDMA_LONGREQ_H - -#include "osc_rdma.h" - -#include "opal/class/opal_free_list.h" -#include "ompi/datatype/ompi_datatype.h" -#include "ompi/request/request.h" -#include "ompi/op/op.h" - -struct ompi_osc_rdma_longreq_t { - opal_free_list_item_t super; - ompi_request_t *request; - - union { - struct ompi_osc_rdma_sendreq_t *req_sendreq; - struct ompi_osc_rdma_replyreq_t *req_replyreq; - struct ompi_osc_rdma_send_header_t *req_sendhdr; - } req_basereq; - - /* warning - this doesn't always have a sane value */ - ompi_osc_rdma_module_t *req_module; - - /* for long receives, to avoid a longrecvreq type */ - struct ompi_op_t *req_op; - struct ompi_datatype_t *req_datatype; -}; -typedef struct ompi_osc_rdma_longreq_t ompi_osc_rdma_longreq_t; -OBJ_CLASS_DECLARATION(ompi_osc_rdma_longreq_t); - -static inline int -ompi_osc_rdma_longreq_alloc(ompi_osc_rdma_longreq_t **longreq) -{ - opal_free_list_item_t *item; - int ret; - - OPAL_FREE_LIST_GET(&mca_osc_rdma_component.c_longreqs, - item, ret); - - *longreq = (ompi_osc_rdma_longreq_t*) item; - return ret; -} - -static inline int -ompi_osc_rdma_longreq_free(ompi_osc_rdma_longreq_t *longreq) -{ - OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.c_longreqs, - &longreq->super.super); - return OMPI_SUCCESS; -} - -#endif diff --git a/ompi/mca/osc/rdma/osc_rdma_obj_convert.h b/ompi/mca/osc/rdma/osc_rdma_obj_convert.h index 68315fb954..8b32d94ad2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_obj_convert.h +++ b/ompi/mca/osc/rdma/osc_rdma_obj_convert.h @@ -10,6 +10,7 @@ * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -31,11 +32,11 @@ ompi_osc_rdma_windx_to_module(uint32_t windx) ompi_osc_rdma_module_t *module; /* find the right module and dispatch */ - OPAL_THREAD_LOCK(&mca_osc_rdma_component.c_lock); - ret = opal_hash_table_get_value_uint32(&mca_osc_rdma_component.c_modules, + OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock); + ret = opal_hash_table_get_value_uint32(&mca_osc_rdma_component.modules, windx, (void**) (&module)); - OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.c_lock); + OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock); if (OMPI_SUCCESS != ret) { opal_output(0, "Could not translate windx %d to a local MPI_Win instance", windx); diff --git a/ompi/mca/osc/rdma/osc_rdma_passive_target.c b/ompi/mca/osc/rdma/osc_rdma_passive_target.c new file mode 100644 index 0000000000..973aa79424 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_passive_target.c @@ -0,0 +1,919 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "osc_rdma.h" +#include "osc_rdma_header.h" +#include "osc_rdma_data_move.h" +#include "osc_rdma_frag.h" + +#include "mpi.h" +#include "opal/runtime/opal_progress.h" +#include "opal/threads/mutex.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/osc/base/base.h" +#include "opal/include/opal_stdint.h" + +/* target-side tracking of a lock request */ +struct ompi_osc_rdma_pending_lock_t { + opal_list_item_t super; + int peer; + int lock_type; + uint64_t serial_number; +}; +typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t; +OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t, + NULL, NULL); + + +/* origin-side tracking of a lock request */ +struct ompi_osc_rdma_outstanding_lock_t { + opal_list_item_t super; + int target; + int32_t lock_acks_received; + int32_t unlock_acks_received; + int32_t flush_acks_received; + uint64_t serial_number; + int32_t type; +}; +typedef struct ompi_osc_rdma_outstanding_lock_t ompi_osc_rdma_outstanding_lock_t; +OBJ_CLASS_INSTANCE(ompi_osc_rdma_outstanding_lock_t, opal_list_item_t, + NULL, NULL); + +static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module); +static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor, + int lock_type, uint64_t serial_number); + +/** + * Find the first outstanding lock to a target. + * + * @param[in] module - OSC RDMA module + * @param[in] target - Target rank + * + * @returns an outstanding lock on success + * + * This function traverses the outstanding_locks list in the module + * looking for a lock that matches target. The caller must hold the + * module lock. 
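+ * Returns NULL when no matching lock is outstanding. A target of -1 denotes
+ * the lock taken by ompi_osc_rdma_lock_all, so callers such as
+ * ompi_osc_rdma_flush fall back to a -1 lookup when the per-target lookup
+ * fails, e.g.:
+ *
+ *   lock = find_outstanding_lock (module, target);
+ *   if (NULL == lock) {
+ *       lock = find_outstanding_lock (module, -1);
+ *   }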
+ */ +static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock (ompi_osc_rdma_module_t *module, int target) +{ + ompi_osc_rdma_outstanding_lock_t *lock; + + OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) { + if (lock->target == target) { + return lock; + } + } + + return NULL; +} + +static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_rdma_module_t *module, uint64_t serial_number) +{ + ompi_osc_rdma_outstanding_lock_t *lock; + + OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) { + if (lock->serial_number == serial_number) { + return lock; + } + } + + return NULL; +} + +static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock) +{ + const int my_rank = ompi_comm_rank (module->comm); + + if ((MPI_LOCK_SHARED == lock->type && MPI_LOCK_EXCLUSIVE != module->lock_status) || + (MPI_LOCK_EXCLUSIVE == lock->type && 0 == module->lock_status)) { + /* we can aquire the lock immediately */ + module->lock_status = lock->type; + if (MPI_LOCK_SHARED == lock->type) { + module->shared_count++; + } + + lock->lock_acks_received = 1; + } else { + /* queue the lock */ + queue_lock (module, my_rank, lock->type, lock->serial_number); + } + + /* If locking local, can't be non-blocking according to the + standard. We need to wait for the ack here. */ + while (0 == lock->lock_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "local lock aquired")); + + return OMPI_SUCCESS; +} + +static inline void ompi_osc_rdma_unlock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock) +{ + if (!(MPI_LOCK_SHARED == lock->type && 0 == --module->shared_count)) { + module->lock_status = 0; + ompi_osc_activate_next_lock (module); + } + + /* need to ensure we make progress */ + opal_progress(); + + lock->unlock_acks_received++; +} + +static inline int ompi_osc_rdma_lock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock) +{ + ompi_osc_rdma_header_lock_t lock_req; + int ret; + + /* generate a lock request */ + lock_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ; + lock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET; + lock_req.lock_type = lock->type; + lock_req.serial_number = lock->serial_number; + + ret = ompi_osc_rdma_control_send (module, target, &lock_req, sizeof (lock_req)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + /* make sure the request gets sent, so we can start eager sending... 
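+       (the lock ack from the target is what re-enables eager sends to this
+       peer; see ompi_osc_rdma_process_lock_ack)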
*/ + ret = ompi_osc_rdma_frag_flush_target (module, target); + + return ret; +} + +static inline int ompi_osc_rdma_unlock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock) +{ + ompi_osc_rdma_header_unlock_t unlock_req; + + unlock_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ; + unlock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET; + unlock_req.frag_count = module->epoch_outgoing_frag_count[target]; + unlock_req.lock_type = lock->type; + + /* send control message with unlock request and count */ + return ompi_osc_rdma_control_send (module, target, &unlock_req, sizeof (unlock_req)); +} + + + +int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_outstanding_lock_t *lock; + int ret = OMPI_SUCCESS; + + assert(module->epoch_outgoing_frag_count[target] == 0); + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: lock %d %d", target, lock_type)); + + /* delay all eager sends until we've heard back.. */ + OPAL_THREAD_LOCK(&module->lock); + module->passive_eager_send_active[target] = false; + module->passive_target_access_epoch = true; + + /* create lock item */ + lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t); + if (OPAL_UNLIKELY(NULL == lock)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + lock->target = target; + lock->lock_acks_received = 0; + lock->unlock_acks_received = 0; + lock->serial_number = module->lock_serial_number++; + lock->type = lock_type; + opal_list_append(&module->outstanding_locks, &lock->super); + + if (0 == (assert & MPI_MODE_NOCHECK)) { + if (ompi_comm_rank (module->comm) != target) { + ret = ompi_osc_rdma_lock_remote (module, target, lock); + } else { + ret = ompi_osc_rdma_lock_self (module, lock); + } + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto exit_error; + } + } else { + lock->lock_acks_received = 1; + } + + OPAL_THREAD_UNLOCK(&module->lock); + + return OMPI_SUCCESS; + +exit_error: + + OPAL_THREAD_UNLOCK(&module->lock); + opal_list_remove_item(&module->outstanding_locks, &lock->super); + OBJ_RELEASE(lock); + + /* return */ + return ret; +} + + +int ompi_osc_rdma_unlock(int target, ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_outstanding_lock_t *lock = NULL; + int ret = OMPI_SUCCESS; + + OPAL_THREAD_LOCK(&module->lock); + + lock = find_outstanding_lock (module, target); + if (OPAL_UNLIKELY(NULL == lock)) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_unlock: target %d is not locked in window %s", + target, win->w_name)); + OPAL_THREAD_LOCK(&module->lock); + return MPI_ERR_RMA_SYNC; + } + + if (ompi_comm_rank (module->comm) != target) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: unlock %d, lock_acks_received = %d", target, + lock->lock_acks_received)); + + /* wait until ack has arrived from target */ + while (0 == lock->lock_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + ret = ompi_osc_rdma_unlock_remote (module, target, lock); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto cleanup; + } + + /* start all sendreqs to target */ + ret = ompi_osc_rdma_frag_flush_target(module, target); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto cleanup; + } + + /* wait for all the requests and the unlock ack (meaning remote completion) */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count 
|| + 0 == lock->unlock_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_unlock: unlock of %d complete", target)); + } else { + ompi_osc_rdma_unlock_self (module, lock); + } + + module->passive_eager_send_active[target] = false; + module->epoch_outgoing_frag_count[target] = 0; + module->passive_target_access_epoch = false; + + /* delete the lock */ + opal_list_remove_item (&module->outstanding_locks, &lock->super); + OBJ_RELEASE(lock); + + cleanup: + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + int ret, my_rank = ompi_comm_rank (module->comm); + ompi_osc_rdma_outstanding_lock_t *lock; + + /* delay all eager sends until we've heard back.. */ + OPAL_THREAD_LOCK(&module->lock); + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + module->passive_eager_send_active[i] = false; + } + module->passive_target_access_epoch = true; + + /* create lock item */ + lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t); + lock->target = -1; + lock->lock_acks_received = 0; + lock->unlock_acks_received = 0; + lock->serial_number = module->lock_serial_number++; + opal_list_append(&module->outstanding_locks, &lock->super); + + /* if nocheck is not specified, send a lock request to everyone + and wait for the local response */ + if (0 != (assert & MPI_MODE_NOCHECK)) { + ret = ompi_osc_rdma_lock_self (module, lock); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto exit_error; + } + + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + if (my_rank == i) { + continue; + } + + ret = ompi_osc_rdma_lock_remote (module, i, lock); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + opal_list_remove_item(&module->outstanding_locks, &lock->super); + } + } + } else { + lock->lock_acks_received = ompi_comm_size(module->comm); + } + + OPAL_THREAD_UNLOCK(&module->lock); + + return OMPI_SUCCESS; + + exit_error: + + OPAL_THREAD_UNLOCK(&module->lock); + opal_list_remove_item(&module->outstanding_locks, &lock->super); + OBJ_RELEASE(lock); + + /* return */ + return ret; +} + + +int ompi_osc_rdma_unlock_all (struct ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + int my_rank = ompi_comm_rank (module->comm); + ompi_osc_rdma_outstanding_lock_t *lock; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_unlock_all entering...")); + + OPAL_THREAD_LOCK(&module->lock); + + lock = find_outstanding_lock (module, -1); + if (OPAL_UNLIKELY(NULL == lock)) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_unlock_all: not locked in window %s", + win->w_name)); + OPAL_THREAD_LOCK(&module->lock); + return MPI_ERR_RMA_SYNC; + } + + /* wait for lock acks */ + while (ompi_comm_size(module->comm) != lock->lock_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + /* send unlock messages to all of my peers */ + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + if (my_rank == i) { + continue; + } + + ret = ompi_osc_rdma_unlock_remote (module, i, lock); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + goto cleanup; + } + } + + /* unlock myself */ + ompi_osc_rdma_unlock_self (module, lock); + + /* start all sendreqs to target */ + ret = ompi_osc_rdma_frag_flush_all(module); + if (OMPI_SUCCESS != ret) goto cleanup; + + /* wait for all the requests and the unlock ack 
(meaning remote completion) */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count || + ompi_comm_size(module->comm) != lock->unlock_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + /* reset all fragment counters */ + memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0])); + memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * module->passive_eager_send_active[0]); + + opal_list_remove_item (&module->outstanding_locks, &lock->super); + OBJ_RELEASE(lock); + + module->passive_target_access_epoch = false; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_unlock_all complete")); + + cleanup: + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int ompi_osc_rdma_sync (struct ompi_win_t *win) +{ + opal_progress(); + return OMPI_SUCCESS; +} + +static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock, + int target) +{ + ompi_osc_rdma_header_flush_t flush_req; + int peer_count, ret, flush_count; + int my_rank = ompi_comm_rank (module->comm); + + if (-1 == lock->target) { + peer_count = ompi_comm_size(module->comm) - 1; + } else { + peer_count = 1; + } + + /* wait until ack has arrived from target, since we need to be + able to eager send before we can transfer all the data... */ + while (peer_count > lock->lock_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + lock->flush_acks_received = 0; + + flush_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ; + flush_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET; + flush_req.serial_number = lock->serial_number; + + if (-1 == target) { + flush_count = ompi_comm_size(module->comm); + for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + if (i == my_rank) { + continue; + } + + flush_req.frag_count = module->epoch_outgoing_frag_count[i]; + + /* send control message with flush request and count */ + ret = ompi_osc_rdma_control_send (module, i, &flush_req, sizeof (flush_req)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + } else { + flush_req.frag_count = module->epoch_outgoing_frag_count[target]; + flush_count = 1; + /* send control message with flush request and count */ + ret = ompi_osc_rdma_control_send (module, target, &flush_req, sizeof (flush_req)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + + /* start all sendreqs to target */ + ret = ompi_osc_rdma_frag_flush_all (module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + /* wait for all the requests and the flush ack (meaning remote completion) */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count || + flush_count != lock->flush_acks_received) { + opal_condition_wait(&module->cond, &module->lock); + } + + if (-1 == target) { + memset (module->epoch_outgoing_frag_count, 0, peer_count * sizeof (module->epoch_outgoing_frag_count[0])); + } else { + module->epoch_outgoing_frag_count[target] = 0; + } + + return OMPI_SUCCESS; +} + +int ompi_osc_rdma_flush (int target, struct ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_outstanding_lock_t *lock; + int ret; + + assert (0 <= target); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_flush starting...")); + + if (ompi_comm_rank (module->comm) == target) { + OPAL_OUTPUT_VERBOSE((50, 
ompi_osc_base_framework.framework_output, + "calling opal_progress. incoming complete = %d", + module->frag_request->req_complete)); + opal_progress (); + return OMPI_SUCCESS; + } + + OPAL_THREAD_LOCK(&module->lock); + + lock = find_outstanding_lock (module, target); + if (NULL == lock) { + lock = find_outstanding_lock (module, -1); + } + if (OPAL_UNLIKELY(NULL == lock)) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_flush: target %d is not locked in window %s", + target, win->w_name)); + OPAL_THREAD_LOCK(&module->lock); + return MPI_ERR_RMA_SYNC; + } + + ret = ompi_osc_rdma_flush_lock (module, lock, target); + + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int ompi_osc_rdma_flush_all (struct ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + ompi_osc_rdma_outstanding_lock_t *lock; + int ret = OMPI_SUCCESS; + + if (OPAL_UNLIKELY(0 == opal_list_get_size (&module->outstanding_locks))) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_flush_all: no targets are locked in window %s", + win->w_name)); + return MPI_ERR_RMA_SYNC; + } + + OPAL_THREAD_LOCK(&module->lock); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_flush_all entering...")); + + /* flush all locks */ + OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) { + ret = ompi_osc_rdma_flush_lock (module, lock, lock->target); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + break; + } + } + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_flush_all complete")); + + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + int ret; + + OPAL_THREAD_LOCK(&module->lock); + + ret = ompi_osc_rdma_frag_flush_target(module, target); + if (OMPI_SUCCESS != ret) goto cleanup; + + /* wait for all the requests */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count) { + opal_condition_wait(&module->cond, &module->lock); + } + + cleanup: + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + + +int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win) +{ + ompi_osc_rdma_module_t *module = GET_MODULE(win); + int ret = OMPI_SUCCESS; + + OPAL_THREAD_LOCK(&module->lock); + + ret = ompi_osc_rdma_frag_flush_all(module); + if (OMPI_SUCCESS != ret) goto cleanup; + + /* wait for all the requests */ + while (module->outgoing_frag_count != module->outgoing_frag_signal_count) { + opal_condition_wait(&module->cond, &module->lock); + } + + cleanup: + OPAL_THREAD_UNLOCK(&module->lock); + + return ret; +} + +/* target side operation to acknowledge to initiator side that the + lock is now held by the initiator */ +static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor, + uint64_t serial_number) +{ + ompi_osc_rdma_outstanding_lock_t *lock; + + if (ompi_comm_rank (module->comm) != requestor) { + ompi_osc_rdma_header_lock_ack_t lock_ack; + + lock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK; + lock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + lock_ack.source = ompi_comm_rank(module->comm); + lock_ack.windx = ompi_comm_get_cid(module->comm); + lock_ack.serial_number = serial_number; + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: sending lock to %d", requestor)); + + /* we don't want to send any data, since we're the exposure + epoch only, 
so use an unbuffered send */ + return ompi_osc_rdma_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack)); + } + + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: releasing local lock")); + + lock = find_outstanding_lock (module, requestor); + if (NULL == lock) { + lock = find_outstanding_lock (module, -1); + if (OPAL_UNLIKELY(NULL == lock)) { + OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, + "lock could not be located")); + } + } + + lock->lock_acks_received = 1; + opal_condition_broadcast (&module->cond); + + return OMPI_SUCCESS; +} + + +/* target side operation to create a pending lock request for a lock + request that could not be satisfied */ +static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor, + int lock_type, uint64_t serial_number) +{ + ompi_osc_rdma_pending_lock_t *pending = + OBJ_NEW(ompi_osc_rdma_pending_lock_t); + if (NULL == pending) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + pending->peer = requestor; + pending->lock_type = lock_type; + pending->serial_number = serial_number; + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: queueing lock request from %d", requestor)); + + opal_list_append(&module->locks_pending, &pending->super); + + return OMPI_SUCCESS; +} + +static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module) { + /* release any other pending locks we can */ + ompi_osc_rdma_pending_lock_t *pending_lock, *next; + int ret = OMPI_SUCCESS; + + OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending, + ompi_osc_rdma_pending_lock_t) { + if (MPI_LOCK_SHARED == pending_lock->lock_type) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_SHARED to peer %d\n", + pending_lock->peer)); + /* acquire shared lock */ + module->lock_status = MPI_LOCK_SHARED; + module->shared_count++; + ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number); + + opal_list_remove_item (&module->locks_pending, &pending_lock->super); + OBJ_RELEASE(pending_lock); + } else { + if (0 == module->lock_status) { + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_EXCLUSIVE to peer %d\n", + pending_lock->peer)); + /* acquire exclusive lock */ + module->lock_status = MPI_LOCK_EXCLUSIVE; + ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number); + opal_list_remove_item (&module->locks_pending, &pending_lock->super); + OBJ_RELEASE(pending_lock); + } + /* if the lock was acquired (ie, status was 0), then + we're done. If the lock was not acquired, we're + also done, because all the shared locks have to + finish first */ + break; + } + + if (OMPI_SUCCESS != ret) { + break; + } + } + + return ret; +} + + +/* target side function called when the initiator sends a lock + request. Lock will either be activated and acknowledged or + queued. */ +int ompi_osc_rdma_process_lock (ompi_osc_rdma_module_t* module, int source, + ompi_osc_rdma_header_lock_t* lock_header) +{ + int ret; + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_process_lock: processing lock request from %d. 
current lock state = %d, shared_count = %d", + source, module->lock_status, module->shared_count)); + + if (MPI_LOCK_SHARED == lock_header->lock_type) { + if (module->lock_status != MPI_LOCK_EXCLUSIVE) { + /* acquire shared lock */ + module->lock_status = MPI_LOCK_SHARED; + module->shared_count++; + ret = activate_lock(module, source, lock_header->serial_number); + } else { + /* lock not available, queue */ + ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number); + } + } else { + if (0 == module->lock_status) { + /* acquire exclusive lock */ + module->lock_status = MPI_LOCK_EXCLUSIVE; + ret = activate_lock(module, source, lock_header->serial_number); + } else { + /* lock not available, queue */ + ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number); + } + } + + return ret; +} + + +/* initiator-side function called when the target acks the lock + request. */ +void ompi_osc_rdma_process_lock_ack (ompi_osc_rdma_module_t *module, + ompi_osc_rdma_header_lock_ack_t *lock_ack_header) +{ + ompi_osc_rdma_outstanding_lock_t *lock, *next; + + OPAL_LIST_FOREACH_SAFE(lock, next, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) { + if (lock->serial_number == lock_ack_header->serial_number) { + + OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, + "osc rdma: lock ack %d", lock_ack_header->source)); + + lock->lock_acks_received++; + module->passive_eager_send_active[lock_ack_header->source] = true; + return; + } + } + + opal_output(ompi_osc_base_framework.framework_output, + "osc rdma: lock ack %d, %ld for unfindable lock request", + lock_ack_header->source, (unsigned long) lock_ack_header->serial_number); +} + +void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_flush_ack_t *flush_ack_header) { + ompi_osc_rdma_outstanding_lock_t *lock; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_process_unlock_ack: processing flush ack from %d for lock %" PRIu64, + source, flush_ack_header->serial_number)); + + /* NTH: need to verify that this will work as expected */ + lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number); + assert (NULL != lock); + + lock->flush_acks_received++; + + opal_condition_broadcast(&module->cond); +} + +void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header) { + ompi_osc_rdma_outstanding_lock_t *lock; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_process_unlock_ack: processing unlock ack from %d", + source)); + + /* NTH: need to verify that this will work as expected */ + lock = find_outstanding_lock (module, source); + if (NULL == lock) { + lock = find_outstanding_lock(module, -1); + assert (NULL != lock); + } + + lock->unlock_acks_received++; +} + +/** + * Process an unlock request. + * + * @param[in] module - OSC RDMA module + * @param[in] source - Source rank + * @param[in] unlock_header - Incomming unlock header + * + * This functions is the target-side functio for handling an unlock + * request. Once all pending operations from the target are complete + * this functions sends an unlock acknowledgement then attempts to + * active a pending lock if the lock becomes free. 
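+ * If fragments from the source are still outstanding the unlock cannot be
+ * completed yet and OMPI_ERR_WOULD_BLOCK is returned instead of blocking.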
+ */ +int ompi_osc_rdma_process_unlock (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_unlock_t *unlock_header) +{ + ompi_osc_rdma_header_unlock_ack_t unlock_ack; + int ret; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_process_unlock entering (finished %d/%d)...", + module->passive_incoming_frag_count[source], + module->passive_incoming_frag_signal_count[source])); + + /* we cannot block when processing an incomming request */ + if (module->passive_incoming_frag_signal_count[source] != + module->passive_incoming_frag_count[source]) { + return OMPI_ERR_WOULD_BLOCK; + } + + unlock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK; + unlock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + + ret = ompi_osc_rdma_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + module->passive_incoming_frag_signal_count[source] = 0; + module->passive_incoming_frag_count[source] = 0; + + OPAL_THREAD_LOCK(&module->lock); + + if (unlock_header->lock_type == MPI_LOCK_EXCLUSIVE || 0 == --module->shared_count) { + module->lock_status = 0; + + ompi_osc_activate_next_lock (module); + } + + OPAL_THREAD_UNLOCK(&module->lock); + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "osc rdma: finished processing unlock fragment")); + + return ret; +} + +int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source, + ompi_osc_rdma_header_flush_t *flush_header) +{ + ompi_osc_rdma_header_flush_ack_t flush_ack; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "ompi_osc_rdma_process_flush entering (finished %d/%d)...", + module->passive_incoming_frag_count[source], + module->passive_incoming_frag_signal_count[source])); + + /* we cannot block when processing an incomming request */ + if (module->passive_incoming_frag_signal_count[source] != + module->passive_incoming_frag_count[source]) { + return OMPI_ERR_WOULD_BLOCK; + } + + module->passive_incoming_frag_signal_count[source] = 0; + module->passive_incoming_frag_count[source] = 0; + + flush_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK; + flush_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID; + flush_ack.serial_number = flush_header->serial_number; + + return ompi_osc_rdma_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack)); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_pending_frag.h b/ompi/mca/osc/rdma/osc_rdma_pending_frag.h new file mode 100644 index 0000000000..cc282c7d7c --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_pending_frag.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * Pending frags are fragments that have been received on the target, + * but can not yet be processed (because ordering is turned on). + * Because receive memory descriptors are precious resources, rather + * than keeping a descriptor until the right sequence number, we + * instead malloc a buffer (as part of the pending frag) and copy the + * message. + */ + +#ifndef OSC_RDMA_PENDING_FRAG_H +#define OSC_RDMA_PENDING_FRAG_H + +/** Incoming fragment that has to be queued */ +struct ompi_osc_rdma_pending_frag_t { + opal_list_item_t super; + + /* This is a pointer to the top of the fragment (which is always + the header). Save as a header to make the casting a bit less + onerous during sequence number lookups. 
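+       The message contents are copied into storage allocated along with
+       this structure (see ompi_osc_rdma_pending_frag_create below).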
*/ + ompi_osc_rdma_frag_header_t *header; +}; +typedef struct ompi_osc_rdma_pending_frag_t ompi_osc_rdma_pending_frag_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_frag_t); + +/* + * Note: module lock must be held during this operation + */ +static inline ompi_osc_rdma_pending_frag_t* +ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module, + void *ptr, + size_t size) +{ + size_t total_size = sizeof(ompi_osc_rdma_pending_frag_t) + size; + ompi_osc_rdma_pending_frag_t *ret = + (ompi_osc_rdma_pending_frag_t*) malloc(total_size); + if (NULL == ret) return NULL; + + OBJ_CONSTRUCT(&ret, ompi_osc_rdma_pending_frag_t); + memcpy(ret->header, ptr, size); + + return ret; +} + + +/* + * Note: module lock must be held for this operation + */ +static inline int +ompi_osc_rdma_pending_frag_destroy(ompi_osc_rdma_module_t *module, + ompi_osc_rdma_pending_frag_t* frag) +{ + OBJ_DESTRUCT(&frag); + free(frag); + + return OMPI_SUCCESS; +} + +#endif diff --git a/ompi/mca/osc/rdma/osc_rdma_replyreq.c b/ompi/mca/osc/rdma/osc_rdma_replyreq.c deleted file mode 100644 index 1cf6464354..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_replyreq.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_rdma_replyreq.h" - -#include "opal/class/opal_list.h" -#include "opal/datatype/opal_convertor.h" - -int -ompi_osc_rdma_replyreq_alloc_init(ompi_osc_rdma_module_t *module, - int origin, - ompi_ptr_t origin_request, - OPAL_PTRDIFF_TYPE target_displacement, - int target_count, - struct ompi_datatype_t *datatype, - ompi_osc_rdma_replyreq_t **replyreq) -{ - int ret; - void *target_addr = (unsigned char*) module->m_win->w_baseptr + - (target_displacement * module->m_win->w_disp_unit); - - - /* allocate a replyreq */ - ret = ompi_osc_rdma_replyreq_alloc(module, - origin, - replyreq); - if (OMPI_SUCCESS != ret) return ret; - - /* initialize local side of replyreq */ - ret = ompi_osc_rdma_replyreq_init_target(*replyreq, - target_addr, - target_count, - datatype); - if (OMPI_SUCCESS != ret) { - ompi_osc_rdma_replyreq_free(*replyreq); - return ret; - } - - /* initialize remote side of replyreq */ - ret = ompi_osc_rdma_replyreq_init_origin(*replyreq, - origin_request); - if (OMPI_SUCCESS != ret) { - ompi_osc_rdma_replyreq_free(*replyreq); - return ret; - } - - return OMPI_SUCCESS; -} - - -static void ompi_osc_rdma_replyreq_construct(ompi_osc_rdma_replyreq_t *replyreq) -{ - OBJ_CONSTRUCT(&(replyreq->rep_target_convertor), opal_convertor_t); -} - -static void ompi_osc_rdma_replyreq_destruct(ompi_osc_rdma_replyreq_t *replyreq) -{ - OBJ_DESTRUCT(&(replyreq->rep_target_convertor)); -} - - -OBJ_CLASS_INSTANCE(ompi_osc_rdma_replyreq_t, opal_list_item_t, - ompi_osc_rdma_replyreq_construct, - ompi_osc_rdma_replyreq_destruct); diff --git a/ompi/mca/osc/rdma/osc_rdma_replyreq.h b/ompi/mca/osc/rdma/osc_rdma_replyreq.h deleted file mode 100644 index 9dd39dea45..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_replyreq.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 
2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_OSC_RDMA_REPLYREQ_H -#define OMPI_OSC_RDMA_REPLYREQ_H - -#include "osc_rdma.h" -#include "osc_rdma_longreq.h" - -#include "opal/class/opal_list.h" -#include "ompi/datatype/ompi_datatype.h" -#include "opal/datatype/opal_convertor.h" -#include "ompi/communicator/communicator.h" -#include "ompi/proc/proc.h" -#include "ompi/memchecker.h" - - -struct ompi_osc_rdma_replyreq_t { - opal_list_item_t super; - - /** pointer to the module that created the replyreq */ - ompi_osc_rdma_module_t *rep_module; - - /** Datatype for the target side of the operation */ - struct ompi_datatype_t *rep_target_datatype; - /** Convertor for the target. Always setup for send. */ - opal_convertor_t rep_target_convertor; - /** packed size of message on the target side */ - size_t rep_target_bytes_packed; - - /** rank in module's communicator for origin of operation */ - int rep_origin_rank; - /** pointer to the proc structure for the origin of the operation */ - ompi_proc_t *rep_origin_proc; - - ompi_ptr_t rep_origin_sendreq; -}; -typedef struct ompi_osc_rdma_replyreq_t ompi_osc_rdma_replyreq_t; -OBJ_CLASS_DECLARATION(ompi_osc_rdma_replyreq_t); - - -/** allocate and populate a replyreq structure. datatype is - RETAINed for the life of the replyreq */ -int -ompi_osc_rdma_replyreq_alloc_init(ompi_osc_rdma_module_t *module, - int origin, - ompi_ptr_t origin_request, - OPAL_PTRDIFF_TYPE target_displacement, - int target_count, - struct ompi_datatype_t *datatype, - ompi_osc_rdma_replyreq_t **replyreq); - - -static inline int -ompi_osc_rdma_replyreq_alloc(ompi_osc_rdma_module_t *module, - int origin_rank, - ompi_osc_rdma_replyreq_t **replyreq) -{ - int ret; - opal_free_list_item_t *item; - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, origin_rank ); - - /* BWB - FIX ME - is this really the right return code? 
*/ - if (NULL == proc) return OMPI_ERR_OUT_OF_RESOURCE; - - OPAL_FREE_LIST_GET(&mca_osc_rdma_component.c_replyreqs, - item, ret); - if (OMPI_SUCCESS != ret) return ret; - *replyreq = (ompi_osc_rdma_replyreq_t*) item; - - (*replyreq)->rep_module = module; - (*replyreq)->rep_origin_rank = origin_rank; - (*replyreq)->rep_origin_proc = proc; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_rdma_replyreq_init_target(ompi_osc_rdma_replyreq_t *replyreq, - void *target_addr, - int target_count, - struct ompi_datatype_t *target_dt) -{ - OBJ_RETAIN(target_dt); - replyreq->rep_target_datatype = target_dt; - - opal_convertor_copy_and_prepare_for_send(replyreq->rep_origin_proc->proc_convertor, - &(target_dt->super), - target_count, - target_addr, - 0, - &(replyreq->rep_target_convertor)); - opal_convertor_get_packed_size(&replyreq->rep_target_convertor, - &replyreq->rep_target_bytes_packed); - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_rdma_replyreq_init_origin(ompi_osc_rdma_replyreq_t *replyreq, - ompi_ptr_t origin_request) -{ - replyreq->rep_origin_sendreq = origin_request; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_rdma_replyreq_free(ompi_osc_rdma_replyreq_t *replyreq) -{ - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &replyreq->rep_target_convertor); - ); - opal_convertor_cleanup(&replyreq->rep_target_convertor); - - OBJ_RELEASE(replyreq->rep_target_datatype); - - OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.c_replyreqs, - (opal_list_item_t*) replyreq); - - return OMPI_SUCCESS; -} - -#endif /* OMPI_OSC_RDMA_REPLYREQ_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_request.c b/ompi/mca/osc/rdma/osc_rdma_request.c new file mode 100644 index 0000000000..1fc9b3f7f4 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_request.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/request/request.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_rdma.h" +#include "osc_rdma_request.h" + +static int +request_cancel(struct ompi_request_t *request, int complete) +{ + return MPI_ERR_REQUEST; +} + +static int +request_free(struct ompi_request_t **ompi_req) +{ + ompi_osc_rdma_request_t *request = + (ompi_osc_rdma_request_t*) *ompi_req; + + if (true != request->super.req_complete) { + return MPI_ERR_REQUEST; + } + + OMPI_OSC_RDMA_REQUEST_RETURN(request); + + *ompi_req = MPI_REQUEST_NULL; + + return OMPI_SUCCESS; +} + +static +void +request_construct(ompi_osc_rdma_request_t *request) +{ + request->super.req_type = OMPI_REQUEST_WIN; + request->super.req_status._cancelled = 0; + request->super.req_free = request_free; + request->super.req_cancel = request_cancel; +} + +OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t, + ompi_request_t, + request_construct, + NULL); diff --git a/ompi/mca/osc/rdma/osc_rdma_request.h b/ompi/mca/osc/rdma/osc_rdma_request.h new file mode 100644 index 0000000000..ebf17aeb31 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_request.h @@ -0,0 +1,74 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_OSC_RDMA_REQUEST_H +#define OMPI_OSC_RDMA_REQUEST_H + +#include "osc_rdma.h" + +#include "ompi/request/request.h" +#include "opal/util/output.h" + +struct ompi_osc_rdma_request_t { + ompi_request_t super; + + int type; + void *origin_addr; + int origin_count; + struct ompi_datatype_t *origin_dt; + ompi_osc_rdma_module_t* module; + int outstanding_requests; + bool internal; +}; +typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); + +/* REQUEST_ALLOC is only called from "top-level" functions (rdma_rput, + rdma_rget, etc.), so it's ok to spin here... */ +#define OMPI_OSC_RDMA_REQUEST_ALLOC(win, req) \ + do { \ + ompi_free_list_item_t *item; \ + do { \ + OMPI_FREE_LIST_GET_MT(&mca_osc_rdma_component.requests, item); \ + if (NULL == item) { \ + opal_progress(); \ + } \ + } while (NULL == item); \ + req = (ompi_osc_rdma_request_t*) item; \ + OMPI_REQUEST_INIT(&req->super, false); \ + req->super.req_mpi_object.win = win; \ + req->super.req_complete = false; \ + req->super.req_state = OMPI_REQUEST_ACTIVE; \ + req->module = GET_MODULE(win); \ + req->internal = false; \ + } while (0) + +#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \ + do { \ + OMPI_REQUEST_FINI(&(req)->super); \ + OMPI_FREE_LIST_RETURN_MT(&mca_osc_rdma_component.requests, \ + (ompi_free_list_item_t *) (req)); \ + } while (0) + +static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error) +{ + if (!request->internal) { + request->super.req_status.MPI_ERROR = mpi_error; + + /* mark the request complete at the mpi level */ + ompi_request_complete (&request->super, true); + } else { + OMPI_OSC_RDMA_REQUEST_RETURN (request); + } +} + +#endif /* OMPI_OSC_RDMA_REQUEST_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_sendreq.c b/ompi/mca/osc/rdma/osc_rdma_sendreq.c deleted file mode 100644 index c47a93c7f9..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_sendreq.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
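/*
 * Illustrative sketch, not part of the patch: the "spin until a request is
 * available" allocation pattern used by OMPI_OSC_RDMA_REQUEST_ALLOC above.
 * Rather than failing when the pool is empty, the caller keeps driving
 * progress so that outstanding requests can complete and be recycled.
 * Everything below (the toy pool type and drive_progress()) is a
 * hypothetical stand-in, not Open MPI API.
 */
#include <stddef.h>

#define TOY_POOL_SIZE 8

typedef struct { int in_use; } toy_request_t;

static toy_request_t toy_pool[TOY_POOL_SIZE];

static void drive_progress(void) { /* stand-in for opal_progress() */ }

static toy_request_t *toy_pool_get(void)
{
    for (;;) {
        for (size_t i = 0; i < TOY_POOL_SIZE; ++i) {
            if (!toy_pool[i].in_use) {
                toy_pool[i].in_use = 1;
                return &toy_pool[i];
            }
        }
        drive_progress();   /* let pending completions return requests */
    }
}

static void toy_pool_return(toy_request_t *req) { req->in_use = 0; }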
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_rdma_sendreq.h" - -#include "opal/datatype/opal_convertor.h" - - -int -ompi_osc_rdma_sendreq_alloc_init(ompi_osc_rdma_req_type_t req_type, - void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_dt, - ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t **sendreq) -{ - int ret; - - /* allocate a sendreq */ - ret = ompi_osc_rdma_sendreq_alloc(module, target, - sendreq); - if (OMPI_SUCCESS != ret) return ret; - - /* initialize local side of sendreq */ - ret = ompi_osc_rdma_sendreq_init_origin(*sendreq, - req_type, - origin_addr, - origin_count, - origin_dt); - if (OMPI_SUCCESS != ret) { - ompi_osc_rdma_sendreq_free(*sendreq); - return ret; - } - - /* initialize remote side of sendreq */ - ret = ompi_osc_rdma_sendreq_init_target(*sendreq, - target_disp, - target_count, - target_dt); - if (OMPI_SUCCESS != ret) { - ompi_osc_rdma_sendreq_free(*sendreq); - return ret; - } - - return OMPI_SUCCESS; -} - - -static void ompi_osc_rdma_sendreq_construct(ompi_osc_rdma_sendreq_t *req) -{ - req->super.req_type = OMPI_REQUEST_WIN; - req->super.req_free = NULL; - req->super.req_cancel = NULL; - OBJ_CONSTRUCT(&(req->req_origin_convertor), opal_convertor_t); -} - - -static void ompi_osc_rdma_sendreq_destruct(ompi_osc_rdma_sendreq_t *req) -{ - OBJ_DESTRUCT(&(req->req_origin_convertor)); -} - - -OBJ_CLASS_INSTANCE(ompi_osc_rdma_sendreq_t, ompi_request_t, - ompi_osc_rdma_sendreq_construct, - ompi_osc_rdma_sendreq_destruct); diff --git a/ompi/mca/osc/rdma/osc_rdma_sendreq.h b/ompi/mca/osc/rdma/osc_rdma_sendreq.h deleted file mode 100644 index f23a603119..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_sendreq.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OMPI_OSC_RDMA_SENDREQ_H -#define OMPI_OSC_RDMA_SENDREQ_H - -#include "osc_rdma.h" -#include "osc_rdma_longreq.h" - -#include "opal/class/opal_list.h" -#include "ompi/datatype/ompi_datatype.h" -#include "opal/datatype/opal_convertor.h" -#include "ompi/communicator/communicator.h" -#include "ompi/proc/proc.h" -#include "ompi/memchecker.h" - -typedef enum { - OMPI_OSC_RDMA_GET, - OMPI_OSC_RDMA_ACC, - OMPI_OSC_RDMA_PUT -} ompi_osc_rdma_req_type_t; - - -struct ompi_osc_rdma_sendreq_t { - ompi_request_t super; - - int req_refcount; - - /** type of sendreq (from ompi_osc_rdma_req_type_t) */ - ompi_osc_rdma_req_type_t req_type; - /** pointer to the module that created the sendreq */ - ompi_osc_rdma_module_t *req_module; - - /** Datatype for the origin side of the operation */ - struct ompi_datatype_t *req_origin_datatype; - /** Convertor for the origin side of the operation. 
Setup for - either send (Put / Accumulate) or receive (Get) */ - opal_convertor_t req_origin_convertor; - /** packed size of message on the origin side */ - size_t req_origin_bytes_packed; - - /** rank in module's communicator for target of operation */ - int req_target_rank; - /** pointer to the proc structure for the target of the operation */ - ompi_proc_t *req_target_proc; - - /** displacement on target */ - OPAL_PTRDIFF_TYPE req_target_disp; - /** datatype count on target */ - int req_target_count; - /** datatype on target */ - struct ompi_datatype_t *req_target_datatype; - - /** op index on the target */ - int req_op_id; - - uint8_t remote_segs[MCA_BTL_SEG_MAX_SIZE]; -}; -typedef struct ompi_osc_rdma_sendreq_t ompi_osc_rdma_sendreq_t; -OBJ_CLASS_DECLARATION(ompi_osc_rdma_sendreq_t); - - -/** allocate and populate a sendreq structure. Both datatypes are - RETAINed for the life of the sendreq */ -int -ompi_osc_rdma_sendreq_alloc_init(ompi_osc_rdma_req_type_t req_type, - void *origin_addr, int origin_count, - struct ompi_datatype_t *origin_dt, - int target, OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_datatype, - ompi_osc_rdma_module_t *module, - ompi_osc_rdma_sendreq_t **sendreq); - -static inline int -ompi_osc_rdma_sendreq_alloc(ompi_osc_rdma_module_t *module, - int target_rank, - ompi_osc_rdma_sendreq_t **sendreq) -{ - int ret; - opal_free_list_item_t *item; - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, target_rank ); - - /* BWB - FIX ME - is this really the right return code? */ - if (NULL == proc) return OMPI_ERR_OUT_OF_RESOURCE; - - OPAL_FREE_LIST_GET(&mca_osc_rdma_component.c_sendreqs, - item, ret); - if (OMPI_SUCCESS != ret) return ret; - *sendreq = (ompi_osc_rdma_sendreq_t*) item; - - (*sendreq)->req_module = module; - (*sendreq)->req_target_rank = target_rank; - (*sendreq)->req_target_proc = proc; - (*sendreq)->req_refcount = 1; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_rdma_sendreq_init_origin(ompi_osc_rdma_sendreq_t *sendreq, - ompi_osc_rdma_req_type_t req_type, - void *origin_addr, - int origin_count, - struct ompi_datatype_t *origin_dt) -{ - OBJ_RETAIN(origin_dt); - sendreq->req_origin_datatype = origin_dt; - sendreq->req_type = req_type; - - if (req_type != OMPI_OSC_RDMA_GET) { - opal_convertor_copy_and_prepare_for_send(sendreq->req_target_proc->proc_convertor, - &(origin_dt->super), - origin_count, - origin_addr, - 0, - &(sendreq->req_origin_convertor)); - opal_convertor_get_packed_size(&sendreq->req_origin_convertor, - &sendreq->req_origin_bytes_packed); - } else { - opal_convertor_copy_and_prepare_for_recv(sendreq->req_target_proc->proc_convertor, - &(origin_dt->super), - origin_count, - origin_addr, - 0, - &(sendreq->req_origin_convertor)); - opal_convertor_get_packed_size(&sendreq->req_origin_convertor, - &sendreq->req_origin_bytes_packed); - } - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_rdma_sendreq_init_target(ompi_osc_rdma_sendreq_t *sendreq, - OPAL_PTRDIFF_TYPE target_disp, - int target_count, - struct ompi_datatype_t *target_datatype) -{ - OBJ_RETAIN(target_datatype); - - sendreq->req_target_disp = target_disp; - sendreq->req_target_count = target_count; - sendreq->req_target_datatype = target_datatype; - - return OMPI_SUCCESS; -} - - -static inline int -ompi_osc_rdma_sendreq_free(ompi_osc_rdma_sendreq_t *sendreq) -{ - if (0 == (--sendreq->req_refcount)) { - MEMCHECKER( - memchecker_convertor_call(&opal_memchecker_base_mem_defined, - &sendreq->req_origin_convertor); - ); - 
opal_convertor_cleanup(&sendreq->req_origin_convertor); - - OBJ_RELEASE(sendreq->req_target_datatype); - OBJ_RELEASE(sendreq->req_origin_datatype); - - OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.c_sendreqs, - (opal_list_item_t*) sendreq); - } - - return OMPI_SUCCESS; -} - -#endif /* OMPI_OSC_RDMA_SENDREQ_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_sync.c b/ompi/mca/osc/rdma/osc_rdma_sync.c deleted file mode 100644 index 21052aa3c9..0000000000 --- a/ompi/mca/osc/rdma/osc_rdma_sync.c +++ /dev/null @@ -1,788 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2010 IBM Corporation. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "osc_rdma.h" -#include "osc_rdma_sendreq.h" -#include "osc_rdma_longreq.h" -#include "osc_rdma_header.h" -#include "osc_rdma_data_move.h" - -#include "mpi.h" -#include "opal/runtime/opal_progress.h" -#include "opal/threads/mutex.h" -#include "ompi/communicator/communicator.h" -#include "ompi/mca/osc/base/base.h" - - -/* Must hold module's lock before calling... */ -static inline void -ompi_osc_rdma_flip_sendreqs(ompi_osc_rdma_module_t *module) -{ - unsigned int *tmp; - - tmp = module->m_copy_num_pending_sendreqs; - module->m_copy_num_pending_sendreqs = - module->m_num_pending_sendreqs; - module->m_num_pending_sendreqs = tmp; - memset(module->m_num_pending_sendreqs, 0, - sizeof(unsigned int) * ompi_comm_size(module->m_comm)); - - /* Copy in all the pending requests */ - opal_list_join(&module->m_copy_pending_sendreqs, - opal_list_get_end(&module->m_copy_pending_sendreqs), - &module->m_pending_sendreqs); -} - - -int -ompi_osc_rdma_module_fence(int assert, ompi_win_t *win) -{ - unsigned int incoming_reqs; - int ret = OMPI_SUCCESS, i, len, started_send; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - int num_outgoing = 0; - - if (0 != (assert & MPI_MODE_NOPRECEDE)) { - /* check that the user didn't lie to us - since NOPRECEDED - must be specified by all processes if it is specified by - any process, if we see this it is safe to assume that there - are no pending operations anywhere needed to close out this - epoch. */ - if (0 != opal_list_get_size(&(module->m_pending_sendreqs))) { - return MPI_ERR_RMA_SYNC; - } - - } else { - /* "atomically" copy all the data we're going to be modifying - into the copy... */ - OPAL_THREAD_LOCK(&module->m_lock); - ompi_osc_rdma_flip_sendreqs(module); - OPAL_THREAD_UNLOCK(&module->m_lock); - - num_outgoing = opal_list_get_size(&(module->m_copy_pending_sendreqs)); - - /* find out how much data everyone is going to send us. Need - to have the lock during this period so that we have a sane - view of the number of sendreqs */ - ret = module->m_comm-> - c_coll.coll_reduce_scatter(module->m_copy_num_pending_sendreqs, - &incoming_reqs, - module->m_fence_coll_counts, - MPI_UNSIGNED, - MPI_SUM, - module->m_comm, - module->m_comm->c_coll.coll_reduce_scatter_module); - - if (OMPI_SUCCESS != ret) { - /* put the stupid data back for the user. 
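/*
 * Illustrative sketch, not part of the patch: how the (removed) fence code
 * above learns how many incoming requests to expect.  Each rank contributes
 * a per-peer count of the operations it queued; a reduce-scatter then hands
 * every rank the sum of what all other ranks are about to send it.  This
 * standalone MPI example assumes one count per rank.
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* ops_to_peer[i]: how many RMA operations this rank queued for peer i */
    unsigned int *ops_to_peer = calloc(size, sizeof(unsigned int));
    int *recvcounts = malloc(size * sizeof(int));
    for (int i = 0; i < size; ++i) {
        ops_to_peer[i] = (unsigned int) ((rank + i) % 3);  /* dummy workload */
        recvcounts[i] = 1;                                 /* one value per rank */
    }

    unsigned int incoming = 0;
    MPI_Reduce_scatter(ops_to_peer, &incoming, recvcounts,
                       MPI_UNSIGNED, MPI_SUM, MPI_COMM_WORLD);

    printf("rank %d expects %u incoming requests\n", rank, incoming);

    free(recvcounts);
    free(ops_to_peer);
    MPI_Finalize();
    return 0;
}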
This is not - cheap, but the user lost his data if we don't. */ - OPAL_THREAD_LOCK(&(module->m_lock)); - opal_list_join(&module->m_pending_sendreqs, - opal_list_get_end(&module->m_pending_sendreqs), - &module->m_copy_pending_sendreqs); - - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - module->m_num_pending_sendreqs[i] += - module->m_copy_num_pending_sendreqs[i]; - } - - OPAL_THREAD_UNLOCK(&(module->m_lock)); - return ret; - } - - /* try to start all the requests. We've copied everything we - need out of pending_sendreqs, so don't need the lock - here */ - len = opal_list_get_size(&(module->m_copy_pending_sendreqs)); - started_send = 0; - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "fence: trying to start %d reqs", - len)); - for (i = 0 ; i < len ; ++i) { - ompi_osc_rdma_sendreq_t *req = (ompi_osc_rdma_sendreq_t*) - opal_list_remove_first(&(module->m_copy_pending_sendreqs)); - - ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_SUCCESS != ret) { - opal_list_append(&(module->m_copy_pending_sendreqs), (opal_list_item_t*)req); - } else { - started_send = 1; - } - } - - /* we need to start at least one send, so that the callback - will restart the rest. */ - while (0 == started_send && len != 0) { - opal_progress(); - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "fence: restarting %d reqs", len)); - len = opal_list_get_size(&(module->m_copy_pending_sendreqs)); - for (i = 0 ; i < len ; ++i) { - ompi_osc_rdma_sendreq_t *req = (ompi_osc_rdma_sendreq_t*) - opal_list_remove_first(&(module->m_copy_pending_sendreqs)); - - ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - opal_list_append(&(module->m_copy_pending_sendreqs), (opal_list_item_t*)req); - } else if (OMPI_SUCCESS != ret) { - return ret; - } else { - started_send = 1; - } - } - } - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "fence: done with initial start")); - - if (module->m_use_rdma) { - if (module->m_rdma_wait_completion) { - OPAL_THREAD_LOCK(&module->m_lock); - while (module->m_rdma_num_pending != 0) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - OPAL_THREAD_UNLOCK(&module->m_lock); - } - - for (i = 0 ; i < ompi_comm_size(module->m_comm) ; ++i) { - int j; - for (j = 0 ; j < module->m_peer_info[i].peer_num_btls ; ++j) { - if (module->m_peer_info[i].peer_btls[j].num_sent > 0) { - ret = ompi_osc_rdma_rdma_ack_send(module, - ompi_comm_peer_lookup(module->m_comm, i), - &(module->m_peer_info[i].peer_btls[j])); - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { - module->m_peer_info[i].peer_btls[j].num_sent = 0; - } else { - return ret; - } - } - } - } - } - - ompi_osc_rdma_flush(module); - - OPAL_THREAD_LOCK(&module->m_lock); - /* if some requests couldn't be started, push into the - "queued" list, where we will try to restart them later. 
*/ - if (opal_list_get_size(&module->m_copy_pending_sendreqs)) { - opal_list_join(&module->m_queued_sendreqs, - opal_list_get_end(&module->m_queued_sendreqs), - &module->m_copy_pending_sendreqs); - } - - /* possible we've already received a couple in messages, so - atomicall add however many we're going to wait for */ - module->m_num_pending_in += incoming_reqs; - module->m_num_pending_out += num_outgoing; - - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "fence: waiting on %d in and %d out, now %d, %d", - incoming_reqs, - num_outgoing, - module->m_num_pending_in, - module->m_num_pending_out)); - - /* now we know how many things we're waiting for - wait for them... */ - while (module->m_num_pending_in > 0 || - 0 != module->m_num_pending_out) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - OPAL_THREAD_UNLOCK(&module->m_lock); - } - - /* all transfers are done - back to the real world we go */ - if (0 == (assert & MPI_MODE_NOSUCCEED)) { - ompi_win_set_mode(win, OMPI_WIN_FENCE); - } else { - ompi_win_set_mode(win, 0); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_module_start(ompi_group_t *group, - int assert, - ompi_win_t *win) -{ - int i, ret = OMPI_SUCCESS; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - int32_t count; - - OBJ_RETAIN(group); - ompi_group_increment_proc_count(group); - - module->m_eager_send_active = false; - - OPAL_THREAD_LOCK(&module->m_lock); - - if (NULL != module->m_sc_group) { - OPAL_THREAD_UNLOCK(&module->m_lock); - ret = MPI_ERR_RMA_SYNC; - goto clean; - } - module->m_sc_group = group; - - /* possible we've already received a couple in messages, so - add however many we're going to wait for */ - count = (module->m_num_post_msgs += ompi_group_size(module->m_sc_group)); - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - memset(module->m_sc_remote_active_ranks, 0, - sizeof(bool) * ompi_comm_size(module->m_comm)); - - /* for each process in the specified group, find it's rank in our - communicator, store those indexes, and set the true / false in - the active ranks table */ - for (i = 0 ; i < ompi_group_size(group) ; i++) { - int comm_rank = -1, j; - - /* find the rank in the communicator associated with this windows */ - for (j = 0 ; j < ompi_comm_size(module->m_comm) ; ++j) { - if (ompi_group_peer_lookup(module->m_sc_group, i) == - ompi_comm_peer_lookup(module->m_comm, j)) { - comm_rank = j; - break; - } - } - if (comm_rank == -1) { - ret = MPI_ERR_RMA_SYNC; - goto clean; - } - - module->m_sc_remote_active_ranks[comm_rank] = true; - module->m_sc_remote_ranks[i] = comm_rank; - } - - /* Set our mode to access w/ start */ - ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_STARTED); - - if (count == 0) { - module->m_eager_send_active = module->m_eager_send_ok; - } - - return OMPI_SUCCESS; - - clean: - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - return ret; -} - - -int -ompi_osc_rdma_module_complete(ompi_win_t *win) -{ - int i, j; - int ret = OMPI_SUCCESS; - ompi_group_t *group; - opal_list_item_t *item; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - - /* wait for all the post messages */ - OPAL_THREAD_LOCK(&module->m_lock); - while (0 != module->m_num_post_msgs) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - - ompi_osc_rdma_flip_sendreqs(module); - - /* for each process in group, send a control message with number - of updates coming, then start all the requests */ - module->m_num_pending_out += - (int32_t) 
opal_list_get_size(&module->m_copy_pending_sendreqs); - OPAL_THREAD_UNLOCK(&module->m_lock); - - for (i = 0 ; i < ompi_group_size(module->m_sc_group) ; ++i) { - int comm_rank = module->m_sc_remote_ranks[i]; - if (module->m_use_rdma) { - if (module->m_rdma_wait_completion) { - OPAL_THREAD_LOCK(&module->m_lock); - while (module->m_rdma_num_pending != 0) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - OPAL_THREAD_UNLOCK(&module->m_lock); - } - - for (j = 0 ; j < module->m_peer_info[comm_rank].peer_num_btls ; ++j) { - if (module->m_peer_info[comm_rank].peer_btls[j].num_sent > 0) { - ret = ompi_osc_rdma_rdma_ack_send(module, - ompi_group_peer_lookup(module->m_sc_group, i), - &(module->m_peer_info[comm_rank].peer_btls[j])); - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { - module->m_peer_info[comm_rank].peer_btls[j].num_sent = 0; - } else { - return ret; - } - } - } - } - ret = ompi_osc_rdma_control_send(module, - ompi_group_peer_lookup(module->m_sc_group, i), - OMPI_OSC_RDMA_HDR_COMPLETE, - module->m_copy_num_pending_sendreqs[comm_rank], - 0); - assert(ret == OMPI_SUCCESS); - } - - /* try to start all the requests. We've copied everything we - need out of pending_sendreqs, so don't need the lock - here */ - while (NULL != - (item = opal_list_remove_first(&(module->m_copy_pending_sendreqs)))) { - ompi_osc_rdma_sendreq_t *req = - (ompi_osc_rdma_sendreq_t*) item; - - ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - opal_list_append(&(module->m_copy_pending_sendreqs), item); - break; - } else if (OMPI_SUCCESS != ret) { - return ret; - } - } - - ompi_osc_rdma_flush(module); - - OPAL_THREAD_LOCK(&module->m_lock); - /* if some requests couldn't be started, push into the - "queued" list, where we will try to restart them later. 
*/ - if (opal_list_get_size(&module->m_copy_pending_sendreqs)) { - opal_list_join(&module->m_queued_sendreqs, - opal_list_get_end(&module->m_queued_sendreqs), - &module->m_copy_pending_sendreqs); - } - - /* wait for all the requests */ - while (0 != module->m_num_pending_out) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - - group = module->m_sc_group; - module->m_sc_group = NULL; - - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - /* remove WIN_POSTED from our mode */ - ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_STARTED); - - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - - return ret; -} - - -int -ompi_osc_rdma_module_post(ompi_group_t *group, - int assert, - ompi_win_t *win) -{ - int i; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - - OBJ_RETAIN(group); - ompi_group_increment_proc_count(group); - - OPAL_THREAD_LOCK(&(module->m_lock)); - assert(NULL == module->m_pw_group); - module->m_pw_group = group; - - /* Set our mode to expose w/ post */ - ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_append_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); - - /* list how many complete counters we're still waiting on */ - module->m_num_complete_msgs += - ompi_group_size(module->m_pw_group); - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - /* send a hello counter to everyone in group */ - for (i = 0 ; i < ompi_group_size(module->m_pw_group) ; ++i) { - ompi_osc_rdma_control_send(module, - ompi_group_peer_lookup(group, i), - OMPI_OSC_RDMA_HDR_POST, 1, 0); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_module_wait(ompi_win_t *win) -{ - ompi_group_t *group; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - - OPAL_THREAD_LOCK(&module->m_lock); - while (0 != (module->m_num_pending_in) || - 0 != (module->m_num_complete_msgs)) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - - group = module->m_pw_group; - module->m_pw_group = NULL; - OPAL_THREAD_UNLOCK(&module->m_lock); - - ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); - - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_module_test(ompi_win_t *win, - int *flag) -{ - ompi_group_t *group; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - -#if !OMPI_ENABLE_PROGRESS_THREADS - opal_progress(); -#endif - - if (0 != (module->m_num_pending_in) || - 0 != (module->m_num_complete_msgs)) { - *flag = 0; - return OMPI_SUCCESS; - } - - *flag = 1; - - OPAL_THREAD_LOCK(&(module->m_lock)); - group = module->m_pw_group; - module->m_pw_group = NULL; - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); - - ompi_group_decrement_proc_count(group); - OBJ_RELEASE(group); - - return OMPI_SUCCESS; -} - - -struct ompi_osc_rdma_pending_lock_t { - opal_list_item_t super; - ompi_proc_t *proc; - int32_t lock_type; -}; -typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t; -OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t, - NULL, NULL); - - -int -ompi_osc_rdma_module_lock(int lock_type, - int target, - int assert, - ompi_win_t *win) -{ - ompi_osc_rdma_module_t *module = GET_MODULE(win); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, target ); - - assert(lock_type != 0); - - /* set our mode on the window */ - ompi_win_remove_mode(win, OMPI_WIN_FENCE); - ompi_win_append_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); - - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "%d 
sending lock request to %d", - ompi_comm_rank(module->m_comm), target)); - /* generate a lock request */ - ompi_osc_rdma_control_send(module, - proc, - OMPI_OSC_RDMA_HDR_LOCK_REQ, - ompi_comm_rank(module->m_comm), - lock_type); - - module->m_eager_send_active = false; - - if (ompi_comm_rank(module->m_comm) == target) { - /* If we're trying to lock locally, have to wait to actually - acquire the lock */ - OPAL_THREAD_LOCK(&module->m_lock); - while (module->m_lock_received_ack == 0) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - OPAL_THREAD_UNLOCK(&module->m_lock); - } - - /* return */ - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_module_unlock(int target, - ompi_win_t *win) -{ - int32_t out_count; - opal_list_item_t *item; - int ret; - ompi_osc_rdma_module_t *module = GET_MODULE(win); - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, target ); - - OPAL_THREAD_LOCK(&module->m_lock); - while (0 == module->m_lock_received_ack) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - - module->m_lock_received_ack -= 1; - - /* start all the requests */ - ompi_osc_rdma_flip_sendreqs(module); - - /* try to start all the requests. We've copied everything we need - out of pending_sendreqs, so don't need the lock here */ - out_count = opal_list_get_size(&module->m_copy_pending_sendreqs); - - /* we want to send all the requests, plus we wait for one more - completion event for the control message ack from the unlocker - saying we're done */ - module->m_num_pending_out += (out_count + 1); - OPAL_THREAD_UNLOCK(&module->m_lock); - - /* send the unlock request */ - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "%d sending unlock request to %d with %d requests", - ompi_comm_rank(module->m_comm), target, - out_count)); - ompi_osc_rdma_control_send(module, - proc, - OMPI_OSC_RDMA_HDR_UNLOCK_REQ, - ompi_comm_rank(module->m_comm), - out_count); - - /* try to start all the requests. We've copied everything we - need out of pending_sendreqs, so don't need the lock - here */ - while (NULL != - (item = opal_list_remove_first(&(module->m_copy_pending_sendreqs)))) { - ompi_osc_rdma_sendreq_t *req = - (ompi_osc_rdma_sendreq_t*) item; - - ret = ompi_osc_rdma_sendreq_send(module, req); - if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) { - opal_list_append(&(module->m_copy_pending_sendreqs), item); - break; - } else if (OMPI_SUCCESS != ret) { - return ret; - } - } - - ompi_osc_rdma_flush(module); - - OPAL_THREAD_LOCK(&module->m_lock); - /* if some requests couldn't be started, push into the - "queued" list, where we will try to restart them later. 
*/ - if (opal_list_get_size(&module->m_copy_pending_sendreqs)) { - opal_list_join(&module->m_queued_sendreqs, - opal_list_get_end(&module->m_queued_sendreqs), - &module->m_copy_pending_sendreqs); - } - - /* wait for all the requests */ - while (0 != module->m_num_pending_out) { - opal_condition_wait(&module->m_cond, &module->m_lock); - } - OPAL_THREAD_UNLOCK(&module->m_lock); - - /* set our mode on the window */ - ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); - - module->m_eager_send_active = module->m_eager_send_ok; - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_passive_lock(ompi_osc_rdma_module_t *module, - int32_t origin, - int32_t lock_type) -{ - bool send_ack = false; - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, origin ); - ompi_osc_rdma_pending_lock_t *new_pending; - - OPAL_THREAD_LOCK(&(module->m_lock)); - if (lock_type == MPI_LOCK_EXCLUSIVE) { - if (module->m_lock_status == 0) { - module->m_lock_status = MPI_LOCK_EXCLUSIVE; - ompi_win_append_mode(module->m_win, OMPI_WIN_EXPOSE_EPOCH); - send_ack = true; - } else { - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "%d queuing lock request from %d (%d)", - ompi_comm_rank(module->m_comm), - origin, lock_type)); - new_pending = OBJ_NEW(ompi_osc_rdma_pending_lock_t); - new_pending->proc = proc; - new_pending->lock_type = lock_type; - opal_list_append(&(module->m_locks_pending), &(new_pending->super)); - } - } else if (lock_type == MPI_LOCK_SHARED) { - if (module->m_lock_status != MPI_LOCK_EXCLUSIVE) { - module->m_lock_status = MPI_LOCK_SHARED; - module->m_shared_count++; - ompi_win_append_mode(module->m_win, OMPI_WIN_EXPOSE_EPOCH); - send_ack = true; - } else { - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "queuing lock request from %d (%d) lock_type:%d", - ompi_comm_rank(module->m_comm), - origin, lock_type)); - new_pending = OBJ_NEW(ompi_osc_rdma_pending_lock_t); - new_pending->proc = proc; - new_pending->lock_type = lock_type; - opal_list_append(&(module->m_locks_pending), &(new_pending->super)); - } - } - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - if (send_ack) { - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "%d sending lock ack to %d", - ompi_comm_rank(module->m_comm), origin)); - ompi_osc_rdma_control_send(module, proc, - OMPI_OSC_RDMA_HDR_LOCK_REQ, - ompi_comm_rank(module->m_comm), - OMPI_SUCCESS); - } - - return OMPI_SUCCESS; -} - - -int -ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module, - int32_t origin, - int32_t count) -{ - ompi_proc_t *proc = ompi_comm_peer_lookup( module->m_comm, origin ); - ompi_osc_rdma_pending_lock_t *new_pending = NULL; - - assert(module->m_lock_status != 0); - - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "received unlock request from %d with %d requests\n", - origin, count)); - - new_pending = OBJ_NEW(ompi_osc_rdma_pending_lock_t); - new_pending->proc = proc; - new_pending->lock_type = 0; - OPAL_THREAD_LOCK(&(module->m_lock)); - module->m_num_pending_in += count; - opal_list_append(&module->m_unlocks_pending, &(new_pending->super)); - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - return ompi_osc_rdma_passive_unlock_complete(module); -} - - -int -ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module) -{ - ompi_osc_rdma_pending_lock_t *new_pending = NULL; - opal_list_t copy_unlock_acks; - - if (module->m_num_pending_in != 0) return OMPI_SUCCESS; - - OPAL_THREAD_LOCK(&module->m_lock); - if (module->m_num_pending_in != 0) { - 
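/*
 * Illustrative sketch, not part of the patch: the lock-granting policy the
 * (removed) passive-target code above implements.  An exclusive request is
 * granted only when the window is unlocked, a shared request whenever no
 * exclusive lock is held, and anything else is queued until the current
 * holders unlock.  Standalone toy version; the fixed-size pending array is
 * purely for brevity.
 */
enum toy_lock { TOY_UNLOCKED = 0, TOY_EXCLUSIVE, TOY_SHARED };

struct toy_window_lock {
    enum toy_lock state;
    int shared_count;
    int pending[16];     /* origins waiting for the lock */
    int npending;
};

/* Returns 1 if the lock was granted immediately, 0 if the origin was queued. */
static int toy_lock_request(struct toy_window_lock *w, int origin,
                            enum toy_lock type)
{
    if (TOY_EXCLUSIVE == type && TOY_UNLOCKED == w->state) {
        w->state = TOY_EXCLUSIVE;
        return 1;
    }
    if (TOY_SHARED == type && TOY_EXCLUSIVE != w->state) {
        w->state = TOY_SHARED;
        w->shared_count++;
        return 1;
    }
    w->pending[w->npending++] = origin;   /* serviced again at unlock time */
    return 0;
}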
OPAL_THREAD_UNLOCK(&module->m_lock); - return OMPI_SUCCESS; - } - - if (module->m_lock_status == MPI_LOCK_EXCLUSIVE) { - ompi_win_remove_mode(module->m_win, OMPI_WIN_EXPOSE_EPOCH); - module->m_lock_status = 0; - } else { - module->m_shared_count -= opal_list_get_size(&module->m_unlocks_pending); - if (module->m_shared_count == 0) { - ompi_win_remove_mode(module->m_win, OMPI_WIN_EXPOSE_EPOCH); - module->m_lock_status = 0; - } - } - - OBJ_CONSTRUCT(©_unlock_acks, opal_list_t); - /* copy over any unlocks that have been satisfied (possibly - multiple if SHARED) */ - opal_list_join(©_unlock_acks, - opal_list_get_end(©_unlock_acks), - &module->m_unlocks_pending); - OPAL_THREAD_UNLOCK(&module->m_lock); - - /* issue whichever unlock acks we should issue */ - while (NULL != (new_pending = (ompi_osc_rdma_pending_lock_t*) - opal_list_remove_first(©_unlock_acks))) { - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "sending unlock reply to proc")); - ompi_osc_rdma_control_send(module, - new_pending->proc, - OMPI_OSC_RDMA_HDR_UNLOCK_REPLY, - OMPI_SUCCESS, OMPI_SUCCESS); - OBJ_RELEASE(new_pending); - } - - OBJ_DESTRUCT(©_unlock_acks); - - /* if we were really unlocked, see if we have another lock request - we can satisfy */ - OPAL_THREAD_LOCK(&(module->m_lock)); - if (0 == module->m_lock_status) { - new_pending = (ompi_osc_rdma_pending_lock_t*) - opal_list_remove_first(&(module->m_locks_pending)); - if (NULL != new_pending) { - ompi_win_append_mode(module->m_win, OMPI_WIN_EXPOSE_EPOCH); - /* set lock state and generate a lock request */ - module->m_lock_status = new_pending->lock_type; - if (MPI_LOCK_SHARED == new_pending->lock_type) { - module->m_shared_count++; - } - } - } else { - new_pending = NULL; - } - OPAL_THREAD_UNLOCK(&(module->m_lock)); - - if (NULL != new_pending) { - OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, - "sending lock request to proc")); - ompi_osc_rdma_control_send(module, - new_pending->proc, - OMPI_OSC_RDMA_HDR_LOCK_REQ, - ompi_comm_rank(module->m_comm), - OMPI_SUCCESS); - OBJ_RELEASE(new_pending); - } - - return OMPI_SUCCESS; -} diff --git a/ompi/mca/osc/sm/Makefile.am b/ompi/mca/osc/sm/Makefile.am new file mode 100644 index 0000000000..5eab56f0f7 --- /dev/null +++ b/ompi/mca/osc/sm/Makefile.am @@ -0,0 +1,44 @@ +# +# Copyright (c) 2011 Sandia National Laboratories. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = + +sm_sources = \ + osc_sm.h \ + osc_sm_comm.c \ + osc_sm_component.c \ + osc_sm_active_target.c \ + osc_sm_passive_target.c \ + osc_sm_request.c \ + osc_sm_request.h + +AM_CPPFLAGS = $(osc_sm_CPPFLAGS) + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_ompi_osc_sm_DSO +component_noinst = +component_install = mca_osc_sm.la +else +component_noinst = libmca_osc_sm.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_osc_sm_la_SOURCES = $(sm_sources) +mca_osc_sm_la_LIBADD = $(osc_sm_LIBS) +mca_osc_sm_la_LDFLAGS = -module -avoid-version $(osc_sm_LDFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_osc_sm_la_SOURCES = $(sm_sources) +libmca_osc_sm_la_LIBADD = $(osc_sm_LIBS) +libmca_osc_sm_la_LDFLAGS = -module -avoid-version $(osc_sm_LDFLAGS) diff --git a/ompi/mca/osc/sm/osc_sm.h b/ompi/mca/osc/sm/osc_sm.h new file mode 100644 index 0000000000..31f513a54f --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm.h @@ -0,0 +1,243 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_SM_SM_H +#define OSC_SM_SM_H + +#include "ompi/class/ompi_free_list.h" +#include "opal/mca/shmem/base/base.h" + +/* data shared across all peers */ +struct ompi_osc_sm_global_state_t { + int use_barrier_for_fence; + +#if OPAL_HAVE_POSIX_THREADS + pthread_mutex_t mtx; + pthread_cond_t cond; + + int sense; + int32_t count; +#endif +}; +typedef struct ompi_osc_sm_global_state_t ompi_osc_sm_global_state_t; + +/* this is data exposed to remote nodes */ +struct ompi_osc_sm_lock_t { + uint32_t counter; + uint32_t write; + uint32_t read; +}; +typedef struct ompi_osc_sm_lock_t ompi_osc_sm_lock_t; + +struct ompi_osc_sm_node_state_t { + int32_t post_count; + int32_t complete_count; + ompi_osc_sm_lock_t lock; + opal_atomic_lock_t accumulate_lock; +}; +typedef struct ompi_osc_sm_node_state_t ompi_osc_sm_node_state_t; + +struct ompi_osc_sm_component_t { + ompi_osc_base_component_t super; + ompi_free_list_t requests; +}; +typedef struct ompi_osc_sm_component_t ompi_osc_sm_component_t; +OMPI_DECLSPEC extern ompi_osc_sm_component_t mca_osc_sm_component; + +enum ompi_osc_sm_locktype_t { + lock_none = 0, + lock_nocheck, + lock_exclusive, + lock_shared +}; + +struct ompi_osc_sm_module_t { + ompi_osc_base_module_t super; + struct ompi_communicator_t *comm; + int flavor; + opal_shmem_ds_t seg_ds; + void *segment_base; + bool noncontig; + + size_t *sizes; + void **bases; + int *disp_units; + + ompi_group_t *start_group; + ompi_group_t *post_group; + +#if OPAL_HAVE_POSIX_THREADS + int my_sense; +#endif + + enum ompi_osc_sm_locktype_t *outstanding_locks; + + /* exposed data */ + ompi_osc_sm_global_state_t *global_state; + ompi_osc_sm_node_state_t *my_node_state; + ompi_osc_sm_node_state_t *node_states; +}; +typedef struct ompi_osc_sm_module_t ompi_osc_sm_module_t; + +int ompi_osc_sm_shared_query(struct ompi_win_t *win, int rank, size_t *size, int *disp_unit, void *baseptr); + +int ompi_osc_sm_attach(struct ompi_win_t *win, void *base, size_t len); +int ompi_osc_sm_detach(struct ompi_win_t *win, void *base); + +int ompi_osc_sm_free(struct ompi_win_t *win); + +int ompi_osc_sm_put(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_sm_get(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct 
ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_sm_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_sm_compare_and_swap(void *origin_addr, + void *compare_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_win_t *win); + +int ompi_osc_sm_fetch_and_op(void *origin_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_sm_get_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_sm_rput(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_sm_rget(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_sm_raccumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_sm_rget_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_datatype, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_datatype, + int target_rank, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_datatype, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_sm_fence(int assert, struct ompi_win_t *win); + +int ompi_osc_sm_start(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win); + +int ompi_osc_sm_complete(struct ompi_win_t *win); + +int ompi_osc_sm_post(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win); + +int ompi_osc_sm_wait(struct ompi_win_t *win); + +int ompi_osc_sm_test(struct ompi_win_t *win, + int *flag); + +int ompi_osc_sm_lock(int lock_type, + int target, + int assert, + struct ompi_win_t *win); + +int ompi_osc_sm_unlock(int target, + struct ompi_win_t *win); + + +int ompi_osc_sm_lock_all(int assert, + struct ompi_win_t *win); + +int ompi_osc_sm_unlock_all(struct ompi_win_t *win); + +int ompi_osc_sm_sync(struct ompi_win_t *win); + +int ompi_osc_sm_flush(int target, + struct ompi_win_t *win); +int ompi_osc_sm_flush_all(struct ompi_win_t *win); +int ompi_osc_sm_flush_local(int target, + struct ompi_win_t *win); +int ompi_osc_sm_flush_local_all(struct ompi_win_t *win); + +int ompi_osc_sm_set_info(struct ompi_win_t *win, struct ompi_info_t *info); +int ompi_osc_sm_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used); + +#endif diff --git a/ompi/mca/osc/sm/osc_sm_active_target.c b/ompi/mca/osc/sm/osc_sm_active_target.c new 
file mode 100644 index 0000000000..17a78da4e0 --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm_active_target.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/sys/atomic.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_sm.h" + + +int +ompi_osc_sm_fence(int assert, struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + /* ensure all memory operations have completed */ + opal_atomic_mb(); + + if (module->global_state->use_barrier_for_fence) { + return module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + } else { + module->my_sense = !module->my_sense; + pthread_mutex_lock(&module->global_state->mtx); + module->global_state->count--; + if (module->global_state->count == 0) { + module->global_state->count = ompi_comm_size(module->comm); + module->global_state->sense = module->my_sense; + pthread_cond_broadcast(&module->global_state->cond); + } else { + while (module->global_state->sense != module->my_sense) { + pthread_cond_wait(&module->global_state->cond, &module->global_state->mtx); + } + } + pthread_mutex_unlock(&module->global_state->mtx); + + return OMPI_SUCCESS; + } +} + + +int +ompi_osc_sm_start(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + if (0 == (assert & MPI_MODE_NOCHECK)) { + int size; + + OBJ_RETAIN(group); + module->start_group = group; + size = ompi_group_size(module->start_group); + + while (module->my_node_state->post_count != size) { + opal_progress(); + opal_atomic_mb(); + } + } else { + module->start_group = NULL; + } + + opal_atomic_mb(); + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_complete(struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + int i, j, gsize, csize; + + /* ensure all memory operations have completed */ + opal_atomic_mb(); + + if (NULL != module->start_group) { + module->my_node_state->post_count = 0; + opal_atomic_mb(); + + gsize = ompi_group_size(module->start_group); + csize = ompi_comm_size(module->comm); + for (i = 0 ; i < gsize ; ++i) { + for (j = 0 ; i < csize ; ++j) { + if (ompi_group_peer_lookup(module->start_group, i) == + ompi_comm_peer_lookup(module->comm, j)) { + opal_atomic_add_32(&module->node_states[j].complete_count, 1); + } + } + } + + OBJ_RELEASE(module->start_group); + module->start_group = NULL; + } + + opal_atomic_mb(); + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_post(struct ompi_group_t *group, + int assert, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + int i, j, gsize, csize; + + if (0 == (assert & MPI_MODE_NOCHECK)) { + OBJ_RETAIN(group); + module->post_group = group; + + module->my_node_state->complete_count = 0; + opal_atomic_mb(); + + gsize = ompi_group_size(module->post_group); + csize = ompi_comm_size(module->comm); + for (i = 0 ; i < gsize ; ++i) { + for (j = 0 ; i < csize ; ++j) { + if (ompi_group_peer_lookup(module->start_group, i) == + ompi_comm_peer_lookup(module->comm, j)) { + opal_atomic_add_32(&module->node_states[j].post_count, 1); + } + } + } + } else { + module->post_group = NULL; + } + + return OMPI_SUCCESS; +} + + +int 
+ompi_osc_sm_wait(struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + if (NULL != module->post_group) { + int size = ompi_group_size(module->post_group); + + while (module->my_node_state->complete_count != size) { + opal_progress(); + opal_atomic_mb(); + } + + OBJ_RELEASE(module->post_group); + module->post_group = NULL; + } + + /* ensure all memory operations have completed */ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_test(struct ompi_win_t *win, + int *flag) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + if (NULL != module->post_group) { + int size = ompi_group_size(module->post_group); + + if (module->my_node_state->complete_count == size) { + OBJ_RELEASE(module->post_group); + module->post_group = NULL; + *flag = 1; + } + } else { + opal_atomic_mb(); + *flag = 0; + } + + /* ensure all memory operations have completed */ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/sm/osc_sm_comm.c b/ompi/mca/osc/sm/osc_sm_comm.c new file mode 100644 index 0000000000..64a1b813b7 --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm_comm.c @@ -0,0 +1,451 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_sm.h" +#include "osc_sm_request.h" + + +int +ompi_osc_sm_rput(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_request_t *request; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rput: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + OMPI_OSC_SM_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_SM_REQUEST_RETURN(request); + return ret; + } + + OMPI_OSC_SM_REQUEST_COMPLETE(request); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_rget(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_request_t *request; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, 
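/*
 * Illustrative sketch, not part of the patch: how the sm put/get paths above
 * turn (target, target_disp) into an address.  Every peer's window segment
 * is mapped locally, so a "put" is just a local copy to
 * bases[target] + disp_units[target] * target_disp.  Standalone toy example
 * with two fake peers.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char seg0[64], seg1[64];
    void *bases[2]      = { seg0, seg1 };
    int   disp_units[2] = { sizeof(int), sizeof(int) };

    int target = 1;
    int target_disp = 3;        /* fourth int in peer 1's window */
    int value = 42;

    char *remote_address = (char *) bases[target]
                         + (size_t) disp_units[target] * target_disp;
    memcpy(remote_address, &value, sizeof(value));   /* the "put" */

    printf("peer %d, displacement %d -> wrote %d\n",
           target, target_disp, *(int *) remote_address);
    return 0;
}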
target_dt->name, + (unsigned long) win)); + + OMPI_OSC_SM_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, + origin_addr, origin_count, origin_dt); + if (OMPI_SUCCESS != ret) { + OMPI_OSC_SM_REQUEST_RETURN(request); + return ret; + } + + OMPI_OSC_SM_REQUEST_COMPLETE(request); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_raccumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_request_t *request; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "raccumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + OMPI_OSC_SM_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + opal_atomic_lock(&module->node_states[target].accumulate_lock); + if (op == &ompi_mpi_op_replace.op) { + ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + } else { + ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt, + op); + } + opal_atomic_unlock(&module->node_states[target].accumulate_lock); + + OMPI_OSC_SM_REQUEST_COMPLETE(request); + + return ret; +} + + + +int +ompi_osc_sm_rget_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_dt, + int target, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + int ret; + ompi_osc_sm_request_t *request; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "rget_accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + OMPI_OSC_SM_REQUEST_ALLOC(win, request); + if (NULL == request) return OMPI_ERR_OUT_OF_RESOURCE; + *ompi_req = &request->super; + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + opal_atomic_lock(&module->node_states[target].accumulate_lock); + + ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, + result_addr, result_count, result_dt); + if (OMPI_SUCCESS != ret || op == &ompi_mpi_op_no_op.op) goto done; + + if (op == &ompi_mpi_op_replace.op) { + return ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + } else { + ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt, + remote_address, 
target_count, target_dt, + op); + } + + done: + opal_atomic_unlock(&module->node_states[target].accumulate_lock); + + OMPI_OSC_SM_REQUEST_COMPLETE(request); + + return ret; +} + + +int +ompi_osc_sm_put(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "put: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + + return ret; +} + + +int +ompi_osc_sm_get(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "get: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, + origin_addr, origin_count, origin_dt); + + return ret; +} + + +int +ompi_osc_sm_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + opal_atomic_lock(&module->node_states[target].accumulate_lock); + if (op == &ompi_mpi_op_replace.op) { + ret = ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + } else { + ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt, + op); + } + opal_atomic_unlock(&module->node_states[target].accumulate_lock); + + return ret; +} + + +int +ompi_osc_sm_get_accumulate(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + void *result_addr, + int result_count, + struct ompi_datatype_t *result_dt, + int target, + MPI_Aint target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, + struct ompi_win_t *win) +{ + int ret; + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + 
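/*
 * Illustrative sketch, not part of the patch: the accumulate path above in
 * miniature.  Each target has a single accumulate lock; a replace operation
 * degenerates to a copy, and every other operation is applied element-wise
 * while the lock is held so concurrent accumulates to the same target do not
 * interleave.  Standalone example using a pthread mutex and a SUM on ints.
 */
#include <pthread.h>
#include <string.h>

enum toy_op { TOY_OP_REPLACE, TOY_OP_SUM };

static pthread_mutex_t toy_accumulate_lock = PTHREAD_MUTEX_INITIALIZER;

static void toy_accumulate(int *target_buf, const int *origin_buf,
                           int count, enum toy_op op)
{
    pthread_mutex_lock(&toy_accumulate_lock);
    if (TOY_OP_REPLACE == op) {
        memcpy(target_buf, origin_buf, count * sizeof(int));
    } else {
        for (int i = 0; i < count; ++i) {
            target_buf[i] += origin_buf[i];
        }
    }
    pthread_mutex_unlock(&toy_accumulate_lock);
}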
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "get_accumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, 0x%lx", + (unsigned long) origin_addr, origin_count, + origin_dt->name, target, (int) target_disp, + target_count, target_dt->name, + op->o_name, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + opal_atomic_lock(&module->node_states[target].accumulate_lock); + + ret = ompi_datatype_sndrcv(remote_address, target_count, target_dt, + result_addr, result_count, result_dt); + if (OMPI_SUCCESS != ret || op == &ompi_mpi_op_no_op.op) goto done; + + if (op == &ompi_mpi_op_replace.op) { + return ompi_datatype_sndrcv(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt); + } else { + ret = ompi_osc_base_sndrcv_op(origin_addr, origin_count, origin_dt, + remote_address, target_count, target_dt, + op); + } + + done: + opal_atomic_unlock(&module->node_states[target].accumulate_lock); + + return ret; +} + + +int +ompi_osc_sm_compare_and_swap(void *origin_addr, + void *compare_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + size_t size; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "compare_and_swap: 0x%lx, %s, %d, %d, 0x%lx", + (unsigned long) origin_addr, + dt->name, target, (int) target_disp, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + ompi_datatype_type_size(dt, &size); + + opal_atomic_lock(&module->node_states[target].accumulate_lock); + + /* fetch */ + ompi_datatype_copy_content_same_ddt(dt, 1, (char*) result_addr, (char*) remote_address); + /* compare */ + if (0 == memcmp(result_addr, compare_addr, size)) { + /* set */ + ompi_datatype_copy_content_same_ddt(dt, 1, (char*) remote_address, (char*) origin_addr); + } + + opal_atomic_unlock(&module->node_states[target].accumulate_lock); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_fetch_and_op(void *origin_addr, + void *result_addr, + struct ompi_datatype_t *dt, + int target, + OPAL_PTRDIFF_TYPE target_disp, + struct ompi_op_t *op, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + void *remote_address; + + OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, + "fetch_and_op: 0x%lx, %s, %d, %d, %s, 0x%lx", + (unsigned long) origin_addr, + dt->name, target, (int) target_disp, + op->o_name, + (unsigned long) win)); + + remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; + + opal_atomic_lock(&module->node_states[target].accumulate_lock); + + /* fetch */ + ompi_datatype_copy_content_same_ddt(dt, 1, (char*) result_addr, (char*) remote_address); + if (op == &ompi_mpi_op_no_op.op) goto done; + + /* op */ + if (op == &ompi_mpi_op_replace.op) { + ompi_datatype_copy_content_same_ddt(dt, 1, (char*) remote_address, (char*) origin_addr); + } else { + ompi_op_reduce(op, origin_addr, remote_address, 1, dt); + } + + done: + opal_atomic_unlock(&module->node_states[target].accumulate_lock); + + return OMPI_SUCCESS;; +} diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c new file mode 100644 index 0000000000..096b80d60c --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -0,0 +1,510 @@ +/* -*- Mode: 
C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "ompi/request/request.h" +#include "ompi/class/ompi_free_list.h" + +#include "osc_sm.h" +#include "osc_sm_request.h" + +static int component_open(void); +static int component_init(bool enable_progress_threads, bool enable_mpi_threads); +static int component_finalize(void); +static int component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor); +static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor, int *model); + + +ompi_osc_sm_component_t mca_osc_sm_component = { + { /* ompi_osc_base_component_t */ + { /* ompi_base_component_t */ + OMPI_OSC_BASE_VERSION_3_0_0, + "sm", + OMPI_MAJOR_VERSION, /* MCA component major version */ + OMPI_MINOR_VERSION, /* MCA component minor version */ + OMPI_RELEASE_VERSION, /* MCA component release version */ + component_open, + NULL + }, + { /* mca_base_component_data */ + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + component_init, + component_query, + component_select, + component_finalize + } +}; + + +ompi_osc_sm_module_t ompi_osc_sm_module_template = { + { + ompi_osc_sm_shared_query, + + ompi_osc_sm_attach, + ompi_osc_sm_detach, + ompi_osc_sm_free, + + ompi_osc_sm_put, + ompi_osc_sm_get, + ompi_osc_sm_accumulate, + ompi_osc_sm_compare_and_swap, + ompi_osc_sm_fetch_and_op, + ompi_osc_sm_get_accumulate, + + ompi_osc_sm_rput, + ompi_osc_sm_rget, + ompi_osc_sm_raccumulate, + ompi_osc_sm_rget_accumulate, + + ompi_osc_sm_fence, + + ompi_osc_sm_start, + ompi_osc_sm_complete, + ompi_osc_sm_post, + ompi_osc_sm_wait, + ompi_osc_sm_test, + + ompi_osc_sm_lock, + ompi_osc_sm_unlock, + ompi_osc_sm_lock_all, + ompi_osc_sm_unlock_all, + + ompi_osc_sm_sync, + ompi_osc_sm_flush, + ompi_osc_sm_flush_all, + ompi_osc_sm_flush_local, + ompi_osc_sm_flush_local_all, + + ompi_osc_sm_set_info, + ompi_osc_sm_get_info + } +}; + + +static int +component_open(void) +{ + return OMPI_SUCCESS; +} + + +static int +component_init(bool enable_progress_threads, bool enable_mpi_threads) +{ + int ret; + + ret = ompi_free_list_init(&mca_osc_sm_component.requests, + sizeof(ompi_request_t), + OBJ_CLASS(ompi_request_t), + 0, + 0, + 8, + NULL); + if (OMPI_SUCCESS != ret) { + opal_output_verbose(1, ompi_osc_base_framework.framework_output, + "%s:%d: ompi_free_list_init failed: %d\n", + __FILE__, __LINE__, ret); + return ret; + } + + return OMPI_SUCCESS; +} + + +static int +component_finalize(void) +{ + /* clean up requests free list */ + + return OMPI_SUCCESS; +} + + +static int +check_win_ok(ompi_communicator_t *comm, int flavor) +{ + int i; + + if (! 
(MPI_WIN_FLAVOR_SHARED == flavor + || MPI_WIN_FLAVOR_ALLOCATE == flavor) ) { + return -1; + } + + for (i = 0 ; i < ompi_comm_size(comm) ; ++i) { + if (!OPAL_PROC_ON_LOCAL_NODE(ompi_comm_peer_lookup(comm, i)->proc_flags)) { + return -1; + } + } + + return 0; +} + + +static int +component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor) +{ + if (0 != check_win_ok(comm, flavor)) return -1; + + return 100; +} + + +static int +component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct ompi_info_t *info, + int flavor, int *model) +{ + ompi_osc_sm_module_t *module = NULL; + int ret = OMPI_ERROR; + + if (0 != check_win_ok(comm, flavor)) return OMPI_ERR_NOT_SUPPORTED; + + /* create module structure */ + module = (ompi_osc_sm_module_t*) + calloc(1, sizeof(ompi_osc_sm_module_t)); + if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + /* fill in the function pointer part */ + memcpy(module, &ompi_osc_sm_module_template, + sizeof(ompi_osc_base_module_t)); + + /* need our communicator for collectives in next phase */ + ret = ompi_comm_dup(comm, &module->comm); + if (OMPI_SUCCESS != ret) goto error; + + module->flavor = flavor; + + /* create the segment */ + if (1 == ompi_comm_size(comm)) { + module->segment_base = NULL; + module->sizes = malloc(sizeof(size_t)); + if (NULL == module->sizes) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + module->bases = malloc(sizeof(void*)); + if (NULL == module->bases) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + module->sizes[0] = size; + module->bases[0] = malloc(size); + if (NULL == module->bases[0]) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + module->global_state = malloc(sizeof(ompi_osc_sm_global_state_t)); + if (NULL == module->global_state) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + module->node_states = malloc(sizeof(ompi_osc_sm_node_state_t)); + if (NULL == module->node_states) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + } else { + char *data_file; + int *rbuf; + int total, i, flag; + size_t pagesize; + size_t state_size; + + OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, + "allocating shared memory region of size %ld\n", (long) size)); + +#ifdef HAVE_GETPAGESIZE + pagesize = getpagesize(); +#else + pagesize = 4096; +#endif + + rbuf = malloc(sizeof(int) * ompi_comm_size(module->comm)); + if (NULL == rbuf) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + module->noncontig = false; + if (OMPI_SUCCESS != ompi_info_get_bool(info, "alloc_shared_noncontig", + &module->noncontig, &flag)) { + goto error; + } + + if (module->noncontig) { + total = ((size - 1) / pagesize + 1) * pagesize; + } else { + total = size; + } + ret = module->comm->c_coll.coll_allgather(&total, 1, MPI_INT, + rbuf, 1, MPI_INT, + module->comm, + module->comm->c_coll.coll_allgather_module); + if (OMPI_SUCCESS != ret) return ret; + + total = 0; + for (i = 0 ; i < ompi_comm_size(comm) ; ++i) { + total += rbuf[i]; + } + + + if (asprintf(&data_file, "%s"OPAL_PATH_SEP"shared_window_%d.%s", + orte_process_info.job_session_dir, + ompi_comm_get_cid(comm), + orte_process_info.nodename) < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* user opal/shmem directly to create a shared memory segment */ + state_size = sizeof(ompi_osc_sm_global_state_t) + sizeof(ompi_osc_sm_node_state_t) * ompi_comm_size(module->comm); + if (0 == ompi_comm_rank (module->comm)) { + ret = opal_shmem_segment_create (&module->seg_ds, data_file, total + pagesize + 
state_size); + if (OPAL_SUCCESS != ret) { + goto error; + } + } + + ret = module->comm->c_coll.coll_bcast (&module->seg_ds, sizeof (module->seg_ds), MPI_BYTE, 0, + module->comm, module->comm->c_coll.coll_bcast_module); + if (OMPI_SUCCESS != ret) { + goto error; + } + + module->segment_base = opal_shmem_segment_attach (&module->seg_ds); + if (NULL == module->segment_base) { + goto error; + } + + module->sizes = malloc(sizeof(size_t) * ompi_comm_size(module->comm)); + if (NULL == module->sizes) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + module->bases = malloc(sizeof(void*) * ompi_comm_size(module->comm)); + if (NULL == module->bases) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + module->global_state = (ompi_osc_sm_global_state_t *) (module->segment_base); + module->node_states = (ompi_osc_sm_node_state_t *) (module->global_state + 1); + + for (i = 0, total = state_size ; i < ompi_comm_size(comm) ; ++i) { + module->sizes[i] = rbuf[i]; + module->bases[i] = ((char *) module->segment_base) + total; + total += rbuf[i]; + } + + free(rbuf); + } + + /* initialize my state shared */ + module->my_node_state = &module->node_states[ompi_comm_rank(module->comm)]; + *base = module->bases[ompi_comm_rank(module->comm)]; + + module->my_node_state->post_count = 0; + module->my_node_state->complete_count = 0; + bzero(&module->my_node_state->lock, sizeof(ompi_osc_sm_lock_t)); + opal_atomic_init(&module->my_node_state->accumulate_lock, OPAL_ATOMIC_UNLOCKED); + + /* share everyone's displacement units. */ + module->disp_units = malloc(sizeof(int) * ompi_comm_size(module->comm)); + ret = module->comm->c_coll.coll_allgather(&disp_unit, 1, MPI_INT, + module->disp_units, 1, MPI_INT, + module->comm, + module->comm->c_coll.coll_allgather_module); + if (OMPI_SUCCESS != ret) goto error; + + module->start_group = NULL; + module->post_group = NULL; + + /* initialize synchronization code */ + module->my_sense = 1; + + module->outstanding_locks = malloc(sizeof(enum ompi_osc_sm_locktype_t) * ompi_comm_size(module->comm)); + if (NULL == module->outstanding_locks) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + bzero(module->outstanding_locks, sizeof(enum ompi_osc_sm_locktype_t) * ompi_comm_size(module->comm)); + + if (0 == ompi_comm_rank(module->comm)) { +#if OPAL_HAVE_POSIX_THREADS + pthread_mutexattr_t mattr; + pthread_condattr_t cattr; + bool blocking_fence; + int flag; + + if (OMPI_SUCCESS != ompi_info_get_bool(info, "blocking_fence", + &blocking_fence, &flag)) { + goto error; + } + + if (blocking_fence) { + ret = pthread_mutexattr_init(&mattr); + ret = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); + if (ret != 0) { + module->global_state->use_barrier_for_fence = 1; + } else { + ret = pthread_mutex_init(&module->global_state->mtx, &mattr); + if (ret != 0) { + module->global_state->use_barrier_for_fence = 1; + } else { + pthread_condattr_init(&cattr); + pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED); + ret = pthread_cond_init(&module->global_state->cond, &cattr); + if (ret != 0) return OMPI_ERROR; + pthread_condattr_destroy(&cattr); + } + } + module->global_state->use_barrier_for_fence = 0; + module->global_state->sense = module->my_sense; + module->global_state->count = ompi_comm_size(module->comm); + pthread_mutexattr_destroy(&mattr); + } else { + module->global_state->use_barrier_for_fence = 1; + } +#else + module->global_state->use_barrier_for_fence = 1; +#endif + } + + ret = module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + if (OMPI_SUCCESS 
!= ret) goto error; + + *model = MPI_WIN_UNIFIED; + + win->w_osc_module = &module->super; + + return OMPI_SUCCESS; + + error: + if (NULL != module->comm) ompi_comm_free(&module->comm); + if (NULL != module) free(module); + + return ret; +} + + +int +ompi_osc_sm_shared_query(struct ompi_win_t *win, int rank, size_t *size, int *disp_unit, void *baseptr) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + if (module->flavor != MPI_WIN_FLAVOR_SHARED) { + return MPI_ERR_WIN; + } + + if (MPI_PROC_NULL != rank) { + *size = module->sizes[rank]; + *((void**) baseptr) = module->bases[rank]; + *disp_unit = module->disp_units[rank]; + } else { + int i = 0; + + *size = 0; + *((void**) baseptr) = NULL; + *disp_unit = 0; + for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) { + if (0 != module->sizes[i]) { + *size = module->sizes[i]; + *((void**) baseptr) = module->bases[i]; + *disp_unit = module->disp_units[i]; + break; + } + } + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_attach(struct ompi_win_t *win, void *base, size_t len) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) { + return MPI_ERR_RMA_ATTACH; + } + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_detach(struct ompi_win_t *win, void *base) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) { + return MPI_ERR_RMA_ATTACH; + } + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_free(struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + /* synchronize */ + module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); + + /* free memory */ + if (NULL == module->segment_base) { + free(module->node_states); + free(module->global_state); + free(module->bases[0]); + free(module->bases); + free(module->sizes); + } else { + opal_shmem_segment_detach (&module->seg_ds); + } + + /* cleanup */ + ompi_comm_free(&module->comm); + free(module); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_set_info(struct ompi_win_t *win, struct ompi_info_t *info) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + /* enforce collectiveness... */ + return module->comm->c_coll.coll_barrier(module->comm, + module->comm->c_coll.coll_barrier_module); +} + + +int +ompi_osc_sm_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + + ompi_info_t *info = OBJ_NEW(ompi_info_t); + if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + + if (module->flavor == MPI_WIN_FLAVOR_SHARED) { + ompi_info_set(info, "blocking_fence", + (1 == module->global_state->use_barrier_for_fence) ? "true" : "false"); + ompi_info_set(info, "alloc_shared_noncontig", + (module->noncontig) ? "true" : "false"); + } + + *info_used = info; + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/sm/osc_sm_passive_target.c b/ompi/mca/osc/sm/osc_sm_passive_target.c new file mode 100644 index 0000000000..bcef1e8dc7 --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm_passive_target.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
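The new osc_sm_passive_target.c below builds MPI_Win_lock/unlock on a fair ticket lock kept in the shared segment: start_exclusive and start_shared take a ticket with an atomic fetch-and-add and spin until the lock's "write" or "read" admission counter reaches that ticket. As a rough standalone sketch of the same scheme (editor's illustration, not part of the patch; hypothetical names, C11 atomics instead of the opal_atomic_* wrappers, plain spinning instead of opal_progress):

#include <stdatomic.h>

typedef struct {
    atomic_uint counter;   /* next ticket to hand out            */
    atomic_uint read;      /* tickets admitted as readers so far */
    atomic_uint write;     /* tickets admitted as writers so far */
} ticket_rwlock_t;

static void wr_lock(ticket_rwlock_t *l)
{
    unsigned me = atomic_fetch_add(&l->counter, 1u);
    while (atomic_load(&l->write) != me) {
        /* spin until every earlier ticket holder has released */
    }
}

static void wr_unlock(ticket_rwlock_t *l)
{
    atomic_fetch_add(&l->write, 1u);   /* admit the next writer ...      */
    atomic_fetch_add(&l->read, 1u);    /* ... and the next reader ticket */
}

static void rd_lock(ticket_rwlock_t *l)
{
    unsigned me = atomic_fetch_add(&l->counter, 1u);
    while (atomic_load(&l->read) != me) {
        /* wait until reader admissions catch up to our ticket */
    }
    atomic_fetch_add(&l->read, 1u);    /* let the next reader in behind us */
}

static void rd_unlock(ticket_rwlock_t *l)
{
    atomic_fetch_add(&l->write, 1u);   /* a writer may enter once all readers drain */
}

Readers that arrive back-to-back are admitted concurrently, while a writer's ticket is only reached after every earlier holder (reader or writer) has bumped "write", which keeps the lock fair.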
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_sm.h" + + +static inline uint32_t +lk_fetch_add32(ompi_osc_sm_module_t *module, + int target, + size_t offset, + uint32_t delta) +{ + return opal_atomic_add_32((int32_t*) ((char*) &module->node_states[target].lock + offset), + delta); +} + + +static inline void +lk_add32(ompi_osc_sm_module_t *module, + int target, + size_t offset, + uint32_t delta) +{ + opal_atomic_add_32((int32_t*) ((char*) &module->node_states[target].lock + offset), + delta); +} + + +static inline uint32_t +lk_fetch32(ompi_osc_sm_module_t *module, + int target, + size_t offset) +{ + __sync_synchronize(); + return (uint32_t) *((char*) &module->node_states[target].lock + offset); +} + + +static inline int +start_exclusive(ompi_osc_sm_module_t *module, + int target) +{ + uint32_t me = lk_fetch_add32(module, target, + offsetof(ompi_osc_sm_lock_t, counter), 1); + + while (me != lk_fetch32(module, target, + offsetof(ompi_osc_sm_lock_t, write))) { + opal_progress(); + } + + return OMPI_SUCCESS; +} + + +static inline int +end_exclusive(ompi_osc_sm_module_t *module, + int target) +{ + lk_add32(module, target, offsetof(ompi_osc_sm_lock_t, write), 1); + lk_add32(module, target, offsetof(ompi_osc_sm_lock_t, read), 1); + + return OMPI_SUCCESS; +} + + +static inline int +start_shared(ompi_osc_sm_module_t *module, + int target) +{ + uint32_t me = lk_fetch_add32(module, target, + offsetof(ompi_osc_sm_lock_t, counter), 1); + + while (me != lk_fetch32(module, target, + offsetof(ompi_osc_sm_lock_t, read))) { + opal_progress(); + } + + lk_add32(module, target, offsetof(ompi_osc_sm_lock_t, read), 1); + + return OMPI_SUCCESS; +} + + +static inline int +end_shared(ompi_osc_sm_module_t *module, + int target) +{ + lk_add32(module, target, offsetof(ompi_osc_sm_lock_t, write), 1); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_lock(int lock_type, + int target, + int assert, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + int ret; + + if (lock_none != module->outstanding_locks[target]) { + return MPI_ERR_RMA_SYNC; + } + + if (0 == (assert & MPI_MODE_NOCHECK)) { + if (MPI_LOCK_EXCLUSIVE == lock_type) { + module->outstanding_locks[target] = lock_exclusive; + ret = start_exclusive(module, target); + } else { + module->outstanding_locks[target] = lock_shared; + ret = start_shared(module, target); + } + } else { + module->outstanding_locks[target] = lock_nocheck; + ret = OMPI_SUCCESS; + } + + return ret; +} + + +int +ompi_osc_sm_unlock(int target, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + int ret; + + /* ensure all memory operations have completed */ + opal_atomic_mb(); + + if (module->outstanding_locks[target] == lock_nocheck) { + ret = OMPI_SUCCESS; + } else if (module->outstanding_locks[target] == lock_exclusive) { + ret = end_exclusive(module, target); + } else if (module->outstanding_locks[target] == lock_shared) { + ret = end_shared(module, target); + } else { + ret = MPI_ERR_RMA_SYNC; + } + + return ret; +} + + +int +ompi_osc_sm_lock_all(int assert, + struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + int ret, i, comm_size; + + comm_size = ompi_comm_size(module->comm); + for (i = 0 ; i < comm_size ; ++i) { + ret = 
ompi_osc_sm_lock(MPI_LOCK_SHARED, i, assert, win); + if (OMPI_SUCCESS != ret) return ret; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_unlock_all(struct ompi_win_t *win) +{ + ompi_osc_sm_module_t *module = + (ompi_osc_sm_module_t*) win->w_osc_module; + int ret, i, comm_size; + + comm_size = ompi_comm_size(module->comm); + for (i = 0 ; i < comm_size ; ++i) { + ret = ompi_osc_sm_unlock(i, win); + if (OMPI_SUCCESS != ret) return ret; + } + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_sync(struct ompi_win_t *win) +{ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_flush(int target, + struct ompi_win_t *win) +{ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_flush_all(struct ompi_win_t *win) +{ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_flush_local(int target, + struct ompi_win_t *win) +{ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} + + +int +ompi_osc_sm_flush_local_all(struct ompi_win_t *win) +{ + opal_atomic_mb(); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/sm/osc_sm_request.c b/ompi/mca/osc/sm/osc_sm_request.c new file mode 100644 index 0000000000..74af54212d --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm_request.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/request/request.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/mca/osc/base/base.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" + +#include "osc_sm.h" +#include "osc_sm_request.h" + +static int +request_cancel(struct ompi_request_t *request, int complete) +{ + return MPI_ERR_REQUEST; +} + +static int +request_free(struct ompi_request_t **ompi_req) +{ + ompi_osc_sm_request_t *request = + (ompi_osc_sm_request_t*) *ompi_req; + + if (true != request->super.req_complete) { + return MPI_ERR_REQUEST; + } + + OMPI_OSC_SM_REQUEST_RETURN(request); + + *ompi_req = MPI_REQUEST_NULL; + + return OMPI_SUCCESS; +} + +static +void +request_construct(ompi_osc_sm_request_t *request) +{ + request->super.req_type = OMPI_REQUEST_WIN; + request->super.req_status._cancelled = 0; + request->super.req_free = request_free; + request->super.req_cancel = request_cancel; +} + +OBJ_CLASS_INSTANCE(ompi_osc_sm_request_t, + ompi_request_t, + request_construct, + NULL); diff --git a/ompi/mca/osc/sm/osc_sm_request.h b/ompi/mca/osc/sm/osc_sm_request.h new file mode 100644 index 0000000000..780522b2cb --- /dev/null +++ b/ompi/mca/osc/sm/osc_sm_request.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_SM_REQUEST_H +#define OSC_SM_REQUEST_H + +#include "ompi/request/request.h" + +struct ompi_osc_sm_request_t { + ompi_request_t super; +}; +typedef struct ompi_osc_sm_request_t ompi_osc_sm_request_t; + +OBJ_CLASS_DECLARATION(ompi_osc_sm_request_t); + +/* REQUEST_ALLOC is only called from "top-level" functions (sm_rput, + sm_rget, etc.), so it's ok to spin here... 
*/ +#define OMPI_OSC_SM_REQUEST_ALLOC(win, req) \ + do { \ + ompi_free_list_item_t *item = NULL; \ + do { \ + OMPI_FREE_LIST_GET_MT(&mca_osc_sm_component.requests, item); \ + if (NULL == item) { \ + opal_progress(); \ + } \ + } while (NULL == item); \ + req = (ompi_osc_sm_request_t*) item; \ + OMPI_REQUEST_INIT(&req->super, false); \ + req->super.req_mpi_object.win = win; \ + req->super.req_complete = false; \ + req->super.req_state = OMPI_REQUEST_ACTIVE; \ + req->super.req_status._ucount = 0; \ + } while (0) + +#define OMPI_OSC_SM_REQUEST_RETURN(req) \ + do { \ + OMPI_REQUEST_FINI(&request->super); \ + OMPI_FREE_LIST_RETURN_MT(&mca_osc_sm_component.requests, \ + (ompi_free_list_item_t*) req); \ + } while (0) + +#define OMPI_OSC_SM_REQUEST_COMPLETE(req) \ + do { \ + OPAL_THREAD_LOCK(&ompi_request_lock); \ + ompi_request_complete(&req->super, true); \ + OPAL_THREAD_UNLOCK(&ompi_request_lock); \ + } while (0) + +#endif diff --git a/ompi/mca/pml/cm/pml_cm_component.c b/ompi/mca/pml/cm/pml_cm_component.c index f383592d26..7710cdc11b 100644 --- a/ompi/mca/pml/cm/pml_cm_component.c +++ b/ompi/mca/pml/cm/pml_cm_component.c @@ -8,6 +8,7 @@ * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/ompi/mpi/c/Makefile.am b/ompi/mpi/c/Makefile.am index 9ddceb4140..fafe2eaa04 100644 --- a/ompi/mpi/c/Makefile.am +++ b/ompi/mpi/c/Makefile.am @@ -140,6 +140,7 @@ libmpi_c_mpi_la_SOURCES = \ comm_split.c \ comm_split_type.c \ comm_test_inter.c \ + compare_and_swap.c \ dims_create.c \ errhandler_c2f.c \ errhandler_create.c \ @@ -150,6 +151,7 @@ libmpi_c_mpi_la_SOURCES = \ error_class.c \ error_string.c \ exscan.c \ + fetch_and_op.c \ iexscan.c \ finalize.c \ finalized.c \ @@ -162,6 +164,7 @@ libmpi_c_mpi_la_SOURCES = \ get_count.c \ get_elements.c \ get_elements_x.c \ + get_accumulate.c \ get_library_version.c \ get_processor_name.c \ get_version.c \ @@ -243,6 +246,7 @@ libmpi_c_mpi_la_SOURCES = \ probe.c \ publish_name.c \ query_thread.c \ + raccumulate.c \ recv_init.c \ recv.c \ reduce.c \ @@ -256,6 +260,9 @@ libmpi_c_mpi_la_SOURCES = \ request_f2c.c \ request_free.c \ request_get_status.c \ + rget.c \ + rget_accumulate.c \ + rput.c \ rsend_init.c \ rsend.c \ scan.c \ @@ -336,29 +343,44 @@ libmpi_c_mpi_la_SOURCES = \ accumulate.c \ get.c \ put.c \ + win_allocate.c \ + win_allocate_shared.c \ + win_attach.c \ win_c2f.c \ win_call_errhandler.c \ win_complete.c \ win_create_errhandler.c \ win_create_keyval.c \ win_create.c \ + win_create_dynamic.c \ win_delete_attr.c \ + win_detach.c \ win_f2c.c \ win_fence.c \ + win_flush.c \ + win_flush_all.c \ + win_flush_local.c \ + win_flush_local_all.c \ win_free_keyval.c \ win_free.c \ win_get_attr.c \ win_get_errhandler.c \ win_get_group.c \ + win_get_info.c \ win_get_name.c \ win_lock.c \ + win_lock_all.c \ win_post.c \ win_set_attr.c \ win_set_errhandler.c \ + win_set_info.c \ win_set_name.c \ + win_shared_query.c \ + win_sync.c \ win_start.c \ win_test.c \ win_unlock.c \ + win_unlock_all.c \ win_wait.c if OMPI_PROVIDE_MPI_FILE_INTERFACE diff --git a/ompi/mpi/c/accumulate.c b/ompi/mpi/c/accumulate.c index d862875111..ced8c5fff6 100644 --- a/ompi/mpi/c/accumulate.c +++ b/ompi/mpi/c/accumulate.c @@ -52,7 +52,7 @@ int MPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origi MEMCHECKER( 
memchecker_datatype(origin_datatype); memchecker_datatype(target_datatype); - memchecker_call(&opal_memchecker_base_isdefined, origin_addr, origin_count, origin_datatype); + memchecker_call(&opal_memchecker_base_isdefined, (void *) origin_addr, origin_count, origin_datatype); ); if (MPI_PARAM_CHECK) { @@ -67,12 +67,10 @@ int MPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origi } else if (ompi_win_peer_invalid(win, target_rank) && (MPI_PROC_NULL != target_rank)) { rc = MPI_ERR_RANK; - } else if (MPI_OP_NULL == op) { + } else if (MPI_OP_NULL == op || MPI_NO_OP == op) { rc = MPI_ERR_OP; } else if (!ompi_op_is_intrinsic(op)) { rc = MPI_ERR_OP; - } else if (!ompi_win_comm_allowed(win)) { - rc = MPI_ERR_RMA_SYNC; } else if ( target_disp < 0 ) { rc = MPI_ERR_DISP; } else { @@ -86,7 +84,7 @@ int MPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origi for other reduction operators, we don't require such behavior, as checking for it is expensive here and we don't care in implementation.. */ - if (op != &ompi_mpi_op_replace.op) { + if (op != &ompi_mpi_op_replace.op && op != &ompi_mpi_op_no_op.op) { ompi_datatype_t *op_check_dt, *origin_check_dt; char *msg; diff --git a/ompi/mpi/c/compare_and_swap.c b/ompi/mpi/c/compare_and_swap.c new file mode 100644 index 0000000000..95531538e6 --- /dev/null +++ b/ompi/mpi/c/compare_and_swap.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Compare_and_swap = PMPI_Compare_and_swap +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Compare_and_swap"; + + +int MPI_Compare_and_swap(void *origin_addr, void *compare_addr, void *result_addr, + MPI_Datatype datatype, int target_rank, MPI_Aint target_disp, MPI_Win win) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, datatype, 1); + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS; + + OPAL_CR_ENTER_LIBRARY(); + + rc = win->w_osc_module->osc_compare_and_swap(origin_addr, compare_addr, result_addr, + datatype, target_rank, target_disp, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/fetch_and_op.c b/ompi/mpi/c/fetch_and_op.c new file mode 100644 index 0000000000..128e750670 --- /dev/null +++ b/ompi/mpi/c/fetch_and_op.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Fetch_and_op = PMPI_Fetch_and_op +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Fetch_and_op"; + + +int MPI_Fetch_and_op(void *origin_addr, void *result_addr, MPI_Datatype datatype, + int target_rank, MPI_Aint target_disp, MPI_Op op, MPI_Win win) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, datatype, 1); + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) return MPI_SUCCESS; + + OPAL_CR_ENTER_LIBRARY(); + + rc = win->w_osc_module->osc_fetch_and_op(origin_addr, result_addr, datatype, + target_rank, target_disp, op, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/get.c b/ompi/mpi/c/get.c index 6e1435b066..8f1937334e 100644 --- a/ompi/mpi/c/get.c +++ b/ompi/mpi/c/get.c @@ -56,8 +56,6 @@ int MPI_Get(void *origin_addr, int origin_count, } else if (ompi_win_peer_invalid(win, target_rank) && (MPI_PROC_NULL != target_rank)) { rc = MPI_ERR_RANK; - } else if (!ompi_win_comm_allowed(win)) { - rc = MPI_ERR_RMA_SYNC; } else if ( target_disp < 0 ) { rc = MPI_ERR_DISP; } else { diff --git a/ompi/mpi/c/get_accumulate.c b/ompi/mpi/c/get_accumulate.c new file mode 100644 index 0000000000..c1ec4dbcfd --- /dev/null +++ b/ompi/mpi/c/get_accumulate.c @@ -0,0 +1,149 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Sun Microsystmes, Inc. All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. 
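The MPI_Compare_and_swap and MPI_Fetch_and_op bindings added above expose MPI-3's remote atomics; as a minimal illustration (hypothetical helper names, not part of the patch), a caller might drive them like this:

#include <mpi.h>

/* atomically add 1 to a long counter at displacement 0 in the target's window */
static long bump_remote_counter(MPI_Win win, int target)
{
    long one = 1, old;

    MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
    MPI_Fetch_and_op(&one, &old, MPI_LONG, target, 0, MPI_SUM, win);
    MPI_Win_unlock(target, win);

    return old;                       /* value seen before the increment */
}

/* write 'desired' only if the target word still holds 'expected' */
static int try_claim(MPI_Win win, int target, long expected, long desired)
{
    long seen;

    MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
    MPI_Compare_and_swap(&desired, &expected, &seen, MPI_LONG, target, 0, win);
    MPI_Win_unlock(target, win);

    return seen == expected;          /* nonzero when the swap took effect */
}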
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Get_accumulate = PMPI_Get_accumulate +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Get_accumlate"; + +int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win) +{ + int rc; + ompi_win_t *ompi_win = (ompi_win_t*) win; + + MEMCHECKER( + memchecker_datatype(origin_datatype); + memchecker_datatype(target_datatype); + memchecker_call(&opal_memchecker_base_isdefined, (void *) origin_addr, origin_count, origin_datatype); + ); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if (MPI_OP_NULL == op) { + rc = MPI_ERR_OP; + } else if (!ompi_op_is_intrinsic(op)) { + rc = MPI_ERR_OP; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + /* the origin datatype is meaningless when using MPI_OP_NO_OP */ + if (&ompi_mpi_op_no_op.op != op) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + } else { + rc = OMPI_SUCCESS; + } + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + if (OMPI_SUCCESS == rc) { + /* While technically the standard probably requires that the + datatypes used with MPI_REPLACE conform to all the rules + for other reduction operators, we don't require such + behavior, as checking for it is expensive here and we don't + care in implementation.. */ + if (op != &ompi_mpi_op_replace.op && op != &ompi_mpi_op_no_op.op) { + ompi_datatype_t *op_check_dt, *origin_check_dt; + char *msg; + + /* GET_ACCUMULATE, unlike REDUCE, can use with derived + datatypes with predefinied operations, with some + restrictions outlined in MPI-2:6.3.4. The derived + datatype must be composed entierly from one predefined + datatype (so you can do all the construction you want, + but at the bottom, you can only use one datatype, say, + MPI_INT). If the datatype at the target isn't + predefined, then make sure it's composed of only one + datatype, and check that datatype against + ompi_op_is_valid(). */ + origin_check_dt = ompi_datatype_get_single_predefined_type_from_args(origin_datatype); + op_check_dt = ompi_datatype_get_single_predefined_type_from_args(target_datatype); + + if( !((origin_check_dt == op_check_dt) & (NULL != op_check_dt)) ) { + OMPI_ERRHANDLER_RETURN(MPI_ERR_ARG, win, MPI_ERR_ARG, FUNC_NAME); + } + + /* check to make sure primitive type is valid for + reduction. 
Should do this on the target, but + then can't get the errcode back for this + call */ + if (!ompi_op_is_valid(op, op_check_dt, &msg, FUNC_NAME)) { + int ret = OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_OP, msg); + free(msg); + return ret; + } + } + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) { + return MPI_SUCCESS; + } + + OPAL_CR_ENTER_LIBRARY(); + + /* XXX -- TODO: do not cast away the const */ + rc = ompi_win->w_osc_module->osc_get_accumulate((void *) origin_addr, + origin_count, + origin_datatype, + result_addr, + result_count, + result_datatype, + target_rank, + target_disp, + target_count, + target_datatype, + op, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/profile/Makefile.am b/ompi/mpi/c/profile/Makefile.am index a2e2ff8845..0105b3f490 100644 --- a/ompi/mpi/c/profile/Makefile.am +++ b/ompi/mpi/c/profile/Makefile.am @@ -122,6 +122,7 @@ nodist_libmpi_c_pmpi_la_SOURCES = \ pcomm_split.c \ pcomm_split_type.c \ pcomm_test_inter.c \ + pcompare_and_swap.c \ pdims_create.c \ perrhandler_c2f.c \ perrhandler_create.c \ @@ -132,6 +133,7 @@ nodist_libmpi_c_pmpi_la_SOURCES = \ perror_class.c \ perror_string.c \ pexscan.c \ + pfetch_and_op.c \ piexscan.c \ pfinalize.c \ pfinalized.c \ @@ -144,6 +146,7 @@ nodist_libmpi_c_pmpi_la_SOURCES = \ pget_count.c \ pget_elements.c \ pget_elements_x.c \ + pget_accumulate.c \ pget_library_version.c \ pget_processor_name.c \ pget_version.c \ @@ -225,6 +228,7 @@ nodist_libmpi_c_pmpi_la_SOURCES = \ pprobe.c \ ppublish_name.c \ pquery_thread.c \ + praccumulate.c \ precv_init.c \ precv.c \ preduce.c \ @@ -238,6 +242,9 @@ nodist_libmpi_c_pmpi_la_SOURCES = \ prequest_f2c.c \ prequest_free.c \ prequest_get_status.c \ + prget.c \ + prget_accumulate.c \ + prput.c \ prsend_init.c \ prsend.c \ pscan.c \ @@ -318,29 +325,44 @@ nodist_libmpi_c_pmpi_la_SOURCES = \ paccumulate.c \ pget.c \ pput.c \ + pwin_allocate.c \ + pwin_allocate_shared.c \ + pwin_attach.c \ pwin_c2f.c \ pwin_call_errhandler.c \ pwin_complete.c \ pwin_create_errhandler.c \ pwin_create_keyval.c \ pwin_create.c \ + pwin_create_dynamic.c \ pwin_delete_attr.c \ + pwin_detach.c \ pwin_f2c.c \ pwin_fence.c \ + pwin_flush.c \ + pwin_flush_all.c \ + pwin_flush_local.c \ + pwin_flush_local_all.c \ pwin_free_keyval.c \ pwin_free.c \ pwin_get_attr.c \ pwin_get_errhandler.c \ pwin_get_group.c \ + pwin_get_info.c \ pwin_get_name.c \ pwin_lock.c \ + pwin_lock_all.c \ pwin_post.c \ pwin_set_attr.c \ pwin_set_errhandler.c \ + pwin_set_info.c \ pwin_set_name.c \ + pwin_shared_query.c \ pwin_start.c \ + pwin_sync.c \ pwin_test.c \ pwin_unlock.c \ + pwin_unlock_all.c \ pwin_wait.c if OMPI_PROVIDE_MPI_FILE_INTERFACE diff --git a/ompi/mpi/c/profile/defines.h b/ompi/mpi/c/profile/defines.h index 17d512eae8..478fc7cb69 100644 --- a/ompi/mpi/c/profile/defines.h +++ b/ompi/mpi/c/profile/defines.h @@ -108,6 +108,7 @@ #define MPI_Comm_split PMPI_Comm_split #define MPI_Comm_split_type PMPI_Comm_split_type #define MPI_Comm_test_inter PMPI_Comm_test_inter +#define MPI_Compare_and_swap PMPI_Compare_and_swap #define MPI_Dims_create PMPI_Dims_create #define MPI_Errhandler_c2f PMPI_Errhandler_c2f #define MPI_Errhandler_f2c PMPI_Errhandler_f2c @@ -118,6 +119,7 @@ #define MPI_Error_class PMPI_Error_class #define MPI_Error_string PMPI_Error_string #define MPI_Exscan PMPI_Exscan +#define MPI_Fetch_and_op PMPI_Fetch_and_op #define MPI_Iexscan PMPI_Iexscan #define MPI_File_c2f PMPI_File_c2f #define MPI_File_call_errhandler PMPI_File_call_errhandler @@ -188,6 
+190,7 @@ #define MPI_Get_elements PMPI_Get_elements #define MPI_Get_elements_x PMPI_Get_elements_x #define MPI_Get PMPI_Get +#define MPI_Get_accumulate PMPI_Get_accumulate #define MPI_Get_library_version PMPI_Get_library_version #define MPI_Get_processor_name PMPI_Get_processor_name #define MPI_Get_version PMPI_Get_version @@ -272,6 +275,7 @@ #define MPI_Publish_name PMPI_Publish_name #define MPI_Put PMPI_Put #define MPI_Query_thread PMPI_Query_thread +#define MPI_Raccumulate PMPI_Raccumulate #define MPI_Recv_init PMPI_Recv_init #define MPI_Recv PMPI_Recv #define MPI_Reduce PMPI_Reduce @@ -286,6 +290,9 @@ #define MPI_Request_f2c PMPI_Request_f2c #define MPI_Request_free PMPI_Request_free #define MPI_Request_get_status PMPI_Request_get_status +#define MPI_Rget PMPI_Rget +#define MPI_Rget_accumulate PMPI_Rget_accumulate +#define MPI_Rput PMPI_Rput #define MPI_Rsend_init PMPI_Rsend_init #define MPI_Rsend PMPI_Rsend #define MPI_Scan PMPI_Scan @@ -361,29 +368,44 @@ #define MPI_Waitall PMPI_Waitall #define MPI_Waitany PMPI_Waitany #define MPI_Waitsome PMPI_Waitsome +#define MPI_Win_allocate PMPI_Win_allocate +#define MPI_Win_allocate_shared PMPI_Win_allocate_shared +#define MPI_Win_attach PMPI_Win_attach #define MPI_Win_c2f PMPI_Win_c2f #define MPI_Win_call_errhandler PMPI_Win_call_errhandler #define MPI_Win_complete PMPI_Win_complete #define MPI_Win_create_errhandler PMPI_Win_create_errhandler #define MPI_Win_create_keyval PMPI_Win_create_keyval #define MPI_Win_create PMPI_Win_create +#define MPI_Win_create_dynamic PMPI_Win_create_dynamic #define MPI_Win_delete_attr PMPI_Win_delete_attr +#define MPI_Win_detach PMPI_Win_detach #define MPI_Win_f2c PMPI_Win_f2c #define MPI_Win_fence PMPI_Win_fence +#define MPI_Win_flush PMPI_Win_flush +#define MPI_Win_flush_all PMPI_Win_flush_all +#define MPI_Win_flush_local PMPI_Win_flush_local +#define MPI_Win_flush_local_all PMPI_Win_flush_local_all #define MPI_Win_free_keyval PMPI_Win_free_keyval #define MPI_Win_free PMPI_Win_free #define MPI_Win_get_attr PMPI_Win_get_attr #define MPI_Win_get_errhandler PMPI_Win_get_errhandler #define MPI_Win_get_group PMPI_Win_get_group +#define MPI_Win_get_info PMPI_Win_get_info #define MPI_Win_get_name PMPI_Win_get_name #define MPI_Win_lock PMPI_Win_lock +#define MPI_Win_lock_all PMPI_Win_lock_all #define MPI_Win_post PMPI_Win_post #define MPI_Win_set_attr PMPI_Win_set_attr #define MPI_Win_set_errhandler PMPI_Win_set_errhandler +#define MPI_Win_set_info PMPI_Win_set_info #define MPI_Win_set_name PMPI_Win_set_name +#define MPI_Win_shared_query PMPI_Win_shared_query #define MPI_Win_start PMPI_Win_start +#define MPI_Win_sync PMPI_Win_sync #define MPI_Win_test PMPI_Win_test #define MPI_Win_unlock PMPI_Win_unlock +#define MPI_Win_unlock_all PMPI_Win_unlock_all #define MPI_Win_wait PMPI_Win_wait #define MPI_Wtick PMPI_Wtick #define MPI_Wtime PMPI_Wtime diff --git a/ompi/mpi/c/put.c b/ompi/mpi/c/put.c index 8d95ab23ff..dc7dbb8f2b 100644 --- a/ompi/mpi/c/put.c +++ b/ompi/mpi/c/put.c @@ -59,8 +59,6 @@ int MPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datat } else if (ompi_win_peer_invalid(win, target_rank) && (MPI_PROC_NULL != target_rank)) { rc = MPI_ERR_RANK; - } else if (!ompi_win_comm_allowed(win)) { - rc = MPI_ERR_RMA_SYNC; } else if (NULL == target_datatype || MPI_DATATYPE_NULL == target_datatype) { rc = MPI_ERR_TYPE; diff --git a/ompi/mpi/c/raccumulate.c b/ompi/mpi/c/raccumulate.c new file mode 100644 index 0000000000..055a04fde8 --- /dev/null +++ b/ompi/mpi/c/raccumulate.c @@ -0,0 +1,141 @@ +/* -*- 
Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Sun Microsystmes, Inc. All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Raccumulate = PMPI_Raccumulate +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Accumlate"; + +int MPI_Raccumulate(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request) +{ + int rc; + ompi_win_t *ompi_win = (ompi_win_t*) win; + + MEMCHECKER( + memchecker_datatype(origin_datatype); + memchecker_datatype(target_datatype); + memchecker_call(&opal_memchecker_base_isdefined, origin_addr, origin_count, origin_datatype); + ); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if (MPI_OP_NULL == op || MPI_NO_OP == op) { + rc = MPI_ERR_OP; + } else if (!ompi_op_is_intrinsic(op)) { + rc = MPI_ERR_OP; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + if (OMPI_SUCCESS == rc) { + /* While technically the standard probably requires that the + datatypes used with MPI_REPLACE conform to all the rules + for other reduction operators, we don't require such + behavior, as checking for it is expensive here and we don't + care in implementation.. */ + if (op != &ompi_mpi_op_replace.op && op != &ompi_mpi_op_no_op.op) { + ompi_datatype_t *op_check_dt, *origin_check_dt; + char *msg; + + /* RACCUMULATE, unlike REDUCE, can use with derived + datatypes with predefinied operations, with some + restrictions outlined in MPI-2:6.3.4. The derived + datatype must be composed entierly from one predefined + datatype (so you can do all the construction you want, + but at the bottom, you can only use one datatype, say, + MPI_INT). 
If the datatype at the target isn't + predefined, then make sure it's composed of only one + datatype, and check that datatype against + ompi_op_is_valid(). */ + origin_check_dt = ompi_datatype_get_single_predefined_type_from_args(origin_datatype); + op_check_dt = ompi_datatype_get_single_predefined_type_from_args(target_datatype); + + if( !((origin_check_dt == op_check_dt) & (NULL != op_check_dt)) ) { + OMPI_ERRHANDLER_RETURN(MPI_ERR_ARG, win, MPI_ERR_ARG, FUNC_NAME); + } + + /* check to make sure primitive type is valid for + reduction. Should do this on the target, but + then can't get the errcode back for this + call */ + if (!ompi_op_is_valid(op, op_check_dt, &msg, FUNC_NAME)) { + int ret = OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_OP, msg); + free(msg); + return ret; + } + } + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) { + *request = &ompi_request_empty; + return MPI_SUCCESS; + } + + OPAL_CR_ENTER_LIBRARY(); + + /* TODO: don't cast away the const */ + rc = ompi_win->w_osc_module->osc_raccumulate((void*) origin_addr, + origin_count, + origin_datatype, + target_rank, + target_disp, + target_count, + target_datatype, + op, win, request); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/rget.c b/ompi/mpi/c/rget.c new file mode 100644 index 0000000000..85c396abe8 --- /dev/null +++ b/ompi/mpi/c/rget.c @@ -0,0 +1,84 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. ALl rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Rget = PMPI_Rget +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Rget"; + + +int MPI_Rget(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) { + *request = &ompi_request_empty; + return MPI_SUCCESS; + } + + OPAL_CR_ENTER_LIBRARY(); + + rc = win->w_osc_module->osc_rget(origin_addr, origin_count, origin_datatype, + target_rank, target_disp, target_count, + target_datatype, win, request); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/rget_accumulate.c b/ompi/mpi/c/rget_accumulate.c new file mode 100644 index 0000000000..4bebb9503b --- /dev/null +++ b/ompi/mpi/c/rget_accumulate.c @@ -0,0 +1,151 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Sun Microsystmes, Inc. All rights reserved. + * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All right + * reserved. 
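MPI_Rget above (like MPI_Rput and the request-based accumulates) hands back an MPI_Request so the transfer can overlap with local work inside an access epoch; a small, hypothetical usage sketch (illustrative names, not part of the patch):

#include <mpi.h>

static void overlapped_get(MPI_Win win, int target, double *local, int n)
{
    MPI_Request req;

    MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
    MPI_Rget(local, n, MPI_DOUBLE, target, 0, n, MPI_DOUBLE, win, &req);

    /* ... independent local work overlaps with the transfer here ... */

    MPI_Wait(&req, MPI_STATUS_IGNORE);   /* local buffer is usable from here on */
    MPI_Win_unlock(target, win);
}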
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/request/request.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/op/op.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/datatype/ompi_datatype_internal.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Rget_accumulate = PMPI_Rget_accumulate +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Rget_accumlate"; + +int MPI_Rget_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request) +{ + int rc; + ompi_win_t *ompi_win = (ompi_win_t*) win; + + MEMCHECKER( + memchecker_datatype(origin_datatype); + memchecker_datatype(target_datatype); + memchecker_call(&opal_memchecker_base_isdefined, (void *) origin_addr, origin_count, origin_datatype); + ); + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if (MPI_OP_NULL == op) { + rc = MPI_ERR_OP; + } else if (!ompi_op_is_intrinsic(op)) { + rc = MPI_ERR_OP; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + /* the origin datatype is meaningless when using MPI_OP_NO_OP */ + if (&ompi_mpi_op_no_op.op != op) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + } else { + rc = OMPI_SUCCESS; + } + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + if (OMPI_SUCCESS == rc) { + /* While technically the standard probably requires that the + datatypes used with MPI_REPLACE conform to all the rules + for other reduction operators, we don't require such + behavior, as checking for it is expensive here and we don't + care in implementation.. */ + if (op != &ompi_mpi_op_replace.op && op != &ompi_mpi_op_no_op.op) { + ompi_datatype_t *op_check_dt, *origin_check_dt; + char *msg; + + /* RGET_ACCUMULATE, unlike REDUCE, can use with derived + datatypes with predefinied operations, with some + restrictions outlined in MPI-2:6.3.4. The derived + datatype must be composed entierly from one predefined + datatype (so you can do all the construction you want, + but at the bottom, you can only use one datatype, say, + MPI_INT). If the datatype at the target isn't + predefined, then make sure it's composed of only one + datatype, and check that datatype against + ompi_op_is_valid(). */ + origin_check_dt = ompi_datatype_get_single_predefined_type_from_args(origin_datatype); + op_check_dt = ompi_datatype_get_single_predefined_type_from_args(target_datatype); + + if( !((origin_check_dt == op_check_dt) & (NULL != op_check_dt)) ) { + OMPI_ERRHANDLER_RETURN(MPI_ERR_ARG, win, MPI_ERR_ARG, FUNC_NAME); + } + + /* check to make sure primitive type is valid for + reduction. 
Should do this on the target, but + then can't get the errcode back for this + call */ + if (!ompi_op_is_valid(op, op_check_dt, &msg, FUNC_NAME)) { + int ret = OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_OP, msg); + free(msg); + return ret; + } + } + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) { + *request = &ompi_request_empty; + return MPI_SUCCESS; + } + + OPAL_CR_ENTER_LIBRARY(); + + /* TODO: do not cast away the const */ + rc = ompi_win->w_osc_module->osc_rget_accumulate((void *) origin_addr, + origin_count, + origin_datatype, + result_addr, + result_count, + result_datatype, + target_rank, + target_disp, + target_count, + target_datatype, + op, win, request); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/rput.c b/ompi/mpi/c/rput.c new file mode 100644 index 0000000000..16aae363f6 --- /dev/null +++ b/ompi/mpi/c/rput.c @@ -0,0 +1,88 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" +#include "ompi/datatype/ompi_datatype.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Rput = PMPI_Rput +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Rput"; + + +int MPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request) +{ + int rc; + + if (MPI_PARAM_CHECK) { + rc = OMPI_SUCCESS; + + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (origin_count < 0 || target_count < 0) { + rc = MPI_ERR_COUNT; + } else if (ompi_win_peer_invalid(win, target_rank) && + (MPI_PROC_NULL != target_rank)) { + rc = MPI_ERR_RANK; + } else if (NULL == target_datatype || + MPI_DATATYPE_NULL == target_datatype) { + rc = MPI_ERR_TYPE; + } else if ( target_disp < 0 ) { + rc = MPI_ERR_DISP; + } else { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, origin_datatype, origin_count); + if (OMPI_SUCCESS == rc) { + OMPI_CHECK_DATATYPE_FOR_ONE_SIDED(rc, target_datatype, target_count); + } + } + OMPI_ERRHANDLER_CHECK(rc, win, rc, FUNC_NAME); + } + + if (MPI_PROC_NULL == target_rank) { + *request = &ompi_request_empty; + return MPI_SUCCESS; + } + + OPAL_CR_ENTER_LIBRARY(); + + /* TODO: do not cast away the const */ + rc = win->w_osc_module->osc_rput((void *) origin_addr, origin_count, origin_datatype, + target_rank, target_disp, 
target_count, + target_datatype, win, request); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_allocate.c b/ompi/mpi/c/win_allocate.c new file mode 100644 index 0000000000..9d3c81ed58 --- /dev/null +++ b/ompi/mpi/c/win_allocate.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_allocate = PMPI_Win_allocate +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_allocate"; + + +int MPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win) +{ + int ret = MPI_SUCCESS; + + MEMCHECKER( + memchecker_comm(comm); + ); + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_comm_invalid (comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, + FUNC_NAME); + + } else if (NULL == info || ompi_info_is_freed(info)) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_INFO, + FUNC_NAME); + + } else if (NULL == win) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_WIN, FUNC_NAME); + } else if ( size < 0 ) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_SIZE, FUNC_NAME); + } else if ( disp_unit <= 0 ) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_DISP, FUNC_NAME); + } + } + + /* communicator must be an intracommunicator */ + if (OMPI_COMM_IS_INTER(comm)) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COMM, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = ompi_win_allocate((size_t)size, disp_unit, info, + comm, baseptr, win); + if (OMPI_SUCCESS != ret) { + *win = MPI_WIN_NULL; + OPAL_CR_EXIT_LIBRARY(); + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_WIN, FUNC_NAME); + } + + OPAL_CR_EXIT_LIBRARY(); + return MPI_SUCCESS; +} diff --git a/ompi/mpi/c/win_allocate_shared.c b/ompi/mpi/c/win_allocate_shared.c new file mode 100644 index 0000000000..07e40b2c9a --- /dev/null +++ b/ompi/mpi/c/win_allocate_shared.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
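For context, a minimal usage sketch of the MPI_Win_allocate binding added above; this is not part of the patch, and the element count and datatype are purely illustrative:

    #include <mpi.h>

    int main(int argc, char **argv)
    {
        MPI_Win win;
        double *base = NULL;                    /* filled in by MPI_Win_allocate */
        MPI_Aint size = 1024 * sizeof(double);

        MPI_Init(&argc, &argv);

        /* allocate the window memory and create the window in one call */
        MPI_Win_allocate(size, sizeof(double), MPI_INFO_NULL,
                         MPI_COMM_WORLD, &base, &win);

        MPI_Win_fence(0, win);
        base[0] = 42.0;                         /* local store into the window memory */
        MPI_Win_fence(0, win);

        MPI_Win_free(&win);                     /* also releases the allocated memory */
        MPI_Finalize();
        return 0;
    }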
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_allocate_shared = PMPI_Win_allocate_shared +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_allocate_shared"; + + +int MPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win) +{ + int ret = MPI_SUCCESS; + + MEMCHECKER( + memchecker_comm(comm); + ); + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_comm_invalid (comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, + FUNC_NAME); + + } else if (NULL == info || ompi_info_is_freed(info)) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_INFO, + FUNC_NAME); + + } else if (NULL == win) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_WIN, FUNC_NAME); + } else if ( size < 0 ) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_SIZE, FUNC_NAME); + } + } + + /* communicator must be an intracommunicator */ + if (OMPI_COMM_IS_INTER(comm)) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COMM, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = ompi_win_allocate_shared((size_t)size, disp_unit, info, + comm, baseptr, win); + if (OMPI_SUCCESS != ret) { + *win = MPI_WIN_NULL; + OPAL_CR_EXIT_LIBRARY(); + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_WIN, FUNC_NAME); + } + + OPAL_CR_EXIT_LIBRARY(); + return MPI_SUCCESS; +} diff --git a/ompi/mpi/c/win_attach.c b/ompi/mpi/c/win_attach.c new file mode 100644 index 0000000000..9bd52c016d --- /dev/null +++ b/ompi/mpi/c/win_attach.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_attach = PMPI_Win_attach +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_attach"; + +int MPI_Win_attach(MPI_Win win, void *base, MPI_Aint size) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (NULL == base) { + ret = MPI_ERR_ARG; + } + OMPI_ERRHANDLER_CHECK(ret, win, ret, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = win->w_osc_module->osc_win_attach(win, base, size); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_complete.c b/ompi/mpi/c/win_complete.c index ac77001a92..fa45c38797 100644 --- a/ompi/mpi/c/win_complete.c +++ b/ompi/mpi/c/win_complete.c @@ -45,8 +45,6 @@ int MPI_Win_complete(MPI_Win win) if (ompi_win_invalid(win)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); - } else if (0 == (ompi_win_get_mode(win) & OMPI_WIN_STARTED)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/c/win_create_dynamic.c b/ompi/mpi/c/win_create_dynamic.c new file mode 100644 index 0000000000..20772b3caa --- /dev/null +++ b/ompi/mpi/c/win_create_dynamic.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_create_dynamic = PMPI_Win_create_dynamic +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_create_dynamic"; + + +int MPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win) +{ + int ret = MPI_SUCCESS; + + MEMCHECKER( + memchecker_comm(comm); + ); + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_comm_invalid (comm)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_COMM, + FUNC_NAME); + + } else if (NULL == info || ompi_info_is_freed(info)) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_INFO, + FUNC_NAME); + + } else if (NULL == win) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_WIN, FUNC_NAME); + } + } + + /* communicator must be an intracommunicator */ + if (OMPI_COMM_IS_INTER(comm)) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_COMM, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create_dynamic window and return */ + ret = ompi_win_create_dynamic(info, comm, win); + if (OMPI_SUCCESS != ret) { + *win = MPI_WIN_NULL; + OPAL_CR_EXIT_LIBRARY(); + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_WIN, FUNC_NAME); + } + + OPAL_CR_EXIT_LIBRARY(); + return MPI_SUCCESS; +} diff --git a/ompi/mpi/c/win_detach.c b/ompi/mpi/c/win_detach.c new file mode 100644 index 0000000000..8d16b5750c --- /dev/null +++ b/ompi/mpi/c/win_detach.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_detach = PMPI_Win_detach +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_detach"; + +int MPI_Win_detach(MPI_Win win, void *base) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (NULL == base) { + ret = MPI_ERR_ARG; + } + OMPI_ERRHANDLER_CHECK(ret, win, ret, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = win->w_osc_module->osc_win_detach(win, base); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_fence.c b/ompi/mpi/c/win_fence.c index fa6c7a74a5..3a890d2e4b 100644 --- a/ompi/mpi/c/win_fence.c +++ b/ompi/mpi/c/win_fence.c @@ -48,10 +48,6 @@ int MPI_Win_fence(int assert, MPI_Win win) } else if (0 != (assert & ~(MPI_MODE_NOSTORE | MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE | MPI_MODE_NOSUCCEED))) { return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_ASSERT, FUNC_NAME); - } else if (0 != (ompi_win_get_mode(win) & - (OMPI_WIN_POSTED | OMPI_WIN_STARTED))) { - /* If we're in a post or start, we can't be in a fence */ - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/c/win_flush.c b/ompi/mpi/c/win_flush.c new file mode 100644 index 0000000000..083baa148b --- /dev/null +++ b/ompi/mpi/c/win_flush.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
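As a usage sketch for the dynamic-window path introduced above (MPI_Win_create_dynamic, MPI_Win_attach, MPI_Win_detach); not part of the patch, and the exchange of the remote address with peers is only hinted at:

    #include <mpi.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        MPI_Win win;
        MPI_Aint disp;
        int *buf;

        MPI_Init(&argc, &argv);

        MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &win);

        buf = malloc(100 * sizeof(int));
        MPI_Win_attach(win, buf, 100 * sizeof(int));

        /* for dynamic windows the target displacement is the attached address;
           a real program would distribute it, e.g. with MPI_Bcast or MPI_Send */
        MPI_Get_address(buf, &disp);

        /* ... RMA epochs using disp as target_disp ... */

        MPI_Win_detach(win, buf);
        free(buf);
        MPI_Win_free(&win);
        MPI_Finalize();
        return 0;
    }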
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_flush = PMPI_Win_flush +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_flush"; + +int MPI_Win_flush(int rank, MPI_Win win) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + OMPI_ERRHANDLER_CHECK(ret, win, ret, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = win->w_osc_module->osc_flush(rank, win); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_flush_all.c b/ompi/mpi/c/win_flush_all.c new file mode 100644 index 0000000000..a0a4039de5 --- /dev/null +++ b/ompi/mpi/c/win_flush_all.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_flush_all = PMPI_Win_flush_all +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_flush_all"; + +int MPI_Win_flush_all(MPI_Win win) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + OMPI_ERRHANDLER_CHECK(ret, win, ret, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = win->w_osc_module->osc_flush_all(win); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_flush_local.c b/ompi/mpi/c/win_flush_local.c new file mode 100644 index 0000000000..215e9be8ae --- /dev/null +++ b/ompi/mpi/c/win_flush_local.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. 
All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_flush_local = PMPI_Win_flush_local +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_flush_local"; + +int MPI_Win_flush_local(int rank, MPI_Win win) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + OMPI_ERRHANDLER_CHECK(ret, win, ret, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = win->w_osc_module->osc_flush_local(rank, win); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_flush_local_all.c b/ompi/mpi/c/win_flush_local_all.c new file mode 100644 index 0000000000..90e61dea7b --- /dev/null +++ b/ompi/mpi/c/win_flush_local_all.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_flush_local_all = PMPI_Win_flush_local_all +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_flush_local_all"; + +int MPI_Win_flush_local_all(MPI_Win win) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + OMPI_ERRHANDLER_CHECK(ret, win, ret, FUNC_NAME); + } + + OPAL_CR_ENTER_LIBRARY(); + + /* create window and return */ + ret = win->w_osc_module->osc_flush_local_all(win); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_free.c b/ompi/mpi/c/win_free.c index 9cca7972f4..527d6c0a0f 100644 --- a/ompi/mpi/c/win_free.c +++ b/ompi/mpi/c/win_free.c @@ -44,10 +44,6 @@ int MPI_Win_free(MPI_Win *win) if (ompi_win_invalid(*win)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); - } else if (OMPI_WIN_ACCESS_EPOCH & ompi_win_get_mode(*win)) { - return OMPI_ERRHANDLER_INVOKE(*win, - MPI_ERR_RMA_SYNC, - FUNC_NAME); } } diff --git a/ompi/mpi/c/win_get_info.c b/ompi/mpi/c/win_get_info.c new file mode 100644 index 0000000000..954852cf97 --- /dev/null +++ b/ompi/mpi/c/win_get_info.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_get_info = PMPI_Win_get_info +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_get_info"; + + +int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used) +{ + int ret; + + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + + if (NULL == info_used) { + return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_ARG, FUNC_NAME); + } + } + + OPAL_CR_ENTER_LIBRARY(); + + ret = win->w_osc_module->osc_get_info(win, info_used); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_lock.c b/ompi/mpi/c/win_lock.c index 0fa37c0312..ce8daa8880 100644 --- a/ompi/mpi/c/win_lock.c +++ b/ompi/mpi/c/win_lock.c @@ -52,8 +52,6 @@ int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win) return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RANK, FUNC_NAME); } else if (0 != (assert & ~(MPI_MODE_NOCHECK))) { return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_ASSERT, FUNC_NAME); - } else if (0 != (ompi_win_get_mode(win) & OMPI_WIN_ACCESS_EPOCH)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } else if (! 
ompi_win_allow_locks(win)) { return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } diff --git a/ompi/mpi/c/win_lock_all.c b/ompi/mpi/c/win_lock_all.c new file mode 100644 index 0000000000..828316d5d1 --- /dev/null +++ b/ompi/mpi/c/win_lock_all.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_lock_all = PMPI_Win_lock_all +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_lock_all"; + + +int MPI_Win_lock_all(int assert, MPI_Win win) +{ + int rc; + + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (0 != (assert & ~(MPI_MODE_NOCHECK))) { + return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_ASSERT, FUNC_NAME); + } else if (! ompi_win_allow_locks(win)) { + return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); + } + } + + OPAL_CR_ENTER_LIBRARY(); + + rc = win->w_osc_module->osc_lock_all(assert, win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_post.c b/ompi/mpi/c/win_post.c index 6aadfb24c5..a1de2ec18c 100644 --- a/ompi/mpi/c/win_post.c +++ b/ompi/mpi/c/win_post.c @@ -48,8 +48,6 @@ int MPI_Win_post(MPI_Group group, int assert, MPI_Win win) } else if (0 != (assert & ~(MPI_MODE_NOCHECK | MPI_MODE_NOSTORE | MPI_MODE_NOPUT))) { return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_ASSERT, FUNC_NAME); - } else if (0 != (ompi_win_get_mode(win) & OMPI_WIN_EXPOSE_EPOCH)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/c/win_set_info.c b/ompi/mpi/c/win_set_info.c new file mode 100644 index 0000000000..f96b370c5c --- /dev/null +++ b/ompi/mpi/c/win_set_info.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2013 Sandia National Laboratories. All rights reserved. 
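A hypothetical passive-target fragment exercising the new MPI_Win_lock_all, MPI_Win_flush, and MPI_Win_unlock_all entry points shown above; the window, target rank, and displacement are assumptions of the sketch:

    #include <mpi.h>

    /* write one int to rank `target` and force its remote completion
       without closing the access epoch */
    static void put_and_flush(MPI_Win win, int target, int value)
    {
        MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

        MPI_Put(&value, 1, MPI_INT, target, 0 /* target_disp */, 1, MPI_INT, win);
        MPI_Win_flush(target, win);     /* value is now visible at the target */

        MPI_Win_unlock_all(win);
    }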
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_set_info = PMPI_Win_set_info +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_set_info"; + + +int MPI_Win_set_info(MPI_Win win, MPI_Info info) +{ + int ret; + + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + + if (NULL == info || MPI_INFO_NULL == info || + ompi_info_is_freed(info)) { + return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_INFO, FUNC_NAME); + } + } + + OPAL_CR_ENTER_LIBRARY(); + + ret = win->w_osc_module->osc_set_info(win, info); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_shared_query.c b/ompi/mpi/c/win_shared_query.c new file mode 100644 index 0000000000..565e09bc96 --- /dev/null +++ b/ompi/mpi/c/win_shared_query.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_shared_query = PMPI_Win_shared_query +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_shared_query"; + + +int MPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr) +{ + int rc; + size_t tsize; + + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } else if (ompi_win_peer_invalid(win, rank)) { + return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RANK, FUNC_NAME); + } + } + + OPAL_CR_ENTER_LIBRARY(); + + if (NULL != win->w_osc_module->osc_win_shared_query) { + rc = win->w_osc_module->osc_win_shared_query(win, rank, &tsize, disp_unit, baseptr); + *size = tsize; + } else { + rc = MPI_ERR_RMA_FLAVOR; + } + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_start.c b/ompi/mpi/c/win_start.c index 12a0d92d3e..f0e7a9f44b 100644 --- a/ompi/mpi/c/win_start.c +++ b/ompi/mpi/c/win_start.c @@ -47,8 +47,6 @@ int MPI_Win_start(MPI_Group group, int assert, MPI_Win win) return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); } else if (0 != (assert & ~(MPI_MODE_NOCHECK))) { return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_ASSERT, FUNC_NAME); - } else if (0 != (ompi_win_get_mode(win) & OMPI_WIN_ACCESS_EPOCH)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/c/win_sync.c b/ompi/mpi/c/win_sync.c new file mode 100644 index 0000000000..90e3fba93d --- /dev/null +++ b/ompi/mpi/c/win_sync.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
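For the shared-memory flavor, a sketch of how MPI_Win_allocate_shared and the MPI_Win_shared_query binding above fit together; the communicator is assumed to contain only processes on one node (e.g. obtained from MPI_Comm_split_type), and the helper name and sizes are illustrative:

    #include <mpi.h>

    /* allocate a shared window and map the previous rank's segment */
    static double *map_left_neighbour(MPI_Comm node_comm, MPI_Win *win)
    {
        MPI_Aint size = 64 * sizeof(double), nsize;
        int disp_unit, rank;
        double *mine, *left;

        MPI_Comm_rank(node_comm, &rank);
        MPI_Win_allocate_shared(size, sizeof(double), MPI_INFO_NULL,
                                node_comm, &mine, win);

        /* query the base pointer of the neighbouring rank's segment */
        MPI_Win_shared_query(*win, rank > 0 ? rank - 1 : 0, &nsize,
                             &disp_unit, &left);
        return left;
    }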
+ * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" + +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/info/info.h" +#include "ompi/win/win.h" +#include "ompi/memchecker.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_sync = PMPI_Win_sync +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_sync"; + +int MPI_Win_sync(MPI_Win win) +{ + int ret = MPI_SUCCESS; + + /* argument checking */ + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + } + + OPAL_CR_ENTER_LIBRARY(); + + ret = win->w_osc_module->osc_sync(win); + OMPI_ERRHANDLER_RETURN(ret, win, ret, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_test.c b/ompi/mpi/c/win_test.c index a28adcac78..1a168ad499 100644 --- a/ompi/mpi/c/win_test.c +++ b/ompi/mpi/c/win_test.c @@ -45,8 +45,6 @@ int MPI_Win_test(MPI_Win win, int *flag) if (ompi_win_invalid(win)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); - } else if (0 == (ompi_win_get_mode(win) & OMPI_WIN_POSTED)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/c/win_unlock.c b/ompi/mpi/c/win_unlock.c index 28e5d43065..416ed1e810 100644 --- a/ompi/mpi/c/win_unlock.c +++ b/ompi/mpi/c/win_unlock.c @@ -47,8 +47,6 @@ int MPI_Win_unlock(int rank, MPI_Win win) return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); } else if (ompi_win_peer_invalid(win, rank)) { return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RANK, FUNC_NAME); - } else if (0 == (ompi_win_get_mode(win) & OMPI_WIN_LOCK_ACCESS)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/c/win_unlock_all.c b/ompi/mpi/c/win_unlock_all.c new file mode 100644 index 0000000000..a5c85a64bd --- /dev/null +++ b/ompi/mpi/c/win_unlock_all.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "ompi_config.h" +#include + +#include "ompi/mpi/c/bindings.h" +#include "ompi/runtime/params.h" +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/win/win.h" +#include "ompi/mca/osc/osc.h" + +#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES +#pragma weak MPI_Win_unlock_all = PMPI_Win_unlock_all +#endif + +#if OMPI_PROFILING_DEFINES +#include "ompi/mpi/c/profile/defines.h" +#endif + +static const char FUNC_NAME[] = "MPI_Win_unlock_all"; + + +int MPI_Win_unlock_all(MPI_Win win) +{ + int rc; + + if (MPI_PARAM_CHECK) { + OMPI_ERR_INIT_FINALIZE(FUNC_NAME); + + if (ompi_win_invalid(win)) { + return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); + } + } + + OPAL_CR_ENTER_LIBRARY(); + + rc = win->w_osc_module->osc_unlock_all(win); + OMPI_ERRHANDLER_RETURN(rc, win, rc, FUNC_NAME); +} diff --git a/ompi/mpi/c/win_wait.c b/ompi/mpi/c/win_wait.c index 644a7c74d3..3ee9f5bb36 100644 --- a/ompi/mpi/c/win_wait.c +++ b/ompi/mpi/c/win_wait.c @@ -45,8 +45,6 @@ int MPI_Win_wait(MPI_Win win) if (ompi_win_invalid(win)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_WIN, FUNC_NAME); - } else if (0 == (ompi_win_get_mode(win) & OMPI_WIN_POSTED)) { - return OMPI_ERRHANDLER_INVOKE(win, MPI_ERR_RMA_SYNC, FUNC_NAME); } } diff --git a/ompi/mpi/man/man3/MPI_Accumulate.3in b/ompi/mpi/man/man3/MPI_Accumulate.3in index 404c043742..e9f903b91d 100644 --- a/ompi/mpi/man/man3/MPI_Accumulate.3in +++ b/ompi/mpi/man/man3/MPI_Accumulate.3in @@ -1,11 +1,11 @@ .\" -*- nroff -*- -.\" Copyright 2013 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2013-2014 Los Alamos National Security, LLC. All rights reserved. .\" Copyright 2010 Cisco Systems, Inc. All rights reserved. .\" Copyright 2006-2008 Sun Microsystems, Inc. .\" Copyright (c) 1996 Thinking Machines Corporation .TH MPI_Accumulate 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" .SH NAME -\fBMPI_Accumulate \fP \- Combines the contents of the origin buffer with that of a target buffer. +\fBMPI_Accumulate\fP, \fBMPI_Raccumulate\fP \- Combines the contents of the origin buffer with that of a target buffer. .SH SYNTAX .ft R @@ -17,6 +17,12 @@ int MPI_Accumulate(const void *\fIorigin_addr\fP, int \fIorigin_count\fP, MPI_Aint \fItarget_disp\fP, int \fItarget_count\fP, MPI_Datatype \fItarget_datatype\fP, MPI_Op \fIop\fP, MPI_Win \fIwin\fP) +int MPI_Raccumulate(const void *\fIorigin_addr\fP, int \fIorigin_count\fP, + MPI_Datatype \fIorigin_datatype\fP, int \fItarget_rank\fP, + MPI_Aint \fItarget_disp\fP, int \fItarget_count\fP, + MPI_Datatype \fItarget_datatype\fP, MPI_Op \fIop\fP, MPI_Win \fIwin\fP, + MPI_Request *\fIrequest\fP) + .fi .SH Fortran Syntax (see FORTRAN 77 NOTES) .nf @@ -28,6 +34,13 @@ MPI_ACCUMULATE(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, TARGET_COUNT, TARGET_DATATYPE, OP, WIN, IERROR \fP +MPI_RACCUMULATE(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, + TARGET_DISP, TARGET_COUNT, TARGET_DATATYPE, OP, WIN, REQUEST, IERROR\fP) + \fIORIGIN_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, TARGET_COUNT, + TARGET_DATATYPE, OP, WIN, REQUEST, IERROR \fP + .fi .SH C++ Syntax .nf @@ -79,19 +92,22 @@ Window object (handle). 
.SH OUTPUT PARAMETER .ft R .TP 1i +MPI_Raccumulate: RMA request +.TP 1i IERROR Fortran only: Error status (integer). .SH DESCRIPTION .ft R -MPI_Accumulate is a function used for one-sided MPI communication that adds the contents of the origin buffer (as defined by \fIorigin_addr\fP, \fIorigin_count\fP, and \fIorigin_datatype\fP) to the buffer specified by the arguments \fItarget_count\fP and \fItarget_datatype\fP, at offset \fItarget_disp\fP, in the target window specified by \fItarget_rank\fP and \fIwin\fP, using the operation \fIop\fP. The target window can only be accessed by processes within the same node. This is similar to MPI_Put, except that data is combined into the target area instead of overwriting it. +\fBMPI_Accumulate\fP is a function used for one-sided MPI communication that adds the contents of the origin buffer (as defined by \fIorigin_addr\fP, \fIorigin_count\fP, and \fIorigin_datatype\fP) to the buffer specified by the arguments \fItarget_count\fP and \fItarget_datatype\fP, at offset \fItarget_disp\fP, in the target window specified by \fItarget_rank\fP and \fIwin\fP, using the operation \fIop\fP. The target window can only be accessed by processes within the same node. This is similar to MPI_Put, except that data is combined into the target area instead of overwriting it. .sp Any of the predefined operations for MPI_Reduce can be used. User-defined functions cannot be used. For example, if \fIop\fP is MPI_SUM, each element of the origin buffer is added to the corresponding element in the target, replacing the former value in the target. .sp Each datatype argument must be a predefined data type or a derived data type, where all basic components are of the same predefined data type. Both datatype arguments must be constructed from the same predefined data type. The operation \fIop\fP applies to elements of that predefined type. The \fItarget_datatype\fP argument must not specify overlapping entries, and the target buffer must fit in the target window. .sp A new predefined operation, MPI_REPLACE, is defined. It corresponds to the associative function f(a, b) =b; that is, the current value in the target memory is replaced by the value supplied by the origin. - +.sp +\fBMPI_Raccumulate\fP is similar to \fBMPI_Accumulate\fP, except that it allocates a communication request object and associates it with the request handle (the argument \fIrequest\fP) that can be used to wait or test for completion. The completion of an \fBMPI_Raccumulate\fP operation indicates that the \fIorigin_addr\fP buffer is free to be updated. It does not indicate that the operation has completed at the target window. .SH FORTRAN 77 NOTES .ft R @@ -107,12 +123,12 @@ where MPI_ADDRESS_KIND is a constant defined in mpif.h and gives the length of the declared integer in bytes. .SH NOTES -MPI_Put is a special case of MPI_Accumulate, with the operation MPI_REPLACE. Note, however, that MPI_Put and MPI_Accumulate have different constraints on concurrent updates. +MPI_Put is a special case of \fBMPI_Accumulate\fP, with the operation MPI_REPLACE. Note, however, that MPI_Put and \fBMPI_Accumulate\fP have different constraints on concurrent updates. .sp It is the user's responsibility to guarantee that, when using the accumulate functions, the target displacement argument is such that accesses to the window are properly aligned according to the data -type arguments in the call to the MPI_Accumulate function. +type arguments in the call to the \fBMPI_Accumulate\fP function. 
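By way of illustration (not part of the manual text), a request-based accumulate might look like the following; the displacement, count, and helper name are arbitrary:

    #include <mpi.h>

    /* add n local doubles into rank `target`'s window at displacement 0 */
    static void sum_into_target(MPI_Win win, int target, const double *src, int n)
    {
        MPI_Request req;

        MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
        MPI_Raccumulate(src, n, MPI_DOUBLE, target, 0, n, MPI_DOUBLE,
                        MPI_SUM, win, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);  /* src may now be reused */
        MPI_Win_unlock(target, win);        /* completes at the target */
    }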
.SH ERRORS Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. C++ functions do not return errors. If the default error handler is set to MPI::ERRORS_THROW_EXCEPTIONS, then on error the C++ exception mechanism will be used to throw an MPI::Exception object. @@ -125,5 +141,5 @@ may be changed with MPI_Comm_set_errhandler; the predefined error handler MPI_ER .ft R .sp MPI_Put -.br +MPI_Get_accumulate MPI_Reduce diff --git a/ompi/mpi/man/man3/MPI_Compare_and_swap.3in b/ompi/mpi/man/man3/MPI_Compare_and_swap.3in new file mode 100644 index 0000000000..969bc817a4 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Compare_and_swap.3in @@ -0,0 +1,96 @@ +.\" -*- nroff -*- +.\" Copyright 2013-2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2006-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Compare_and_swap 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Compare_and_swap\fP \- Perform RMA compare-and-swap + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include +int MPI_Compare_and_swap(const void *\fIorigin_addr\fP, const void *\fIcompar_addr\fP, + void *\fresult_addr\fP, MPI_Datatype \fdatatype\fP, int \fItarget_rank\fP, + MPI_Aint \fItarget_disp\fP, MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax (see FORTRAN 77 NOTES) +.nf +INCLUDE 'mpif.h' +MPI_COMPARE_AND_SWAP(\fIORIGIN_ADDR, COMPARE_ADDR, RESULT_ADDR, DATATYPE, TARGET_RANK, + TARGET_DISP, WIN, IERROR\fP) + \fIORIGIN_ADDR\fP, \fICOMPARE_ADDR\fP, \fIRESULT_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIDATATYPE, TARGET_RANK, WIN, IERROR \fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +origin_addr +Initial address of buffer (choice). +.ft R +.TP +compare_addr +Initial address of compare buffer (choice). +.ft R +.TP +result_addr +Initial address of result buffer (choice). +.ft R +.TP +datatype +Data type of the entry in origin, result, and target buffers (handle). +.ft R +.TP 1i +target_rank +Rank of target (nonnegative integer). +.ft R +.TP 1i +target_disp +Displacement from start of window to beginning of target buffer (nonnegative integer). +.ft R +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +This function compares one element of type \fIdatatype\fP in the compare buffer \fIcompare_addr\fP with the buffer at offset \fItarget_disp\fP in the target window specified by \fItarget_rank\fP and \fIwin\fP and replaces the value at the target with the value in the origin buffer \fIorigin_addr\fP if the compare buffer and the target buffer are identical. The original value at the target is returned in the buffer \fIresult_addr\fP. The parameter \fIdatatype\fP must belong to one of the following categories of predefined datatypes: C integer, Fortran integer, Logical, Multi-language types, or Byte as specified in MPI-3 § 5.9.2 on page 176. +.sp +The origin and result buffers (\fIorigin_addr\fP and \fIresult_addr\fP) must be disjoint. + +.SH FORTRAN 77 NOTES +.ft R +The MPI standard prescribes portable Fortran syntax for +the \fITARGET_DISP\fP argument only for Fortran 90. FORTRAN 77 +users may use the non-portable syntax +.sp +.nf + INTEGER*MPI_ADDRESS_KIND \fITARGET_DISP\fP +.fi +.sp +where MPI_ADDRESS_KIND is a constant defined in mpif.h +and gives the length of the declared integer in bytes. 
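As a usage sketch of the compare-and-swap semantics described above (illustrative only; the flag location at displacement 0 and the lock values are assumptions), a simple remote test-and-set:

    #include <mpi.h>

    /* atomically change the int at displacement 0 on `target` from 0 to 1;
       returns 1 if this process won the race, 0 otherwise */
    static int try_acquire(MPI_Win win, int target)
    {
        const int unlocked = 0, locked = 1;
        int previous;

        MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);
        MPI_Compare_and_swap(&locked, &unlocked, &previous, MPI_INT,
                             target, 0, win);
        MPI_Win_unlock(target, win);

        return previous == unlocked;
    }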
+ +.SH NOTES +It is the user's responsibility to guarantee that, when +using the accumulate functions, the target displacement argument is such +that accesses to the window are properly aligned according to the data +type arguments in the call to the \fBMPI_Compare_and_swap\fP function. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler +may be changed with \fBMPI_Comm_set_errhandler\fP; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. diff --git a/ompi/mpi/man/man3/MPI_Fetch_and_op.3in b/ompi/mpi/man/man3/MPI_Fetch_and_op.3in new file mode 100644 index 0000000000..3b5eb68f0c --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Fetch_and_op.3in @@ -0,0 +1,107 @@ +.\" -*- nroff -*- +.\" Copyright 2013-2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2006-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Fetch_and_op 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Fetch_and_op\fP \- Combines the contents of the origin buffer with that of a target buffer and returns the target buffer value. + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include +int MPI_Fetch_and_op(const void *\fIorigin_addr\fP, void *\fresult_addr\fP, + MPI_Datatype \fdatatype\fP, int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, + MPI_Op \fIop\fP, MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax (see FORTRAN 77 NOTES) +.nf +INCLUDE 'mpif.h' +MPI_FETCH_AND_OP(\fIORIGIN_ADDR, RESULT_ADDR, DATATYPE, TARGET_RANK, + TARGET_DISP, OP, WIN, IERROR\fP) + \fIORIGIN_ADDR\fP, \fIRESULT_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIDATATYPE, TARGET_RANK, OP, WIN, IERROR \fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +origin_addr +Initial address of buffer (choice). +.ft R +.TP +result_addr +Initial address of result buffer (choice). +.ft R +.TP +datatype +Data type of the entry in origin, result, and target buffers (handle). +.ft R +.TP 1i +target_rank +Rank of target (nonnegative integer). +.ft R +.TP 1i +target_disp +Displacement from start of window to beginning of target buffer (nonnegative integer). +.ft R +.TP 1i +op +Reduce operation (handle). +.ft R +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +Accumulate one element of type \fIdatatype\fP from the origin buffer (\fIorigin_addr\fP) to the buffer at offset \fItarget_disp\fP, in the target window specified by \fItarget_rank\fP and \fIwin\fP, using the operation \fIop\fP and return in the result buffer \fIresult_addr\fP the contents of the target buffer before the accumulation. +.sp +The origin and result buffers (\fIorigin_addr\fP and \fIresult_addr\fP) must be disjoint. Any of the predefined operations for \fBMPI_Rreduce\fP, as well as MPI_NO_OP or MPI_REPLACE, can be specified as \fIop\fP; user-defined functions cannot be used. The \fIdatatype\fP argument must be a predefined datatype. The operation is executed atomically. +.sp +A new predefined operation, MPI_REPLACE, is defined. 
It corresponds to the associative function f(a, b) =b; that is, the current value in the target memory is replaced by the value supplied by the origin. +.sp +A new predefined operation, MPI_NO_OP, is defined. It corresponds to the assiciative function f(a, b) = a; that is the current value in the target memory is returned in the result buffer at the origin and no operation is performed on the target buffer. + +.SH FORTRAN 77 NOTES +.ft R +The MPI standard prescribes portable Fortran syntax for +the \fITARGET_DISP\fP argument only for Fortran 90. FORTRAN 77 +users may use the non-portable syntax +.sp +.nf + INTEGER*MPI_ADDRESS_KIND \fITARGET_DISP\fP +.fi +.sp +where MPI_ADDRESS_KIND is a constant defined in mpif.h +and gives the length of the declared integer in bytes. + +.SH NOTES +It is the user's responsibility to guarantee that, when +using the accumulate functions, the target displacement argument is such +that accesses to the window are properly aligned according to the data +type arguments in the call to the MPI_Fetch_and_op function. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler +may be changed with \fBMPI_Comm_set_errhandler\fP; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. + +.SH SEE ALSO +.ft R +.sp +MPI_Get_accumulate +.br +MPI_Reduce diff --git a/ompi/mpi/man/man3/MPI_Get.3in b/ompi/mpi/man/man3/MPI_Get.3in index 6dc9bc8352..c1e1d32613 100644 --- a/ompi/mpi/man/man3/MPI_Get.3in +++ b/ompi/mpi/man/man3/MPI_Get.3in @@ -1,9 +1,11 @@ +.\" -*- nroff -*- .\" Copyright 2010 Cisco Systems, Inc. All rights reserved. .\" Copyright 2006-2008 Sun Microsystems, Inc. .\" Copyright (c) 1996 Thinking Machines Corporation +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. .TH MPI_Get 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" .SH NAME -\fBMPI_Get\fP \- Copies data from the target memory to the origin. +\fBMPI_Get\fP, \fBMPI_Rget\fP \- Copies data from the target memory to the origin. 
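Before the MPI_Get page continues, a brief sketch of the fetch-and-op semantics described in the MPI_Fetch_and_op page above; the counter is assumed to live at displacement 0 on rank 0, and the helper name is hypothetical:

    #include <mpi.h>

    /* atomically fetch-and-add 1 to the int counter hosted at rank 0 */
    static int next_ticket(MPI_Win win)
    {
        const int one = 1;
        int old;

        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
        MPI_Fetch_and_op(&one, &old, MPI_INT, 0 /* rank */, 0 /* disp */,
                         MPI_SUM, win);
        MPI_Win_unlock(0, win);

        return old;   /* value of the counter before the increment */
    }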
.SH SYNTAX .ft R @@ -14,6 +16,11 @@ MPI_Get(void *\fIorigin_addr\fP, int \fIorigin_count\fP, MPI_Datatype \fIorigin_datatype\fP, int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, int \fItarget_count\fP, MPI_Datatype \fItarget_datatype\fP, MPI_Win \fIwin\fP) +MPI_Rget(void *\fIorigin_addr\fP, int \fIorigin_count\fP, MPI_Datatype + \fIorigin_datatype\fP, int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, + int \fItarget_count\fP, MPI_Datatype \fItarget_datatype\fP, MPI_Win \fIwin\fP, + MPI_Request *\fIrequest\fP) + .fi .SH Fortran Syntax (see FORTRAN 77 NOTES) .nf @@ -25,6 +32,13 @@ MPI_GET(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, TARGET_COUNT, TARGET_DATATYPE, WIN, IERROR\fP +MPI_RGET(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, + TARGET_DISP, TARGET_COUNT, TARGET_DATATYPE, WIN, REQUEST, IERROR\fP) + \fIORIGIN_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, + TARGET_COUNT, TARGET_DATATYPE, WIN, REQUEST, IERROR\fP + .fi .SH C++ Syntax .nf @@ -64,15 +78,20 @@ window object used for communication (handle) .SH OUTPUT PARAMETER .ft R +.TP li +request +MPI_Rget: RMA request .TP 1i IERROR Fortran only: Error status (integer). .SH DESCRIPTION .ft R -MPI_Get copies data from the target memory to the origin, similar to MPI_Put, except that the direction of data transfer is reversed. The \fIorigin_datatype\fP may not specify overlapping entries in the origin buffer. The target buffer must be contained within the target window, and the copied data must fit, without truncation, in the origin buffer. Only processes within the same node can access the target window. +\fBMPI_Get\fP copies data from the target memory to the origin, similar to MPI_Put, except that the direction of data transfer is reversed. The \fIorigin_datatype\fP may not specify overlapping entries in the origin buffer. The target buffer must be contained within the target window, and the copied data must fit, without truncation, in the origin buffer. Only processes within the same node can access the target window. .sp +\fBMPI_Rget\fP is similar to \fBMPI_Get\fP, except that it allocates a communication request object and associates it with the request handle (the argument \fIrequest\fP) that can be used to wait or test for completion. The completion of an MPI_Rget operation indicates that the data is available in the origin buffer. If \fIorigin_addr\fP points to memory attached to a window, then the data becomes available in the private copy of this window. + .SH FORTRAN 77 NOTES .ft R The MPI standard prescribes portable Fortran syntax for diff --git a/ompi/mpi/man/man3/MPI_Get_accumulate.3in b/ompi/mpi/man/man3/MPI_Get_accumulate.3in new file mode 100644 index 0000000000..30a77e3f81 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Get_accumulate.3in @@ -0,0 +1,159 @@ +.\" -*- nroff -*- +.\" Copyright 2013-2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2006-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Get_accumulate 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Get_accumulate\fP, \fBMPI_Rget_accumulate\fP \- Combines the contents of the origin buffer with that of a target buffer and returns the target buffer value. 
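A small sketch of the request-based get described in the MPI_Get page above (counts, displacement, and helper name are illustrative, not part of the manual):

    #include <mpi.h>

    /* start a get from rank `target`, overlap other work, then wait */
    static void fetch_row(MPI_Win win, int target, double *dst, int n)
    {
        MPI_Request req;

        MPI_Win_lock(MPI_LOCK_SHARED, target, MPI_MODE_NOCHECK, win);
        MPI_Rget(dst, n, MPI_DOUBLE, target, 0, n, MPI_DOUBLE, win, &req);

        /* ... unrelated computation can proceed here ... */

        MPI_Wait(&req, MPI_STATUS_IGNORE);  /* dst now holds the target data */
        MPI_Win_unlock(target, win);
    }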
+ +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include +int MPI_Get_accumulate(const void *\fIorigin_addr\fP, int \fIorigin_count\fP, + MPI_Datatype \fIorigin_datatype\fP, void *\fIresult_addr\fP, + int \fIresult_count\fP, MPI_Datatype \fIresult_datatype\fP, + int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, int \fItarget_count\fP, + MPI_Datatype \fItarget_datatype\fP, MPI_Op \fIop\fP, MPI_Win \fIwin\fP) + +int MPI_Rget_accumulate(const void *\fIorigin_addr\fP, int \fIorigin_count\fP, + MPI_Datatype \fIorigin_datatype\fP, void *\fIresult_addr\fP, + int \fIresult_count\fP, MPI_Datatype \fIresult_datatype\fP, + int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, int \fItarget_count\fP, + MPI_Datatype \fItarget_datatype\fP, MPI_Op \fIop\fP, MPI_Win \fIwin\fP, + MPI_Request *\fIrequest\fP) + +.fi +.SH Fortran Syntax (see FORTRAN 77 NOTES) +.nf +INCLUDE 'mpif.h' +MPI_GET_ACCUMULATE(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, RESULT_ADDR, + RESULT_COUNT, RESULT_DATATYPE, TARGET_RANK, TARGET_DISP, TARGET_COUNT, + TARGET_DATATYPE, OP, WIN, IERROR\fP) + \fIORIGIN_ADDR\fP, \fIRESULT_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_COUNT, TARGET_DATATYPE, + TARGET_RANK, TARGET_COUNT, TARGET_DATATYPE, OP, WIN, IERROR \fP + +MPI_RGET_ACCUMULATE(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, RESULT_ADDR, + RESULT_COUNT, RESULT_DATATYPE, TARGET_RANK, TARGET_DISP, TARGET_COUNT, + TARGET_DATATYPE, OP, WIN, REQUEST, IERROR\fP) + \fIORIGIN_ADDR\fP, \fIRESULT_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_COUNT, TARGET_DATATYPE, + TARGET_RANK, TARGET_COUNT, TARGET_DATATYPE, OP, WIN, REQUEST, IERROR \fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +origin_addr +Initial address of buffer (choice). +.ft R +.TP 1i +origin_count +Number of entries in buffer (nonnegative integer). +.ft R +.TP 1i +origin_datatype +Data type of each buffer entry (handle). +.ft R +.TP +result_addr +Initial address of result buffer (choice). +.ft R +.TP +result_count +Number of entries in result buffer (nonnegative integer). +.ft R +.TP +result_datatype +Data type of each result buffer entry (handle). +.ft R +.TP 1i +target_rank +Rank of target (nonnegative integer). +.ft R +.TP 1i +target_disp +Displacement from start of window to beginning of target buffer (nonnegative integer). +.ft R +.TP 1i +target_count +Number of entries in target buffer (nonnegative integer). +.ft R +.TP 1i +target_datatype +Data type of each entry in target buffer (handle). +.ft R +.TP 1i +op +Reduce operation (handle). +.ft R +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +MPI_Rget_accumulate: RMA request +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +\fBMPI_Get_accumulate\fP is a function used for one-sided MPI communication that adds the contents of the origin buffer (as defined by \fIorigin_addr\fP, \fIorigin_count\fP, and \fIorigin_datatype\fP) to the buffer specified by the arguments \fItarget_count\fP and \fItarget_datatype\fP, at offset \fItarget_disp\fP, in the target window specified by \fItarget_rank\fP and \fIwin\fP, using the operation \fIop\fP. \fBMPI_Get_accumulate\fP returns in the result buffer \fIresult_addr\fP the contents of the target buffer before the accumulation. +.sp +Any of the predefined operations for MPI_Reduce, as well as MPI_NO_OP, can be used. User-defined functions cannot be used. 
For example, if \fIop\fP is MPI_SUM, each element of the origin buffer is added to the corresponding element in the target, replacing the former value in the target. +.sp +Each datatype argument must be a predefined data type or a derived data type, where all basic components are of the same predefined data type. Both datatype arguments must be constructed from the same predefined data type. The operation \fIop\fP applies to elements of that predefined type. The \fItarget_datatype\fP argument must not specify overlapping entries, and the target buffer must fit in the target window. +.sp +A new predefined operation, MPI_REPLACE, is defined. It corresponds to the associative function f(a, b) = b; that is, the current value in the target memory is replaced by the value supplied by the origin. +.sp +A new predefined operation, MPI_NO_OP, is defined. It corresponds to the associative function f(a, b) = a; that is, the current value in the target memory is returned in the result buffer at the origin and no operation is performed on the target buffer. +.sp +\fBMPI_Rget_accumulate\fP is similar to \fBMPI_Get_accumulate\fP, except that it allocates a communication request object and associates it with the request handle (the argument \fIrequest\fP) that can be used to wait or test for completion. The completion of an \fBMPI_Rget_accumulate\fP operation indicates that the data is available in the result buffer and the origin buffer is free to be updated. It does not indicate that the operation has been completed at the target window. + +.SH FORTRAN 77 NOTES +.ft R +The MPI standard prescribes portable Fortran syntax for +the \fITARGET_DISP\fP argument only for Fortran 90. FORTRAN 77 +users may use the non-portable syntax +.sp +.nf + INTEGER*MPI_ADDRESS_KIND \fITARGET_DISP\fP +.fi +.sp +where MPI_ADDRESS_KIND is a constant defined in mpif.h +and gives the length of the declared integer in bytes. + +.SH NOTES +The generic functionality of \fBMPI_Get_accumulate\fP might limit the performance of fetch-and-increment or fetch-and-add calls that might be supported by special hardware operations. MPI_Fetch_and_op thus allows for a fast implementation of a commonly used subset of the functionality of \fBMPI_Get_accumulate\fP. +.sp +MPI_Get is a special case of \fBMPI_Get_accumulate\fP, with the operation MPI_NO_OP. Note, however, that MPI_Get and \fBMPI_Get_accumulate\fP have different constraints on concurrent updates. +.sp +It is the user's responsibility to guarantee that, when +using the accumulate functions, the target displacement argument is such +that accesses to the window are properly aligned according to the data +type arguments in the call to the \fBMPI_Get_accumulate\fP function. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler +may be changed with MPI_Comm_set_errhandler; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error.
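As an illustration of the semantics described above (not part of the patch), a fetch-and-add helper of the kind mentioned in the NOTES section could look roughly like the sketch below; the helper name, counter layout, and target rank are hypothetical, and window creation and error checking are omitted.

    #include <mpi.h>

    /* Hypothetical helper: atomically add 'increment' to a long counter that
     * target_rank exposes at displacement 0 of 'win', returning the value the
     * counter held before the update.  Uses passive-target synchronization. */
    long fetch_and_add(MPI_Win win, int target_rank, long increment)
    {
        long old_value;

        MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, win);
        /* old_value receives the previous contents of the target location;
         * the increment is combined into the target with MPI_SUM. */
        MPI_Get_accumulate(&increment, 1, MPI_LONG,
                           &old_value, 1, MPI_LONG,
                           target_rank, 0, 1, MPI_LONG,
                           MPI_SUM, win);
        MPI_Win_unlock(target_rank, win);

        return old_value;
    }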
+ +.SH SEE ALSO +.ft R +.sp +MPI_Put +MPI_Get +MPI_Accumulate +MPI_Fetch_and_op +.br +MPI_Reduce diff --git a/ompi/mpi/man/man3/MPI_Put.3in b/ompi/mpi/man/man3/MPI_Put.3in index 307be6d0df..07bfa263f2 100644 --- a/ompi/mpi/man/man3/MPI_Put.3in +++ b/ompi/mpi/man/man3/MPI_Put.3in @@ -1,11 +1,11 @@ .\" -*- nroff -*- -.\" Copyright 2013 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2013-2014 Los Alamos National Security, LLC. All rights reserved. .\" Copyright 2010 Cisco Systems, Inc. All rights reserved. .\" Copyright 2006-2008 Sun Microsystems, Inc. .\" Copyright (c) 1996 Thinking Machines Corporation .TH MPI_Put 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" .SH NAME -\fBMPI_Put\fP \- Copies data from the origin memory to the target. +\fBMPI_Put\fP, \fBMPI_Rput\fP \- Copies data from the origin memory to the target. .SH SYNTAX .ft R @@ -16,6 +16,11 @@ MPI_Put(const void *\fIorigin_addr\fP, int \fIorigin_count\fP, MPI_Datatype \fIorigin_datatype\fP, int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, int \fItarget_count\fP, MPI_Datatype \fItarget_datatype\fP, MPI_Win \fIwin\fP) +MPI_Rput(const void *\fIorigin_addr\fP, int \fIorigin_count\fP, MPI_Datatype + \fIorigin_datatype\fP, int \fItarget_rank\fP, MPI_Aint \fItarget_disp\fP, + int \fItarget_count\fP, MPI_Datatype \fItarget_datatype\fP, MPI_Win \fIwin\fP, + MPI_Request *\fIrequest\fP) + .fi .SH Fortran Syntax (see FORTRAN 77 NOTES) .nf @@ -27,6 +32,13 @@ MPI_PUT(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, TARGET_COUNT, TARGET_DATATYPE, WIN, IERROR\fP +MPI_RPUT(\fIORIGIN_ADDR, ORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, + TARGET_DISP, TARGET_COUNT, TARGET_DATATYPE, WIN, REQUEST, IERROR\fP) + \fIORIGIN_ADDR\fP(*) + INTEGER(KIND=MPI_ADDRESS_KIND) \fITARGET_DISP\fP + INTEGER \fIORIGIN_COUNT, ORIGIN_DATATYPE, TARGET_RANK, TARGET_COUNT, + TARGET_DATATYPE, WIN, REQUEST, IERROR\fP + .fi .SH C++ Syntax .nf @@ -67,12 +79,15 @@ Window object used for communication (handle). .SH OUTPUT PARAMETER .ft R .TP 1i +request +MPI_Rput: RMA request +.TP 1i IERROR Fortran only: Error status (integer). .SH DESCRIPTION .ft R -MPI_Put transfers \fIorigin_count\fP successive entries of the type specified by \fIorigin_datatype\fP, starting at address \fIorigin_addr\fP on the origin node to the target node specified by the \fIwin\fP, \fItarget_rank\fP pair. The data are written in the target buffer at address \fItarget_addr\fP = \fIwindow_base\fP + \fItarget_disp\fP x \fIdisp_unit\fP, where \fIwindow_base\fP and \fIdisp_unit\fP are the base address and window displacement unit specified at window initialization, by the target process. +\fBMPI_Put\fP transfers \fIorigin_count\fP successive entries of the type specified by \fIorigin_datatype\fP, starting at address \fIorigin_addr\fP on the origin node to the target node specified by the \fIwin\fP, \fItarget_rank\fP pair. The data are written in the target buffer at address \fItarget_addr\fP = \fIwindow_base\fP + \fItarget_disp\fP x \fIdisp_unit\fP, where \fIwindow_base\fP and \fIdisp_unit\fP are the base address and window displacement unit specified at window initialization, by the target process. .sp The target buffer is specified by the arguments \fItarget_count\fP and \fItarget_datatype\fP. .sp @@ -81,6 +96,8 @@ The data transfer is the same as that which would occur if the origin process ex The communication must satisfy the same constraints as for a similar message-passing communication. 
The \fItarget_datatype\fP may not specify overlapping entries in the target buffer. The message sent must fit, without truncation, in the target buffer. Furthermore, the target buffer must fit in the target window. In addition, only processes within the group of \fIwin\fP can access the target window. .sp The \fItarget_datatype\fP argument is a handle to a datatype object defined at the origin process. However, this object is interpreted at the target process: The outcome is as if the target datatype object were defined at the target process, by the same sequence of calls used to define it at the origin process. The target data type must contain only relative displacements, not absolute addresses. The same holds for get and accumulate. +.sp +\fBMPI_Rput\fP is similar to \fBMPI_Put\fP, except that it allocates a communication request object and associates it with the request handle (the argument \fIrequest\fP). The completion of an MPI_Rput operation (i.e., after the corresponding test or wait) indicates that the sender is now free to update the locations in the \fIorigin_addr\fP buffer. It does not indicate that the data is available at the target window. If remote completion is required, \fBMPI_Win_flush\fP, \fBMPI_Win_flush_all\fP, \fBMPI_Win_unlock\fP, or \fBMPI_Win_unlock_all\fP can be used. .SH NOTES The \fItarget_datatype\fP argument is a handle to a datatype object that is defined at the origin process, even though it defines a data layout in the target process memory. This does not cause problems in a homogeneous or heterogeneous environment, as long as only portable data types are used (portable data types are defined in Section 2.4 of the MPI-2 Standard). @@ -110,6 +127,11 @@ called. By default, this error handler aborts the MPI job, except for I/O functi .ft R .sp MPI_Get +MPI_Rget .br MPI_Accumulate +MPI_Win_flush +MPI_Win_flush_all +MPI_Win_unlock +MPI_Win_unlock_all diff --git a/ompi/mpi/man/man3/MPI_Raccumulate.3in b/ompi/mpi/man/man3/MPI_Raccumulate.3in new file mode 100644 index 0000000000..d1e293cc85 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Raccumulate.3in @@ -0,0 +1 @@ +.so man3/MPI_Accumulate.3 diff --git a/ompi/mpi/man/man3/MPI_Rget.3in b/ompi/mpi/man/man3/MPI_Rget.3in new file mode 100644 index 0000000000..4b4410dd0d --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Rget.3in @@ -0,0 +1 @@ +.so man3/MPI_Get.3 diff --git a/ompi/mpi/man/man3/MPI_Rget_accumulate.3in b/ompi/mpi/man/man3/MPI_Rget_accumulate.3in new file mode 100644 index 0000000000..86db553607 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Rget_accumulate.3in @@ -0,0 +1 @@ +.so man3/MPI_Get_accumulate.3 diff --git a/ompi/mpi/man/man3/MPI_Rput.3in b/ompi/mpi/man/man3/MPI_Rput.3in new file mode 100644 index 0000000000..52e806a2e8 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Rput.3in @@ -0,0 +1 @@ +.so man3/MPI_Put.3 diff --git a/ompi/mpi/man/man3/MPI_Win_create.3in b/ompi/mpi/man/man3/MPI_Win_create.3in index d735d6c605..5ebd5c90b3 100644 --- a/ompi/mpi/man/man3/MPI_Win_create.3in +++ b/ompi/mpi/man/man3/MPI_Win_create.3in @@ -66,14 +66,33 @@ If the \fIbase\fP value used by MPI_Win_create was allocated by MPI_Alloc_mem, t .sp The displacement unit argument is provided to facilitate address arithmetic in RMA operations: the target displacement argument of an RMA operation is scaled by the factor \fIdisp_unit\fP specified by the target process, at window creation.
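To make the disp_unit scaling concrete, here is a small sketch (not part of the patch): each process exposes an array of doubles with a displacement unit of sizeof(double), so a target displacement counts elements rather than bytes. The array size, target rank, and element index are assumptions of the example, and it presumes at least two ranks in the communicator.

    #include <mpi.h>

    #define N 128

    /* Illustrative only: expose N doubles per process; disp_unit is
     * sizeof(double), so target_disp is an element index, not a byte offset. */
    void put_sixth_element(MPI_Comm comm)
    {
        double local[N] = {0.0};
        double value = 42.0;
        MPI_Win win;
        int rank;

        MPI_Comm_rank(comm, &rank);
        MPI_Win_create(local, (MPI_Aint)(N * sizeof(double)), (int)sizeof(double),
                       MPI_INFO_NULL, comm, &win);

        MPI_Win_fence(0, win);
        if (0 == rank) {
            /* Writes to window_base + 5 * disp_unit on rank 1, i.e. its local[5]. */
            MPI_Put(&value, 1, MPI_DOUBLE, 1, 5, 1, MPI_DOUBLE, win);
        }
        MPI_Win_fence(0, win);

        MPI_Win_free(&win);
    }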
.sp -The following info key is supported: +The following info keys are supported: .ft R .TP 1i no_locks -If MPI_Win_lock is called on a window created with this info key, the -call will fail. If this info key is present, it is assumed that the -local window is never locked, allowing several internal checks to be -skipped, permitting a more efficient implementation. +If set to \fItrue\fP, then the implementation may assume that the local +window is never locked (by a call to MPI_Win_lock or +MPI_Win_lock_all). Setting this info key when only active target +synchronization is used may allow the implementation to enable certain optimizations. +.sp +.TP 1i +accumulate_ordering +By default, accumulate operations from one initiator to one target on +the same window are strictly ordered. If the info key +accumulate_ordering is set to \fInone\fP, no ordering of accumulate +operations is guaranteed. The key can also be a comma-separated list of +required orderings consisting of \fIrar\fP, \fIwar\fP, \fIraw\fP, and \fIwaw\fP for +read-after-read, write-after-read, read-after-write, and +write-after-write, respectively. Looser ordering constraints are +likely to result in improved performance. +.sp +.TP 1i +accumulate_ops +If set to \fIsame_op\fP, the implementation will assume that all concurrent +accumulate calls to the same target address will use the same +operation. If set to \fIsame_op_no_op\fP, then the implementation will +assume that all concurrent accumulate calls to the same target address +will use the same operation or MPI_NO_OP. The default is \fIsame_op_no_op\fP. .sp .SH NOTES diff --git a/ompi/mpi/man/man3/MPI_Win_flush.3in b/ompi/mpi/man/man3/MPI_Win_flush.3in new file mode 100644 index 0000000000..ce47f2b313 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_flush.3in @@ -0,0 +1,61 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2007-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Win_flush 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Win_flush\fP, \fBMPI_Win_flush_all\fP \- Complete all outstanding RMA operations at both the origin and the target + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include <mpi.h> +int MPI_Win_flush (int \fIrank\fP, MPI_Win \fIwin\fP) + +int MPI_Win_flush_all (MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax +.nf +INCLUDE 'mpif.h' +MPI_WIN_FLUSH(\fIRANK, WIN, IERROR\fP) + INTEGER \fIRANK, WIN, IERROR\fP + +MPI_WIN_FLUSH_ALL(\fIWIN, IERROR\fP) + INTEGER \fIWIN, IERROR\fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +rank +Rank of target window (nonnegative integer). +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +\fBMPI_Win_flush\fP completes all outstanding RMA operations initiated by the calling process to the target rank on the specified window. The operations are completed both at the origin and at the target. \fBMPI_Win_flush_all\fP completes all outstanding RMA operations to all targets. +.sp +Can only be called from within a passive target epoch. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors.
The error handler may be changed with \fBMPI_Comm_set_errhandler\fP; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. + +.SH SEE ALSO +MPI_Win_flush_local +MPI_Win_lock +MPI_Win_lock_all +.br diff --git a/ompi/mpi/man/man3/MPI_Win_flush_all.3in b/ompi/mpi/man/man3/MPI_Win_flush_all.3in new file mode 100644 index 0000000000..b30e345a52 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_flush_all.3in @@ -0,0 +1 @@ +.so man3/MPI_Win_flush.3 diff --git a/ompi/mpi/man/man3/MPI_Win_flush_local.3in b/ompi/mpi/man/man3/MPI_Win_flush_local.3in new file mode 100644 index 0000000000..a507897e42 --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_flush_local.3in @@ -0,0 +1,61 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2007-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Win_flush_local 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Win_flush_local\fP, \fBMPI_Win_flush_local_all\fP \- Complete all outstanding RMA operations at the origin + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include <mpi.h> +int MPI_Win_flush_local (int \fIrank\fP, MPI_Win \fIwin\fP) + +int MPI_Win_flush_local_all (MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax +.nf +INCLUDE 'mpif.h' +MPI_WIN_FLUSH_LOCAL(\fIRANK, WIN, IERROR\fP) + INTEGER \fIRANK, WIN, IERROR\fP + +MPI_WIN_FLUSH_LOCAL_ALL(\fIWIN, IERROR\fP) + INTEGER \fIWIN, IERROR\fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +rank +Rank of target window (nonnegative integer). +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +\fBMPI_Win_flush_local\fP locally completes at the origin all outstanding RMA operations initiated by the calling process to the target process specified by rank on the specified window. For example, after this routine completes, the user may reuse any buffers provided to put, get, or accumulate operations. \fBMPI_Win_flush_local_all\fP locally completes at the origin all outstanding RMA operations to all targets. +.sp +Can only be called from within a passive target epoch. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler may be changed with \fBMPI_Comm_set_errhandler\fP; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. + +.SH SEE ALSO +MPI_Win_flush +MPI_Win_lock +MPI_Win_lock_all +.br diff --git a/ompi/mpi/man/man3/MPI_Win_flush_local_all.3in b/ompi/mpi/man/man3/MPI_Win_flush_local_all.3in new file mode 100644 index 0000000000..6b740a2b3a --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_flush_local_all.3in @@ -0,0 +1 @@ +.so man3/MPI_Win_flush_local.3 diff --git a/ompi/mpi/man/man3/MPI_Win_lock.3in b/ompi/mpi/man/man3/MPI_Win_lock.3in index fd3e1fb30c..4c9e922462 100644 --- a/ompi/mpi/man/man3/MPI_Win_lock.3in +++ b/ompi/mpi/man/man3/MPI_Win_lock.3in @@ -1,3 +1,5 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved.
.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. .\" Copyright 2007-2008 Sun Microsystems, Inc. .\" Copyright (c) 1996 Thinking Machines Corporation @@ -74,5 +76,6 @@ called. By default, this error handler aborts the MPI job, except for I/O functi .SH SEE ALSO MPI_Win_unlock +MPI_Win_lock_all .br diff --git a/ompi/mpi/man/man3/MPI_Win_lock_all.3in b/ompi/mpi/man/man3/MPI_Win_lock_all.3in new file mode 100644 index 0000000000..ef0d682e4b --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_lock_all.3in @@ -0,0 +1,68 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2007-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Win_lock_all 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Win_lock_all\fP \- Starts an RMA access epoch locking access to all processes in the window + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include +int MPI_Win_lock_all(int \fIassert\fP, MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax +.nf +INCLUDE 'mpif.h' +MPI_WIN_LOCK(\fIASSERT, WIN, IERROR\fP) + INTEGER \fIASSERT, WIN, IERROR\fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +assert +Program assertion (integer). +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +Starts an RMA access epoch to all processes in \fIwin\fP, with a lock type of MPI_LOCK_SHARED. During the epoch, the calling process can access the window memory on all processes in \fIwin\fP by using RMA operations. A window locked with MPI_Win_lock_all must be unlocked with MPI_Win_unlock_all. This routine is not collective — the ALL refers to a lock on all members of the group of the window. +.sp +Locks are used to protect accesses to the locked target window effected by RMA calls issued between the lock and unlock call, and to protect local load/store accesses to a locked local window executed between the lock and unlock call. +Accesses that are protected by an exclusive lock will not be concurrent at the window site with other accesses to the same window that are lock protected. Accesses that are protected by a shared lock will not be concurrent at the window site with accesses protected by an exclusive lock to the same window. +.sp +The \fIassert\fP argument is used to provide assertions on the context of the call that may be used for various optimizations. (See Section 6.4.4 of the MPI-2 Standard.) A value of \fIassert\fP = 0 is always valid. +.sp +.ft +.SH NOTES +.ft R +In a client/server environment in which clients connect to +a server and create windows that span both the client and the +server, if a client or server that has obtained a lock +on such a window and then terminates abnormally, the server or other clients +may hang in a MPI_Win_lock_all call, failing to notice that the peer MPI job +has terminated. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler may be changed with MPI_Comm_set_errhandler; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. 
+ +.SH SEE ALSO +MPI_Win_unlock_all +MPI_Win_lock +.br diff --git a/ompi/mpi/man/man3/MPI_Win_sync.3in b/ompi/mpi/man/man3/MPI_Win_sync.3in new file mode 100644 index 0000000000..280153c36f --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_sync.3in @@ -0,0 +1,45 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2007-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Win_sync 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Win_sync\fP \- Synchronize the private and public copies of the window + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include <mpi.h> +int MPI_Win_sync (MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax +.nf +INCLUDE 'mpif.h' +MPI_WIN_SYNC(\fIWIN, IERROR\fP) + INTEGER \fIWIN, IERROR\fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +win +Window object (handle). + +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +\fBMPI_Win_sync\fP synchronizes the private and public window copies of \fIwin\fP. For the purposes of synchronizing the private and public window, \fBMPI_Win_sync\fP has the effect of ending and reopening an access and exposure epoch on the window (note that it does not actually end an epoch or complete any pending MPI RMA operations). + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler may be changed with \fBMPI_Comm_set_errhandler\fP; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. diff --git a/ompi/mpi/man/man3/MPI_Win_unlock.3in b/ompi/mpi/man/man3/MPI_Win_unlock.3in index 6eb8bf1822..a73eb90f47 100644 --- a/ompi/mpi/man/man3/MPI_Win_unlock.3in +++ b/ompi/mpi/man/man3/MPI_Win_unlock.3in @@ -1,3 +1,5 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. .\" Copyright 2010 Cisco Systems, Inc. All rights reserved. .\" Copyright 2007-2008 Sun Microsystems, Inc. .\" Copyright (c) 1996 Thinking Machines Corporation @@ -55,6 +57,7 @@ called. By default, this error handler aborts the MPI job, except for I/O functi .SH SEE ALSO MPI_Win_lock +MPI_Win_unlock_all .br diff --git a/ompi/mpi/man/man3/MPI_Win_unlock_all.3in b/ompi/mpi/man/man3/MPI_Win_unlock_all.3in new file mode 100644 index 0000000000..414a36ec4e --- /dev/null +++ b/ompi/mpi/man/man3/MPI_Win_unlock_all.3in @@ -0,0 +1,52 @@ +.\" -*- nroff -*- +.\" Copyright 2014 Los Alamos National Security, LLC. All rights reserved. +.\" Copyright 2010 Cisco Systems, Inc. All rights reserved. +.\" Copyright 2007-2008 Sun Microsystems, Inc. +.\" Copyright (c) 1996 Thinking Machines Corporation +.TH MPI_Win_unlock_all 3 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.SH NAME +\fBMPI_Win_unlock_all\fP \- Completes an RMA access epoch started by a call to MPI_Win_lock_all. + +.SH SYNTAX +.ft R +.SH C Syntax +.nf +#include <mpi.h> +int MPI_Win_unlock_all(MPI_Win \fIwin\fP) + +.fi +.SH Fortran Syntax +.nf +INCLUDE 'mpif.h' +MPI_WIN_UNLOCK_ALL(\fIWIN, IERROR\fP) + INTEGER \fIWIN, IERROR\fP + +.fi +.SH INPUT PARAMETERS +.ft R +.TP 1i +win +Window object (handle).
+ +.SH OUTPUT PARAMETER +.ft R +.TP 1i +IERROR +Fortran only: Error status (integer). + +.SH DESCRIPTION +.ft R +MPI_Win_unlock_all completes an RMA access epoch started by a call to MPI_Win_lock_all. RMA operations issued during this period will have completed both at the origin and at the target when the call returns. +.sp +Locks are used to protect accesses to the locked target window effected by RMA calls issued between the lock and unlock call, and to protect local load/store accesses to a locked local window executed between the lock and unlock call. Accesses that are protected by an exclusive lock will not be concurrent at the window site with other accesses to the same window that are lock protected. Accesses that are protected by a shared lock will not be concurrent at the window site with accesses protected by an exclusive lock to the same window. + +.SH ERRORS +Almost all MPI routines return an error value; C routines as the value of the function and Fortran routines in the last argument. +.sp +Before the error value is returned, the current MPI error handler is +called. By default, this error handler aborts the MPI job, except for I/O function errors. The error handler may be changed with MPI_Comm_set_errhandler; the predefined error handler MPI_ERRORS_RETURN may be used to cause error values to be returned. Note that MPI does not guarantee that an MPI program can continue past an error. + +.SH SEE ALSO +MPI_Win_lock_all +MPI_Win_unlock +.br diff --git a/ompi/mpi/man/man3/Makefile.extra b/ompi/mpi/man/man3/Makefile.extra index 0b16fb21dc..edf56a7142 100644 --- a/ompi/mpi/man/man3/Makefile.extra +++ b/ompi/mpi/man/man3/Makefile.extra @@ -87,6 +87,7 @@ mpi_api_man_pages = \ mpi/man/man3/MPI_Comm_split.3 \ mpi/man/man3/MPI_Comm_split_type.3 \ mpi/man/man3/MPI_Comm_test_inter.3 \ + mpi/man/man3/MPI_Compare_and_swap.3 \ mpi/man/man3/MPI_Dims_create.3 \ mpi/man/man3/MPI_Dist_graph_create.3 \ mpi/man/man3/MPI_Dist_graph_create_adjacent.3 \ @@ -100,6 +101,7 @@ mpi_api_man_pages = \ mpi/man/man3/MPI_Error_string.3 \ mpi/man/man3/MPI_Exscan.3 \ mpi/man/man3/MPI_Iexscan.3 \ + mpi/man/man3/MPI_Fetch_and_op.3 \ mpi/man/man3/MPI_File_c2f.3 \ mpi/man/man3/MPI_File_call_errhandler.3 \ mpi/man/man3/MPI_File_close.3 \ @@ -165,6 +167,7 @@ mpi_api_man_pages = \ mpi/man/man3/MPI_Gatherv.3 \ mpi/man/man3/MPI_Igatherv.3 \ mpi/man/man3/MPI_Get.3 \ + mpi/man/man3/MPI_Get_accumulate.3 \ mpi/man/man3/MPI_Get_address.3 \ mpi/man/man3/MPI_Get_count.3 \ mpi/man/man3/MPI_Get_elements.3 \ @@ -249,6 +252,7 @@ mpi_api_man_pages = \ mpi/man/man3/MPI_Publish_name.3 \ mpi/man/man3/MPI_Put.3 \ mpi/man/man3/MPI_Query_thread.3 \ + mpi/man/man3/MPI_Raccumulate.3 \ mpi/man/man3/MPI_Recv.3 \ mpi/man/man3/MPI_Recv_init.3 \ mpi/man/man3/MPI_Reduce.3 \ @@ -263,6 +267,9 @@ mpi_api_man_pages = \ mpi/man/man3/MPI_Request_f2c.3 \ mpi/man/man3/MPI_Request_free.3 \ mpi/man/man3/MPI_Request_get_status.3 \ + mpi/man/man3/MPI_Rget.3 \ + mpi/man/man3/MPI_Rget_accumulate.3 \ + mpi/man/man3/MPI_Rput.3 \ mpi/man/man3/MPI_Rsend.3 \ mpi/man/man3/MPI_Rsend_init.3 \ mpi/man/man3/MPI_Scan.3 \ @@ -376,6 +383,10 @@ mpi_api_man_pages = \ mpi/man/man3/MPI_Win_delete_attr.3 \ mpi/man/man3/MPI_Win_f2c.3 \ mpi/man/man3/MPI_Win_fence.3 \ + mpi/man/man3/MPI_Win_flush.3 \ + mpi/man/man3/MPI_Win_flush_all.3 \ + mpi/man/man3/MPI_Win_flush_local.3 \ + mpi/man/man3/MPI_Win_flush_local_all.3 \ mpi/man/man3/MPI_Win_free.3 \ mpi/man/man3/MPI_Win_free_keyval.3 \ mpi/man/man3/MPI_Win_get_attr.3 \ @@ -383,13 +394,16 @@ mpi_api_man_pages = \ 
mpi/man/man3/MPI_Win_get_group.3 \ mpi/man/man3/MPI_Win_get_name.3 \ mpi/man/man3/MPI_Win_lock.3 \ + mpi/man/man3/MPI_Win_lock_all.3 \ mpi/man/man3/MPI_Win_post.3 \ mpi/man/man3/MPI_Win_set_attr.3 \ mpi/man/man3/MPI_Win_set_errhandler.3 \ mpi/man/man3/MPI_Win_set_name.3 \ mpi/man/man3/MPI_Win_start.3 \ + mpi/man/man3/MPI_Win_sync.3 \ mpi/man/man3/MPI_Win_test.3 \ mpi/man/man3/MPI_Win_unlock.3 \ + mpi/man/man3/MPI_Win_unlock_all.3 \ mpi/man/man3/MPI_Win_wait.3 \ mpi/man/man3/MPI_Wtick.3 \ mpi/man/man3/MPI_Wtime.3 \ diff --git a/ompi/op/op.c b/ompi/op/op.c index 97cd708eb0..74bce2409a 100644 --- a/ompi/op/op.c +++ b/ompi/op/op.c @@ -89,6 +89,8 @@ ompi_predefined_op_t ompi_mpi_op_minloc; ompi_predefined_op_t *ompi_mpi_op_minloc_addr = &ompi_mpi_op_minloc; ompi_predefined_op_t ompi_mpi_op_replace; ompi_predefined_op_t *ompi_mpi_op_replace_addr = &ompi_mpi_op_replace; +ompi_predefined_op_t ompi_mpi_op_no_op; +ompi_predefined_op_t *ompi_mpi_op_no_op_addr = &ompi_mpi_op_no_op; /* * Map from ddt->id to position in op function pointer array @@ -255,7 +257,10 @@ int ompi_op_init(void) FLAGS, "MPI_MINLOC") || OMPI_SUCCESS != add_intrinsic(&ompi_mpi_op_replace.op, OMPI_OP_BASE_FORTRAN_REPLACE, - FLAGS, "MPI_REPLACE")) { + FLAGS, "MPI_REPLACE") || + OMPI_SUCCESS != + add_intrinsic(&ompi_mpi_op_no_op.op, OMPI_OP_BASE_FORTRAN_NO_OP, + FLAGS, "MPI_NO_OP")) { return OMPI_ERROR; }else{ /* This code is placed back here to support @@ -289,6 +294,7 @@ int ompi_op_init(void) int ompi_op_finalize(void) { /* clean up the intrinsic ops */ + OBJ_DESTRUCT(&ompi_mpi_op_no_op); OBJ_DESTRUCT(&ompi_mpi_op_replace); OBJ_DESTRUCT(&ompi_mpi_op_minloc); OBJ_DESTRUCT(&ompi_mpi_op_maxloc); @@ -419,7 +425,8 @@ static int add_intrinsic(ompi_op_t *op, int fort_handle, int flags, pointers (except for NULL and REPLACE, which don't get components) */ if (OMPI_OP_BASE_FORTRAN_NULL != op->o_f_to_c_index && - OMPI_OP_BASE_FORTRAN_REPLACE != op->o_f_to_c_index) { + OMPI_OP_BASE_FORTRAN_REPLACE != op->o_f_to_c_index && + OMPI_OP_BASE_FORTRAN_NO_OP != op->o_f_to_c_index) { return ompi_op_base_op_select(op); } else { return OMPI_SUCCESS; diff --git a/ompi/op/op.h b/ompi/op/op.h index b6e2b84d74..979384e2a6 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -322,6 +322,11 @@ OMPI_DECLSPEC extern ompi_predefined_op_t *ompi_mpi_op_minloc_addr; OMPI_DECLSPEC extern ompi_predefined_op_t ompi_mpi_op_replace; OMPI_DECLSPEC extern ompi_predefined_op_t *ompi_mpi_op_replace_addr; +/** + * Global variable for MPI_NO_OP + */ +OMPI_DECLSPEC extern ompi_predefined_op_t ompi_mpi_op_no_op; + /** * Table for Fortran <-> C op handle conversion diff --git a/ompi/win/win.c b/ompi/win/win.c index f4fb89bdb8..5fca9ed69f 100644 --- a/ompi/win/win.c +++ b/ompi/win/win.c @@ -80,6 +80,61 @@ ompi_win_finalize(void) return OMPI_SUCCESS; } +static ompi_win_t * +alloc_window(struct ompi_communicator_t *comm) +{ + ompi_win_t *win; + ompi_group_t *group; + + /* create the object */ + win = OBJ_NEW(ompi_win_t); + if (NULL == win) return NULL; + + /* setup data that is independent of osc component */ + group = comm->c_local_group; + OBJ_RETAIN(group); + ompi_group_increment_proc_count(group); + win->w_group = group; + + return win; +} + +static int +config_window(void *base, size_t size, int disp_unit, + int flavor, int model, ompi_win_t *win) +{ + int ret; + + ret = ompi_attr_set_c(WIN_ATTR, win, &win->w_keyhash, + MPI_WIN_BASE, base, true); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_attr_set_fortran_mpi2(WIN_ATTR, win, + &win->w_keyhash, + MPI_WIN_SIZE, size, 
true); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_attr_set_fortran_mpi2(WIN_ATTR, win, + &win->w_keyhash, + MPI_WIN_DISP_UNIT, disp_unit, + true); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_attr_set_fortran_mpi2(WIN_ATTR, win, + &win->w_keyhash, + MPI_WIN_CREATE_FLAVOR, flavor, true); + if (OMPI_SUCCESS != ret) return ret; + + ret = ompi_attr_set_fortran_mpi2(WIN_ATTR, win, + &win->w_keyhash, + MPI_WIN_MODEL, model, true); + if (OMPI_SUCCESS != ret) return ret; + + win->w_f_to_c_index = opal_pointer_array_add(&ompi_mpi_windows, win); + if (-1 == win->w_f_to_c_index) return OMPI_ERR_OUT_OF_RESOURCE; + + return OMPI_SUCCESS; +} int ompi_win_create(void *base, size_t size, @@ -88,58 +143,112 @@ ompi_win_create(void *base, size_t size, ompi_win_t** newwin) { ompi_win_t *win; - ompi_group_t *group; + int model; int ret; - /* create the object */ - win = OBJ_NEW(ompi_win_t); - if (NULL == win) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + win = alloc_window(comm); + if (NULL == win) return OMPI_ERR_OUT_OF_RESOURCE; - /* setup data that is independent of osc component */ - group = comm->c_local_group; - OBJ_RETAIN(group); - ompi_group_increment_proc_count(group); - win->w_group = group; - - win->w_baseptr = base; - win->w_size = size; - win->w_disp_unit = disp_unit; - - /* Fill in required attributes */ - ret = ompi_attr_set_c(WIN_ATTR, win, &win->w_keyhash, - MPI_WIN_BASE, win->w_baseptr, true); - if (OMPI_SUCCESS != ret) { - OBJ_RELEASE(win); - return ret; - } - ret = ompi_attr_set_fortran_mpi2(WIN_ATTR, win, - &win->w_keyhash, - MPI_WIN_SIZE, win->w_size, true); - if (OMPI_SUCCESS != ret) { - OBJ_RELEASE(win); - return ret; - } - ret = ompi_attr_set_fortran_mpi2(WIN_ATTR, win, - &win->w_keyhash, - MPI_WIN_DISP_UNIT, win->w_disp_unit, - true); + ret = ompi_osc_base_select(win, &base, size, disp_unit, comm, info, MPI_WIN_FLAVOR_CREATE, &model); if (OMPI_SUCCESS != ret) { OBJ_RELEASE(win); return ret; } - /* create backend onesided module for this window */ - ret = ompi_osc_base_select(win, (ompi_info_t*) info, comm); + ret = config_window(base, size, disp_unit, MPI_WIN_FLAVOR_CREATE, model, win); if (OMPI_SUCCESS != ret) { OBJ_RELEASE(win); return ret; } - /* fill in Fortran index */ - win->w_f_to_c_index = opal_pointer_array_add(&ompi_mpi_windows, win); - if (-1 == win->w_f_to_c_index) { - ompi_win_free(win); - return OMPI_ERR_OUT_OF_RESOURCE; + *newwin = win; + + return OMPI_SUCCESS; +} + + +int +ompi_win_allocate(size_t size, int disp_unit, ompi_info_t *info, + ompi_communicator_t *comm, void *baseptr, ompi_win_t **newwin) +{ + ompi_win_t *win; + int model; + int ret; + void *base; + + win = alloc_window(comm); + if (NULL == win) return OMPI_ERR_OUT_OF_RESOURCE; + + ret = ompi_osc_base_select(win, &base, size, disp_unit, comm, info, MPI_WIN_FLAVOR_ALLOCATE, &model); + if (OMPI_SUCCESS != ret) { + OBJ_RELEASE(win); + return ret; + } + + ret = config_window(base, size, disp_unit, MPI_WIN_FLAVOR_ALLOCATE, model, win); + if (OMPI_SUCCESS != ret) { + OBJ_RELEASE(win); + return ret; + } + + *((void**) baseptr) = base; + *newwin = win; + + return OMPI_SUCCESS; +} + + +int +ompi_win_allocate_shared(size_t size, int disp_unit, ompi_info_t *info, + ompi_communicator_t *comm, void *baseptr, ompi_win_t **newwin) +{ + ompi_win_t *win; + int model; + int ret; + void *base; + + win = alloc_window(comm); + if (NULL == win) return OMPI_ERR_OUT_OF_RESOURCE; + + ret = ompi_osc_base_select(win, &base, size, disp_unit, comm, info, MPI_WIN_FLAVOR_SHARED, &model); + if (OMPI_SUCCESS != ret) { + 
OBJ_RELEASE(win); + return ret; + } + + ret = config_window(base, size, disp_unit, MPI_WIN_FLAVOR_SHARED, model, win); + if (OMPI_SUCCESS != ret) { + OBJ_RELEASE(win); + return ret; + } + + *((void**) baseptr) = base; + *newwin = win; + + return OMPI_SUCCESS; +} + + +int +ompi_win_create_dynamic(ompi_info_t *info, ompi_communicator_t *comm, ompi_win_t **newwin) +{ + ompi_win_t *win; + int model; + int ret; + + win = alloc_window(comm); + if (NULL == win) return OMPI_ERR_OUT_OF_RESOURCE; + + ret = ompi_osc_base_select(win, MPI_BOTTOM, 0, 1, comm, info, MPI_WIN_FLAVOR_DYNAMIC, &model); + if (OMPI_SUCCESS != ret) { + OBJ_RELEASE(win); + return ret; + } + + ret = config_window(MPI_BOTTOM, 0, 1, MPI_WIN_FLAVOR_DYNAMIC, model, win); + if (OMPI_SUCCESS != ret) { + OBJ_RELEASE(win); + return ret; } *newwin = win; @@ -217,11 +326,7 @@ ompi_win_construct(ompi_win_t *win) win->error_handler = &ompi_mpi_errors_are_fatal.eh; win->errhandler_type = OMPI_ERRHANDLER_TYPE_WIN; - win->w_disp_unit = 0; win->w_flags = 0; - win->w_mode = 0; - win->w_baseptr = NULL; - win->w_size = 0; win->w_osc_module = NULL; } diff --git a/ompi/win/win.h b/ompi/win/win.h index aa5f33acc6..7a8baec0d1 100644 --- a/ompi/win/win.h +++ b/ompi/win/win.h @@ -42,14 +42,6 @@ BEGIN_C_DECLS #define OMPI_WIN_INVALID 0x00000002 #define OMPI_WIN_NO_LOCKS 0x00000004 -/* mode */ -#define OMPI_WIN_ACCESS_EPOCH 0x00000001 -#define OMPI_WIN_EXPOSE_EPOCH 0x00000002 -#define OMPI_WIN_FENCE 0x00000010 -#define OMPI_WIN_POSTED 0x00000020 -#define OMPI_WIN_STARTED 0x00000040 -#define OMPI_WIN_LOCK_ACCESS 0x00000080 - OMPI_DECLSPEC extern opal_pointer_array_t ompi_mpi_windows; struct ompi_win_t { @@ -77,17 +69,6 @@ struct ompi_win_t { ompi_errhandler_t *error_handler; ompi_errhandler_type_t errhandler_type; - /* displacement factor */ - int w_disp_unit; - - void *w_baseptr; - size_t w_size; - - /** Current epoch / mode (access, expose, lock, etc.). Checked by - the argument checking code in the MPI layer, set by the OSC - component. Modified without locking w_lock. */ - volatile uint16_t w_mode; - /* one sided interface */ ompi_osc_base_module_t *w_osc_module; }; @@ -116,6 +97,11 @@ int ompi_win_finalize(void); int ompi_win_create(void *base, size_t size, int disp_unit, ompi_communicator_t *comm, ompi_info_t *info, ompi_win_t **newwin); +int ompi_win_allocate(size_t size, int disp_unit, ompi_info_t *info, + ompi_communicator_t *comm, void *baseptr, ompi_win_t **newwin); +int ompi_win_allocate_shared(size_t size, int disp_unit, ompi_info_t *info, + ompi_communicator_t *comm, void *baseptr, ompi_win_t **newwin); +int ompi_win_create_dynamic(ompi_info_t *info, ompi_communicator_t *comm, ompi_win_t **newwin); int ompi_win_free(ompi_win_t *win); @@ -151,48 +137,5 @@ static inline bool ompi_win_allow_locks(ompi_win_t *win) { return (0 == (win->w_flags & OMPI_WIN_NO_LOCKS)); } -static inline int16_t ompi_win_get_mode(ompi_win_t *win) { - int16_t mode = win->w_mode; - opal_atomic_rmb(); - return mode; -} - -static inline void ompi_win_set_mode(ompi_win_t *win, int16_t mode) { - win->w_mode = mode; - opal_atomic_wmb(); -} - -static inline void ompi_win_append_mode(ompi_win_t *win, int16_t mode) { - win->w_mode |= mode; - opal_atomic_wmb(); -} - -static inline void ompi_win_remove_mode(ompi_win_t *win, - int16_t mode) -{ - win->w_mode &= ~mode; - opal_atomic_wmb(); -} - -/* already in an access epoch */ -static inline bool ompi_win_access_epoch(ompi_win_t *win) { - int16_t mode = ompi_win_get_mode(win); - return (0 != (OMPI_WIN_ACCESS_EPOCH & mode) ? 
true : false); -} - -/* already in an exposure epoch */ -static inline bool ompi_win_exposure_epoch(ompi_win_t *win) { - int16_t mode = ompi_win_get_mode(win); - return (0 != (OMPI_WIN_EXPOSE_EPOCH & mode) ? true : false); -} - -/* we're either already in an access epoch or can easily start one - (stupid fence rule). Either way, it's ok to be the origin of a - communication call. */ -static inline bool ompi_win_comm_allowed(ompi_win_t *win) { - int16_t mode = ompi_win_get_mode(win); - return (0 != (OMPI_WIN_ACCESS_EPOCH & mode || OMPI_WIN_FENCE & mode) ? true : false); -} - END_C_DECLS #endif