OSHMEM: exchange mxm(ikrit) endpoints via MPI_Allgather, code cleanup, remove unused code
Refs trac:3763. This commit was SVN r30089. The following Trac tickets were found above: Ticket 3763 --> https://svn.open-mpi.org/trac/ompi/ticket/3763
Этот коммит содержится в:
родитель
e2f372ac4b
Коммит
92cf175e9e
@ -594,7 +594,7 @@ void mca_memheap_modex_recv_all(void)
|
||||
int rc;
|
||||
|
||||
if (!mca_memheap_base_key_exchange) {
|
||||
MPI_Barrier(oshmem_comm_world);
|
||||
oshmem_shmem_barrier();
|
||||
return;
|
||||
}
|
||||
|
||||
@ -622,9 +622,7 @@ void mca_memheap_modex_recv_all(void)
|
||||
oshmem_shmem_abort(-1);
|
||||
}
|
||||
|
||||
rc = MPI_Allgather(send_buffer, size, MPI_BYTE,
|
||||
rcv_buffer, size, MPI_BYTE, oshmem_comm_world);
|
||||
|
||||
rc = oshmem_shmem_allgather(send_buffer, rcv_buffer, size);
|
||||
if (MPI_SUCCESS != rc) {
|
||||
MEMHEAP_ERROR("allgather failed");
|
||||
oshmem_shmem_abort(-1);
|
||||
|
@ -365,6 +365,7 @@ int mca_spml_ikrit_del_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
{
|
||||
spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
|
||||
spml_ikrit_mxm_ep_conn_info_t my_ep_info;
|
||||
#if MXM_API < MXM_VERSION(2,0)
|
||||
mxm_conn_req_t *conn_reqs;
|
||||
int timeout;
|
||||
@ -403,15 +404,15 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
|
||||
#if MXM_API < MXM_VERSION(2,0)
|
||||
if (OSHMEM_SUCCESS
|
||||
!= spml_ikrit_get_ep_address(&ep_info[my_rank], MXM_PTL_SELF)) {
|
||||
!= spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_SELF)) {
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
if (OSHMEM_SUCCESS
|
||||
!= spml_ikrit_get_ep_address(&ep_info[my_rank], MXM_PTL_RDMA)) {
|
||||
!= spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_RDMA)) {
|
||||
return OSHMEM_ERROR;
|
||||
}
|
||||
#else
|
||||
err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, ep_info[my_rank].addr.ep_addr, &mxm_addr_len);
|
||||
err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
|
||||
if (MXM_OK != err) {
|
||||
orte_show_help("help-shmem-spml-ikrit.txt", "unable to get endpoint address", true,
|
||||
mxm_error_string(err));
|
||||
@ -421,7 +422,7 @@ int mca_spml_ikrit_add_procs(oshmem_proc_t** procs, size_t nprocs)
|
||||
|
||||
opal_progress_register(spml_ikrit_progress);
|
||||
|
||||
oshmem_shmem_exchange_allgather(ep_info,
|
||||
oshmem_shmem_allgather(&my_ep_info, ep_info,
|
||||
sizeof(spml_ikrit_mxm_ep_conn_info_t));
|
||||
|
||||
/* Get the EP connection requests for all the processes from modex */
|
||||
|
@ -346,74 +346,6 @@ oshmem_proc_t * oshmem_proc_find(const orte_process_name_t * name)
|
||||
return rproc;
|
||||
}
|
||||
|
||||
int oshmem_proc_refresh(void)
|
||||
{
|
||||
oshmem_proc_t *proc = NULL;
|
||||
opal_list_item_t *item = NULL;
|
||||
orte_vpid_t i = 0;
|
||||
int hostname_length = 0;
|
||||
|
||||
OPAL_THREAD_LOCK(&oshmem_proc_lock);
|
||||
|
||||
for (item = opal_list_get_first(&oshmem_proc_list), i = 0;
|
||||
item != opal_list_get_end(&oshmem_proc_list);
|
||||
item = opal_list_get_next(item), ++i) {
|
||||
proc = (oshmem_proc_t*) item;
|
||||
|
||||
/* Does not change: proc->proc_name.vpid */
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
|
||||
/* Make sure to clear the local flag before we set it below */
|
||||
proc->proc_flags = 0;
|
||||
|
||||
proc->proc_arch = opal_local_arch;
|
||||
oshmem_shmem_exchange_bcast(&proc->proc_arch,
|
||||
sizeof(uint32_t),
|
||||
i);
|
||||
|
||||
hostname_length = strlen(orte_process_info.nodename);
|
||||
oshmem_shmem_exchange_bcast(&hostname_length,
|
||||
sizeof(int),
|
||||
i);
|
||||
|
||||
if (proc->proc_hostname)
|
||||
free(proc->proc_hostname);
|
||||
|
||||
proc->proc_hostname = (
|
||||
i == ORTE_PROC_MY_NAME->vpid ?
|
||||
strdup(orte_process_info.nodename) :
|
||||
(char *) malloc(hostname_length));
|
||||
oshmem_shmem_exchange_bcast(proc->proc_hostname,
|
||||
hostname_length,
|
||||
i);
|
||||
|
||||
if (i == ORTE_PROC_MY_NAME->vpid) {
|
||||
oshmem_proc_local_proc = proc;
|
||||
} else {
|
||||
/* if arch is different than mine, create a new convertor for this proc */
|
||||
if (proc->proc_arch != opal_local_arch) {
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
OBJ_RELEASE(proc->proc_convertor);
|
||||
proc->proc_convertor = opal_convertor_create(proc->proc_arch, 0);
|
||||
#else
|
||||
orte_show_help("help-shmem-runtime.txt",
|
||||
"heterogeneous-support-unavailable",
|
||||
true,
|
||||
orte_process_info.nodename,
|
||||
proc->proc_hostname == NULL ?
|
||||
"<hostname unavailable>" :
|
||||
proc->proc_hostname);
|
||||
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
|
||||
return OSHMEM_ERR_NOT_SUPPORTED;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
|
||||
|
||||
return OSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
int oshmem_proc_pack(oshmem_proc_t **proclist,
|
||||
int proclistsize,
|
||||
|
@ -310,21 +310,6 @@ OSHMEM_DECLSPEC int oshmem_proc_unpack(opal_buffer_t *buf,
|
||||
int *newproclistsize,
|
||||
oshmem_proc_t ***newproclist);
|
||||
|
||||
/**
|
||||
* Refresh the OSHMEM process subsystem
|
||||
*
|
||||
* Refresh the Open SHMEM process subsystem. This function will update
|
||||
* the list of proc instances in the current pe set with
|
||||
* data from the run-time environment.
|
||||
*
|
||||
* @note This is primarily used when restarting a process and thus
|
||||
* needs to update the jobid and node name.
|
||||
*
|
||||
* @retval OSHMEM_SUCCESS System successfully refreshed
|
||||
* @retval OSHMEM_ERROR Refresh failed due to unspecified error
|
||||
*/
|
||||
OSHMEM_DECLSPEC int oshmem_proc_refresh(void);
|
||||
|
||||
static inline int oshmem_proc_pe(oshmem_proc_t *proc)
|
||||
{
|
||||
return (proc ? (int) proc->proc_name.vpid : -1);
|
||||
|
@ -10,75 +10,21 @@
|
||||
|
||||
#include "oshmem_config.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
|
||||
#include "ompi/communicator/communicator.h" /*TODO: ompi_communicator_t */
|
||||
#include "ompi/patterns/comm/coll_ops.h" /*TODO: comm_bcast_pml */
|
||||
|
||||
#include "oshmem/constants.h"
|
||||
#include "oshmem/runtime/runtime.h"
|
||||
#include "oshmem/runtime/params.h"
|
||||
|
||||
OSHMEM_DECLSPEC int oshmem_shmem_exchange_allgather(void *buf,
|
||||
int buf_size)
|
||||
int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size)
|
||||
{
|
||||
int rc = OSHMEM_SUCCESS;
|
||||
int i = 0;
|
||||
int *ranks_in_comm = NULL;
|
||||
int rc;
|
||||
|
||||
ranks_in_comm = (int *) malloc(orte_process_info.num_procs * sizeof(int));
|
||||
if (NULL == ranks_in_comm) {
|
||||
return OSHMEM_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
rc = MPI_Allgather(send_buf, elem_size, MPI_BYTE,
|
||||
rcv_buf, elem_size, MPI_BYTE, oshmem_comm_world);
|
||||
|
||||
for (i = 0; i < (int) orte_process_info.num_procs; ++i) {
|
||||
ranks_in_comm[i] = i;
|
||||
}
|
||||
void* buf_temp = malloc(buf_size);
|
||||
memcpy(buf_temp, (char*)buf + buf_size * ORTE_PROC_MY_NAME->vpid, buf_size);
|
||||
|
||||
rc = comm_allgather_pml( buf_temp,
|
||||
buf,
|
||||
buf_size,
|
||||
MPI_BYTE,
|
||||
ORTE_PROC_MY_NAME->vpid,
|
||||
orte_process_info.num_procs,
|
||||
ranks_in_comm,
|
||||
(ompi_communicator_t *) &ompi_mpi_comm_world);
|
||||
|
||||
if (ranks_in_comm)
|
||||
free(ranks_in_comm);
|
||||
if (buf_temp)
|
||||
free(buf_temp);
|
||||
return rc;
|
||||
}
|
||||
|
||||
OSHMEM_DECLSPEC int oshmem_shmem_exchange_bcast(void *buf,
|
||||
int buf_size,
|
||||
int peer)
|
||||
void oshmem_shmem_barrier(void)
|
||||
{
|
||||
int rc = OSHMEM_SUCCESS;
|
||||
int i = 0;
|
||||
int *ranks_in_comm = NULL;
|
||||
|
||||
ranks_in_comm = (int *) malloc(orte_process_info.num_procs * sizeof(int));
|
||||
if (NULL == ranks_in_comm) {
|
||||
return OSHMEM_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for (i = 0; i < (int) orte_process_info.num_procs; ++i) {
|
||||
ranks_in_comm[i] = i;
|
||||
}
|
||||
rc = comm_bcast_pml((void *) buf,
|
||||
peer,
|
||||
buf_size,
|
||||
MPI_BYTE,
|
||||
ORTE_PROC_MY_NAME->vpid,
|
||||
orte_process_info.num_procs,
|
||||
ranks_in_comm,
|
||||
(ompi_communicator_t *) &ompi_mpi_comm_world);
|
||||
if (ranks_in_comm)
|
||||
free(ranks_in_comm);
|
||||
|
||||
return rc;
|
||||
MPI_Barrier(oshmem_comm_world);
|
||||
}
|
||||
|
@ -121,11 +121,14 @@ int oshmem_shmem_finalize(void);
|
||||
OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode);
|
||||
|
||||
/**
|
||||
* Exchange initial info between processes
|
||||
* Allgather between all PEs
|
||||
*/
|
||||
OSHMEM_DECLSPEC int oshmem_shmem_exchange_allgather(void *buf, int buf_size);
|
||||
OSHMEM_DECLSPEC int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size);
|
||||
|
||||
OSHMEM_DECLSPEC int oshmem_shmem_exchange_bcast(void *buf, int buf_size, int root);
|
||||
/**
|
||||
* Barrier between all PEs
|
||||
*/
|
||||
OSHMEM_DECLSPEC void oshmem_shmem_barrier(void);
|
||||
|
||||
/**
|
||||
* Register OSHMEM specific runtime parameters
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user