1
1
openmpi/oshmem/mca/scoll/basic/scoll_basic_collect.c
Mike Dubman 982149d8c8 OSHMEM: various fixes
1. fix in oshmem scoll component: basic algorithms should
   call basic collectives since their implementation is
   incompatible with others (fca, hcoll).

2. Set OPAL_EVLOOP_ONCE flag ON for libevent in the case
   of yoda spml. Otherwise there is a possible deadlock in the
   atomic_basic_lock call

fixed by Val, Igor, reviewed by Miked

cmr=v1.7.5:reviewer=ompi-rm1.7

This commit was SVN r30762.
2014-02-18 15:07:03 +00:00

627 строки
22 KiB
C

/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "oshmem_config.h"
#include <stdio.h>
#include <stdlib.h>
#include "orte/mca/grpcomm/grpcomm.h"
#include "oshmem/constants.h"
#include "oshmem/mca/spml/spml.h"
#include "oshmem/mca/scoll/scoll.h"
#include "oshmem/mca/scoll/base/base.h"
#include "scoll_basic.h"
/*
 * Forward declarations of the file-local collect algorithms.
 * They all take the same arguments: the group, the target and source
 * buffers, the contribution size nlong (in bytes) and the pSync work array.
 * mca_scoll_basic_collect() dispatches to one of the "_f_" variants when its
 * nlong_type argument is true and to _algorithm_central_collector() otherwise.
 */
/* Variable-size collect: the root gathers sizes, then data, then broadcasts. */
static int _algorithm_central_collector(struct oshmem_group_t *group,
void *target,
const void *source,
size_t nlong,
long *pSync);
/* Identical-size collect: the root gets every block, then broadcasts. */
static int _algorithm_f_central_counter(struct oshmem_group_t *group,
void *target,
const void *source,
size_t nlong,
long *pSync);
/* Identical-size collect: pairwise "tournament" towards group index 0. */
static int _algorithm_f_tournament(struct oshmem_group_t *group,
void *target,
const void *source,
size_t nlong,
long *pSync);
/* Identical-size collect: pairwise exchange with doubling block ranges. */
static int _algorithm_f_recursive_doubling(struct oshmem_group_t *group,
void *target,
const void *source,
size_t nlong,
long *pSync);
/* Identical-size collect: blocks are passed to the next PE in the group. */
static int _algorithm_f_ring(struct oshmem_group_t *group,
void *target,
const void *source,
size_t nlong,
long *pSync);
/**
 * Entry point of the basic collect: concatenates the contributions of all
 * PEs of the group into target.
 *
 * @param group       Active set of PEs; must not be NULL.
 * @param target      Buffer that receives the concatenated blocks.
 * @param source      Contribution of the calling PE.
 * @param nlong       Size of the calling PE's contribution in bytes.
 * @param pSync       Synchronization work array; must not be NULL when the
 *                    calling PE is a member of the group, because every
 *                    algorithm below reads and writes it.
 * @param nlong_type  true  - all PEs contribute the same size and alg selects
 *                            one of the identical-size algorithms;
 *                    false - sizes may differ, the central collector is used.
 * @param alg         Algorithm id; SCOLL_DEFAULT_ALG is replaced by
 *                    mca_scoll_basic_param_collect_algorithm, unknown ids
 *                    fall back to the central counter.
 *
 * @return OSHMEM_SUCCESS (also when the caller is not a member of the group),
 *         OSHMEM_ERR_BAD_PARAM for a NULL group or NULL pSync, otherwise the
 *         status reported by the selected algorithm.
 */
int mca_scoll_basic_collect(struct oshmem_group_t *group,
                            void *target,
                            const void *source,
                            size_t nlong,
                            long *pSync,
                            bool nlong_type,
                            int alg)
{
    int rc = OSHMEM_SUCCESS;
    int i = 0;

    /* Arguments validation */
    if (!group) {
        SCOLL_ERROR("Active set (group) of PE is not defined");
        return OSHMEM_ERR_BAD_PARAM;
    }

    /* A PE that is not part of the group has nothing to do */
    if (!oshmem_proc_group_is_member(group)) {
        return OSHMEM_SUCCESS;
    }

    /*
     * All algorithms dereference pSync unconditionally, so reject a missing
     * work array here instead of crashing inside one of them.
     */
    if (!pSync) {
        SCOLL_ERROR("Synchronization work array (pSync) is not defined");
        return OSHMEM_ERR_BAD_PARAM;
    }

    if (nlong_type) {
        alg = (alg == SCOLL_DEFAULT_ALG ?
               mca_scoll_basic_param_collect_algorithm : alg);
        switch (alg) {
        case SCOLL_ALG_COLLECT_TOURNAMENT:
            rc = _algorithm_f_tournament(group, target, source, nlong, pSync);
            break;
        case SCOLL_ALG_COLLECT_RECURSIVE_DOUBLING:
            rc = _algorithm_f_recursive_doubling(group,
                                                 target,
                                                 source,
                                                 nlong,
                                                 pSync);
            break;
        case SCOLL_ALG_COLLECT_RING:
            rc = _algorithm_f_ring(group, target, source, nlong, pSync);
            break;
        case SCOLL_ALG_COLLECT_CENTRAL_COUNTER:
        default:
            /* Unknown ids use the central counter as well */
            rc = _algorithm_f_central_counter(group,
                                              target,
                                              source,
                                              nlong,
                                              pSync);
            break;
        }
    } else {
        rc = _algorithm_central_collector(group, target, source, nlong, pSync);
    }

    /* Restore initial values, regardless of the algorithm's status */
    SCOLL_VERBOSE(12,
                  "[#%d] Restore special synchronization array",
                  group->my_pe);
    for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++) {
        pSync[i] = _SHMEM_SYNC_VALUE;
    }

    return rc;
}
/*
This algorithm is quite simple and straightforward for PEs with identical data size.
One node gathers data from its peers and sends the final result back to them.
Outlay:
NP-1 competing network transfers are needed.
*/
/*
 * Identical-size collect, "Central Counter" flavour.
 *
 * The PE at group index 0 acts as root: it copies its own block to the start
 * of target, fetches the block of every other PE into slot <group index> of
 * target and finally broadcasts the whole target (proc_count * nlong bytes)
 * to the group using (pSync + 1) as the broadcast work array.
 *
 * Returns OSHMEM_SUCCESS or the status of the failing get/broadcast.
 */
static int _algorithm_f_central_counter(struct oshmem_group_t *group,
                                        void *target,
                                        const void *source,
                                        size_t nlong,
                                        long *pSync)
{
    int status = OSHMEM_SUCCESS;
    int idx = 0;
    int root_pe = oshmem_proc_pe(group->proc_array[0]);
    unsigned char *out = (unsigned char *) target;

    SCOLL_VERBOSE(12,
                  "[#%d] Collect algorithm: Central Counter (identical size)",
                  group->my_pe);
    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    if (root_pe == group->my_pe) {
        /* The root's own contribution occupies the first slot */
        memcpy((void *) out, (void *) source, nlong);

        SCOLL_VERBOSE(14,
                      "[#%d] Gather data from all PEs in the group",
                      group->my_pe);

        for (idx = 0; idx < group->proc_count; idx++) {
            int remote_pe;

            /* Stop gathering as soon as one transfer has failed */
            if (status != OSHMEM_SUCCESS) {
                break;
            }

            remote_pe = oshmem_proc_pe(group->proc_array[idx]);
            if (remote_pe != group->my_pe) {
                SCOLL_VERBOSE(14,
                              "[#%d] Gather data (%d bytes) from #%d",
                              group->my_pe, (int)nlong, remote_pe);

                /* Fetch the peer's block into its slot of target */
                status = MCA_SPML_CALL(get((void *)source, nlong, (void*)(out + idx * nlong), remote_pe));
            }
        }
    }

    /* Distribute the assembled result to every PE of the group */
    if (status == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      group->my_pe, root_pe);
        status = BCAST_FUNC(group,
                            root_pe,
                            target,
                            target,
                            group->proc_count * nlong,
                            (pSync + 1),
                            SCOLL_DEFAULT_ALG);
    }

    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    return status;
}
/*
 * Identical-size collect, "Tournament" flavour.
 *
 * In round r every still-active PE is paired with the PE whose group index
 * differs in bit r.  The PE with the lower index ("winner") publishes the
 * expected peer index in its pSync[0] and waits; the PE with the higher index
 * ("loser") polls the winner's pSync[0] until it sees its own index, pushes
 * the blocks it has accumulated so far into the winner's target, signals the
 * winner and then leaves the tournament, waiting for SHMEM_SYNC_RUN.
 * Group index 0 ends up with all blocks, releases the other PEs by writing
 * SHMEM_SYNC_RUN into their pSync[0], and the result is broadcast from the PE
 * at group index 0 using (pSync + 1) as the broadcast work array.
 *
 * group  - active set descriptor (proc_array, proc_count, my_pe are read)
 * target - receives proc_count blocks of nlong bytes, ordered by group index
 * source - block contributed by this PE
 * nlong  - block size in bytes
 * pSync  - pSync[0] is the signalling word of the tournament
 *
 * Returns the status of the most recent SPML/broadcast call that set rc.
 * NOTE(review): the status of the data put is overwritten by the following
 * signal put, and the polling get ignores its status; this is kept as in the
 * original protocol.
 */
static int _algorithm_f_tournament(struct oshmem_group_t *group,
                                   void *target,
                                   const void *source,
                                   size_t nlong,
                                   long *pSync)
{
    int rc = OSHMEM_SUCCESS;
    int round = 0;
    int exit_flag = group->proc_count - 1;
    long value = SHMEM_SYNC_INIT;
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    int blocks = 0;
    int PE_root = oshmem_proc_pe(group->proc_array[0]);

    SCOLL_VERBOSE(12,
                  "[#%d] Collect algorithm: Tournament (identical size)",
                  group->my_pe);
    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    /* Set current state as WAIT */
    pSync[0] = SHMEM_SYNC_WAIT;

    /* Copy data to itself */
    memcpy((void*) ((unsigned char*) target + my_id * nlong),
           (void *) source,
           nlong);

    while (exit_flag && (rc == OSHMEM_SUCCESS)) {
        /* Define a peer for competition */
        peer_id = my_id ^ (1 << round);

        /* Update exit condition and round counter */
        exit_flag >>= 1;
        round++;

        /* Do not have peer for tournament */
        if (peer_id >= group->proc_count)
            continue;

        if (my_id < peer_id) {
            /* Winner: announce the expected peer and wait for its signal */
            pSync[0] = peer_id;
            value = my_id;

            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
        } else {
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            /*
             * It is ugly implementation of compare and swap operation.
             * Usage of this hack does not give performance improvement but
             * it is expected that shmem_long_cswap() will make it faster.
             */
            do {
                MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            } while (value != my_id);

            /*
             * This PE has accumulated the blocks starting at my_id that
             * belong to its sub-tree of 2^(round-1) group indices.  When
             * proc_count is not a power of two the sub-tree is incomplete,
             * so never transfer blocks beyond the end of the group: that
             * would read and write past proc_count * nlong bytes of target.
             */
            blocks = 1 << (round - 1);
            if (blocks > (group->proc_count - my_id)) {
                blocks = group->proc_count - my_id;
            }

            SCOLL_VERBOSE(14,
                          "[#%d] round = %d send data to #%d",
                          group->my_pe, round, peer_pe);
            rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + my_id * nlong), (size_t)blocks * nlong, (void*)((unsigned char*)target + my_id * nlong), peer_pe));

            /* Order the data put before the signal put */
            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14,
                          "[#%d] round = %d signals to #%d",
                          group->my_pe, round, peer_pe);
            value = peer_id;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));

            /* Loser: stay out of further rounds until released by index 0 */
            SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
            value = SHMEM_SYNC_RUN;
            rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));

            break;
        }
    }

    /* Group index 0 releases all PEs waiting for SHMEM_SYNC_RUN */
    if ((my_id == 0) && (rc == OSHMEM_SUCCESS)) {
        SCOLL_VERBOSE(14, "[#%d] signals to all", group->my_pe);

        value = SHMEM_SYNC_RUN;
        for (peer_id = 1;
             (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS);
             peer_id++) {
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
        }
    }

    /* Send result to all PE in group */
    if (rc == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      group->my_pe, PE_root);
        rc = BCAST_FUNC(group,
                        PE_root,
                        target,
                        target,
                        group->proc_count * nlong,
                        (pSync + 1),
                        SCOLL_DEFAULT_ALG);
    }

    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    return rc;
}
/*
 * Identical-size collect, "Ring" flavour.
 *
 * Every PE stores its own block in slot <group index> of target and then
 * performs proc_count - 1 steps.  In each step it pushes the block in the
 * current slot to the next PE of the group (index + 1, wrapping around),
 * writes the step number into that PE's pSync[0] as a signal, moves to the
 * previous slot (wrapping around) and waits on its own pSync[0] for the
 * corresponding signal from its predecessor.
 *
 * Returns the status of the most recent SPML put/wait that was assigned.
 */
static int _algorithm_f_ring(struct oshmem_group_t *group,
                             void *target,
                             const void *source,
                             size_t nlong,
                             long *pSync)
{
    int status = OSHMEM_SUCCESS;
    int step = 0;
    long flag = SHMEM_SYNC_INIT;
    int self_id = oshmem_proc_group_find_id(group, group->my_pe);
    int slot = 0;
    int next_id = 0;
    int next_pe = 0;
    int cmp_op = 0;
    unsigned char *base = (unsigned char *) target;
    unsigned char *chunk = NULL;

    SCOLL_VERBOSE(12,
                  "[#%d] Collect algorithm: Ring (identical size)",
                  group->my_pe);
    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    /* Successor in the ring */
    next_id = (self_id + 1) % group->proc_count;
    next_pe = oshmem_proc_pe(group->proc_array[next_id]);

    /* Own contribution goes into the slot matching the group index */
    memcpy((void *) (base + self_id * nlong), (void *) source, nlong);
    slot = self_id;

    step = 0;
    while ((step < (group->proc_count - 1)) && (status == OSHMEM_SUCCESS)) {
        chunk = base + slot * nlong;

        SCOLL_VERBOSE(14,
                      "[#%d] round = %d send data to #%d by index = %d",
                      group->my_pe, step, next_pe, slot);
        status = MCA_SPML_CALL(put((void*)chunk, nlong, (void*)chunk, next_pe));

        /* Order the data put before the signal put */
        MCA_SPML_CALL(fence());

        SCOLL_VERBOSE(14,
                      "[#%d] round = %d signals to #%d",
                      group->my_pe, step, next_pe);
        flag = step;
        status = MCA_SPML_CALL(put((void*)pSync, sizeof(flag), (void*)&flag, next_pe));

        /* The block expected next belongs to the previous slot */
        if (slot) {
            slot = slot - 1;
        } else {
            slot = group->proc_count - 1;
        }

        SCOLL_VERBOSE(14,
                      "[#%d] round = %d wait for data by index = %d",
                      group->my_pe, step, slot);

        /*
         * First step: any change from the initial pSync value counts.
         * Later steps: wait until the signalled step number reaches ours.
         */
        if (step == 0) {
            flag = _SHMEM_SYNC_VALUE;
            cmp_op = SHMEM_CMP_NE;
        } else {
            flag = step;
            cmp_op = SHMEM_CMP_GE;
        }
        status = MCA_SPML_CALL(wait((void*)pSync, cmp_op, (void*)&flag, SHMEM_LONG));

        step++;
    }

    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

    return status;
}
/*
 * Identical-size collect, "Recursive Doubling" flavour.
 *
 * floor2_proc is the largest power of two not exceeding group->proc_count.
 * PEs with a group index below floor2_proc (the "basic group") exchange the
 * blocks they have accumulated pairwise, doubling the transferred amount in
 * every round.  PEs with an index >= floor2_proc (the "extra group") send
 * their own block directly to every other PE, signal their partner
 * (my_id - floor2_proc) in the basic group and wait until that partner sends
 * the complete target back.
 *
 * group  - active set descriptor (proc_array, proc_count, my_pe are read)
 * target - receives proc_count blocks of nlong bytes, ordered by group index
 * source - block contributed by this PE
 * nlong  - block size in bytes
 * pSync  - pSync[0] is used as the signalling word
 *
 * Returns rc, i.e. OSHMEM_SUCCESS or the status of the most recent SPML
 * put/wait that was assigned to it.
 * NOTE(review): several assignments to rc overwrite an earlier result (for
 * example the data put followed by the signal put), so an earlier failure can
 * be masked; the polling get ignores its status entirely.
 */
static int _algorithm_f_recursive_doubling(struct oshmem_group_t *group,
void *target,
const void *source,
size_t nlong,
long *pSync)
{
int rc = OSHMEM_SUCCESS;
int round = 0;
int floor2_proc = 0;
int exit_flag = 0;
long value = SHMEM_SYNC_INIT;
int my_id = oshmem_proc_group_find_id(group, group->my_pe);
int data_index = 0;
int peer_id = 0;
int peer_pe = 0;
int i = 0;
/* floor2_proc = largest power of two that is <= group->proc_count */
floor2_proc = 1;
i = group->proc_count;
i >>= 1;
while (i) {
i >>= 1;
floor2_proc <<= 1;
}
SCOLL_VERBOSE(12,
"[#%d] Collect algorithm: Recursive Doubling (identical size)",
group->my_pe);
SCOLL_VERBOSE(15,
"[#%d] pSync[0] = %ld floor2_proc = %d",
group->my_pe, pSync[0], floor2_proc);
/* Own contribution goes into the slot matching the group index */
memcpy((void*) ((unsigned char*) target + my_id * nlong),
(void *) source,
nlong);
data_index = my_id;
if (my_id >= floor2_proc) {
int pe_cur = 0;
/* I am in extra group, my partner is node (my_id-y) in basic group */
peer_id = my_id - floor2_proc;
peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
/* Send the own block into slot my_id of target on every other PE */
for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) {
if (i == my_id)
continue;
pe_cur = oshmem_proc_pe(group->proc_array[i]);
SCOLL_VERBOSE(14,
"[#%d] is extra send data to #%d",
group->my_pe, pe_cur);
rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), nlong, (void *)source, pe_cur));
}
/* Order the data puts before the signal put */
MCA_SPML_CALL(fence());
SCOLL_VERBOSE(14,
"[#%d] is extra and signal to #%d",
group->my_pe, peer_pe);
/* Tell the partner in the basic group that the block has been sent */
value = SHMEM_SYNC_RUN;
rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
/* Block until the own pSync[0] is set to SHMEM_SYNC_RUN (the partner does
   this after sending the complete target, see the end of the else branch) */
value = SHMEM_SYNC_RUN;
rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
} else {
/* Wait for a peer from extra group */
/* The condition holds exactly when group index my_id + floor2_proc exists */
if ((group->proc_count - floor2_proc) > my_id) {
/* I am in basic group, my partner is node (my_id+y) in extra group */
peer_id = my_id + floor2_proc;
peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
SCOLL_VERBOSE(14,
"[#%d] wait a signal from #%d",
group->my_pe, peer_pe);
value = SHMEM_SYNC_RUN;
rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
}
/* Pairwise exchange */
/* exit_flag loses one bit per iteration, giving log2(floor2_proc) rounds */
exit_flag = floor2_proc - 1;
/* Publish the round this PE is ready for; peers poll this value */
pSync[0] = round;
while (exit_flag && (rc == OSHMEM_SUCCESS)) {
/* Define a peer for competition */
peer_id = my_id ^ (1 << round);
/* Update exit condition and round counter */
exit_flag >>= 1;
round++;
peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
#if 1 /* It is ugly implementation of compare and swap operation
Usage of this hack does not give performance improvement but
it is expected that shmem_long_cswap() will make it faster.
*/
/* Spin until the peer's pSync[0] shows the round being processed
   (round was already incremented above, hence round - 1) */
do {
MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, peer_pe));
} while (value != (round - 1));
SCOLL_VERBOSE(14,
"[#%d] round = %d send data to #%d by index = %d",
group->my_pe, round, peer_pe, data_index);
/* Transfer 2^(round-1) blocks starting at slot data_index */
rc = MCA_SPML_CALL(put((void*)((unsigned char*)target + data_index * nlong), (1 << (round - 1)) * nlong, (void*)((unsigned char*)target + data_index * nlong), peer_pe));
/* Order the data put before the signal put */
MCA_SPML_CALL(fence());
/* Round my_id down to a multiple of 2^round: first slot of the range sent
   in the next round */
data_index = (my_id / (1 << round)) * (1 << round);
SCOLL_VERBOSE(14,
"[#%d] round = %d signals to #%d",
group->my_pe, round, peer_pe);
value = SHMEM_SYNC_RUN;
rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
#endif
SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
/* Wait for the peer's signal for this round */
value = SHMEM_SYNC_RUN;
rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
/* Publish readiness for the next round */
pSync[0] = round;
}
/* Notify a peer from extra group */
if ((group->proc_count - floor2_proc) > my_id) {
/* I am in basic group, my partner is node (my_id+y) in extra group */
peer_id = my_id + floor2_proc;
peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
SCOLL_VERBOSE(14,
"[#%d] is extra send data to #%d",
group->my_pe, peer_pe);
/* Send the complete target (all proc_count blocks) to the extra partner */
rc = MCA_SPML_CALL(put(target, group->proc_count * nlong, target, peer_pe));
MCA_SPML_CALL(fence());
SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
/* Release the extra partner, which waits for SHMEM_SYNC_RUN */
value = SHMEM_SYNC_RUN;
rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
}
}
SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);
return rc;
}
/*
This algorithm is quite simple and straightforward. It allows PEs to contribute data of different sizes.
One node gathers data from its peers and sends the final result back to them.
Outlay:
NP-1 competing network transfers are needed.
*/
/*
 * Variable-size collect through a central collector.
 *
 * Every PE publishes the size of its contribution in its own pSync[0].
 * The PE at group index 0 (the root) polls pSync[0] of every other PE until
 * it reads a value that is positive and differs from _SHMEM_SYNC_VALUE, and
 * takes that value as the peer's size.  It then copies its own block to the
 * start of target, fetches the blocks of the peers back to back in group
 * order and broadcasts the assembled data using (pSync + 1) as the broadcast
 * work array.
 *
 * group  - active set descriptor (proc_array, proc_count, my_pe are read)
 * target - receives the concatenated blocks
 * source - block contributed by this PE
 * nlong  - size of this PE's block in bytes
 * pSync  - pSync[0] carries the size, (pSync + 1) is passed to the broadcast
 *
 * Returns OSHMEM_SUCCESS, OSHMEM_ERR_OUT_OF_RESOURCE when the size table
 * cannot be allocated, or the status of the failing get/broadcast.
 *
 * NOTE(review): a peer contributing nlong == 0 never satisfies the
 * "value > 0" test, so the root would keep polling for it.
 * NOTE(review): non-root PEs pass offset == 0 as the size to BCAST_FUNC;
 * presumably the broadcast obtains the real size from the root - confirm
 * against the BCAST_FUNC implementation.
 */
static int _algorithm_central_collector(struct oshmem_group_t *group,
                                        void *target,
                                        const void *source,
                                        size_t nlong,
                                        long *pSync)
{
    int rc = OSHMEM_SUCCESS;
    size_t offset = 0;
    int i = 0;
    int PE_root = oshmem_proc_pe(group->proc_array[0]);

    SCOLL_VERBOSE(12,
                  "[#%d] Collect algorithm: Central Counter (vary size)",
                  group->my_pe);

    /* Set own data size */
    pSync[0] = nlong;

    if (PE_root == group->my_pe) {
        long value = 0;
        int pe_cur = 0;
        long wait_pe_count = group->proc_count;
        /* wait_pe_array[i] == 0 means "size of group index i not known yet" */
        size_t *wait_pe_array = calloc((size_t) wait_pe_count,
                                       sizeof(*wait_pe_array));

        if (wait_pe_array) {
            wait_pe_array[0] = nlong;
            wait_pe_count--;

            /*
             * Poll until every peer has published its size.  The status is
             * part of the loop condition: the inner loop stops on a failed
             * get without decrementing the counter, so checking only the
             * counter would spin forever after an error.
             */
            while (wait_pe_count && (rc == OSHMEM_SUCCESS)) {
                SCOLL_VERBOSE(14,
                              "[#%d] Gather data size info from all PEs in the group",
                              group->my_pe);
                for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
                     i++) {
                    if (wait_pe_array[i] == 0) {
                        pe_cur = oshmem_proc_pe(group->proc_array[i]);
                        value = 0;
                        rc = MCA_SPML_CALL(get((void*)pSync, sizeof(value), (void*)&value, pe_cur));
                        if ((rc == OSHMEM_SUCCESS)
                            && (value != _SHMEM_SYNC_VALUE)
                            && (value > 0)) {
                            wait_pe_array[i] = (size_t) value;
                            wait_pe_count--;
                            SCOLL_VERBOSE(14,
                                          "Got source data size as %d from #%d (wait list counter: %d)",
                                          (int)value, pe_cur, (int)wait_pe_count);
                        }
                    }
                }
            }

            /* The root's own block comes first */
            memcpy((void*) ((unsigned char*) target + 0 * nlong),
                   (void *) source,
                   nlong);
            offset += nlong;

            /* Append the block of every peer; skipped entirely after an error */
            for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
                 i++) {
                /* Get PE ID of a peer from the group */
                pe_cur = oshmem_proc_pe(group->proc_array[i]);

                /* Get data from the current peer */
                rc = MCA_SPML_CALL(get((void *)source, wait_pe_array[i], (void*)((unsigned char*)target + offset), pe_cur));

                SCOLL_VERBOSE(14,
                              "Got %d bytes of data from #%d (offset: %d)",
                              (int)wait_pe_array[i], pe_cur, (int)offset);

                offset += wait_pe_array[i];
            }

            free(wait_pe_array);
        } else {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        }
    }

    /* Send result to all PE in group */
    if (rc == OSHMEM_SUCCESS) {
        SCOLL_VERBOSE(14,
                      "[#%d] Broadcast from the root #%d",
                      group->my_pe, PE_root);
        rc = BCAST_FUNC(group,
                        PE_root,
                        target,
                        target,
                        offset,
                        (pSync + 1),
                        SCOLL_DEFAULT_ALG);
    }

    return rc;
}