1
1

grpcomm: fix bugs in grpcomm algorithms

This commit fixes multiple issues in the bruck's and recursive
doubling grpcomm algorithms. The following changes are included:

 - Use the existing bitmap implementation instead of implementing a
   new one. There were bugs in the implementation that caused an
   overrun of the bitmap array.

 - Clean up the algorithms to eliminate errors.

 - Send as little extra data as possible in the bruck's
   algorithm.

The changes were testest with various numbers of ortes varying from 1
to 4096.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-01-07 10:12:08 -07:00
родитель 13f9bb9202
Коммит fab1eca536
5 изменённых файлов: 213 добавлений и 159 удалений

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
@ -9,8 +10,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* All rights reserved. * reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -118,10 +119,10 @@ static void ccon(orte_grpcomm_coll_t *p)
{ {
p->sig = NULL; p->sig = NULL;
OBJ_CONSTRUCT(&p->bucket, opal_buffer_t); OBJ_CONSTRUCT(&p->bucket, opal_buffer_t);
OBJ_CONSTRUCT(&p->distance_mask_recv, opal_bitmap_t);
p->dmns = NULL; p->dmns = NULL;
p->ndmns = 0; p->ndmns = 0;
p->nreported = 0; p->nreported = 0;
p->distance_mask_recv = NULL;
p->cbfunc = NULL; p->cbfunc = NULL;
p->cbdata = NULL; p->cbdata = NULL;
p->buffers = NULL; p->buffers = NULL;
@ -132,13 +133,9 @@ static void cdes(orte_grpcomm_coll_t *p)
OBJ_RELEASE(p->sig); OBJ_RELEASE(p->sig);
} }
OBJ_DESTRUCT(&p->bucket); OBJ_DESTRUCT(&p->bucket);
if (NULL != p->dmns) { OBJ_DESTRUCT(&p->distance_mask_recv);
free(p->dmns); free(p->dmns);
}
free(p->buffers); free(p->buffers);
if (NULL != p->distance_mask_recv) {
free(p->distance_mask_recv);
}
} }
OBJ_CLASS_INSTANCE(orte_grpcomm_coll_t, OBJ_CLASS_INSTANCE(orte_grpcomm_coll_t,
opal_list_item_t, opal_list_item_t,

Просмотреть файл

@ -1,5 +1,5 @@
/* -*- C -*- /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
* /*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
@ -10,8 +10,8 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* All rights reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -427,27 +427,12 @@ CLEANUP:
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
void orte_grpcomm_base_mark_distance_recv(orte_grpcomm_coll_t *coll, void orte_grpcomm_base_mark_distance_recv (orte_grpcomm_coll_t *coll,
uint32_t distance) { uint32_t distance) {
uint32_t maskNumber = distance / 32; opal_bitmap_set_bit (&coll->distance_mask_recv, distance);
uint32_t bitNumber = distance % 32;
coll->distance_mask_recv[maskNumber] |= (1 << bitNumber);
return;
} }
unsigned int orte_grpcomm_base_check_distance_recv(orte_grpcomm_coll_t *coll, unsigned int orte_grpcomm_base_check_distance_recv (orte_grpcomm_coll_t *coll,
uint32_t distance) { uint32_t distance) {
uint32_t maskNumber = distance / 32; return opal_bitmap_is_set_bit (&coll->distance_mask_recv, distance);
uint32_t bitNumber = distance % 32;
if (NULL == coll->distance_mask_recv) {
return 0;
} else {
if (0 == distance) {
return 1;
}
return (((coll->distance_mask_recv[maskNumber] & (1 << bitNumber)) != 0) ? 1 : 0);
}
} }

Просмотреть файл

@ -1,10 +1,10 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2007 The Trustees of Indiana University. * Copyright (c) 2007 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* rights reserved. * reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc. * Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved. * All rights reserved.
@ -74,8 +74,6 @@ static void finalize(void)
{ {
/* cancel the recv */ /* cancel the recv */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_BRKS); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_BRKS);
return;
} }
static int allgather(orte_grpcomm_coll_t *coll, static int allgather(orte_grpcomm_coll_t *coll,
@ -84,18 +82,37 @@ static int allgather(orte_grpcomm_coll_t *coll,
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck algo employed for %d processes", "%s grpcomm:coll:bruck algo employed for %d processes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns));
/* get my own rank */
coll->my_rank = ORTE_VPID_INVALID;
for (orte_vpid_t nv = 0; nv < coll->ndmns; nv++) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
coll->my_rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == coll->my_rank) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"Peer not found"));
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
brks_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* record that we contributed */ /* record that we contributed */
coll->nreported = 1; coll->nreported = 1;
/* mark local data received */ /* mark local data received */
coll->distance_mask_recv = (uint32_t *)calloc(sizeof(uint32_t), (coll->ndmns - 1)); if (coll->ndmns > 1) {
opal_bitmap_init (&coll->distance_mask_recv, (uint32_t) log2 (coll->ndmns) + 1);
}
/* start by seeding the collection with our own data */ /* start by seeding the collection with our own data */
opal_dss.copy_payload(&coll->bucket, sendbuf); opal_dss.copy_payload(&coll->bucket, sendbuf);
/* process data */ /* process data */
brks_allgather_process_data(coll, 1); brks_allgather_process_data (coll, 0);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -118,6 +135,12 @@ static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name
OBJ_RELEASE(send_buf); OBJ_RELEASE(send_buf);
return rc; return rc;
} }
/* pack the number of daemons included in the payload */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &coll->nreported, 1, OPAL_SIZE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the data */ /* pack the data */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(send_buf, &coll->bucket))) { if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(send_buf, &coll->bucket))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -142,6 +165,43 @@ static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
static int brks_allgather_process_buffered (orte_grpcomm_coll_t *coll, uint32_t distance) {
opal_buffer_t *buffer;
size_t nreceived;
int32_t cnt = 1;
int rc;
/* check whether data for next distance is available*/
if (NULL == coll->buffers || NULL == coll->buffers[distance]) {
return 0;
}
buffer = coll->buffers[distance];
coll->buffers[distance] = NULL;
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks %u distance data found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
rc = opal_dss.unpack (buffer, &nreceived, &cnt, OPAL_SIZE);
if (OPAL_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return rc;
}
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return rc;
}
coll->nreported += nreceived;
orte_grpcomm_base_mark_distance_recv (coll, distance);
OBJ_RELEASE(buffer);
return 1;
}
static void brks_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance) { static void brks_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance) {
/* Communication step: /* Communication step:
At every step i, rank r: At every step i, rank r:
@ -149,69 +209,72 @@ static void brks_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t dist
- sends message containing all data collected so far to rank r - distance - sends message containing all data collected so far to rank r - distance
- receives message containing all data collected so far from rank (r + distance) - receives message containing all data collected so far from rank (r + distance)
*/ */
uint32_t log2ndmns = (uint32_t) log2 (coll->ndmns);
uint32_t last_round, remainder;
orte_process_name_t peer; orte_process_name_t peer;
orte_vpid_t nv, rank; orte_vpid_t nv;
int rc; int rc;
/* NTH: calculate in which round we should send the final data. this is the first
* round in which we have data from at least (coll->ndmns - (1 << log2ndmns))
* daemons. alternatively we could just send when distance reaches log2ndmns but
* that could end up sending more data than needed */
last_round = (uint32_t) ceil (log2 ((double) (coll->ndmns - (1 << log2ndmns))));
peer.jobid = ORTE_PROC_MY_NAME->jobid; peer.jobid = ORTE_PROC_MY_NAME->jobid;
/* get my own rank */ while (distance < log2ndmns) {
rank = ORTE_VPID_INVALID;
for (orte_vpid_t nv = 0; nv < coll->ndmns; nv++) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == rank) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"Peer not found"));
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
brks_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
return;
}
while (distance < coll->ndmns) {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks process distance %u)", "%s grpcomm:coll:brks process distance %u)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
/* first send my current contents */ /* first send my current contents */
nv = (coll->ndmns + rank - distance) % coll->ndmns; nv = (coll->ndmns + coll->my_rank - (1 << distance)) % coll->ndmns;
peer.vpid = coll->dmns[nv]; peer.vpid = coll->dmns[nv];
brks_allgather_send_dist(coll, &peer, distance); brks_allgather_send_dist(coll, &peer, distance);
/* check whether data for next distance is available*/ if (distance == last_round) {
if ((NULL != coll->buffers) && (coll->buffers[distance - 1] != NULL)) { /* have enough data to send the final round now */
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, nv = (coll->ndmns + coll->my_rank - (1 << log2ndmns)) % coll->ndmns;
"%s grpcomm:coll:brks %u distance data found", peer.vpid = coll->dmns[nv];
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); brks_allgather_send_dist(coll, &peer, log2ndmns);
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, coll->buffers[distance - 1]))) { }
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc); rc = brks_allgather_process_buffered (coll, distance);
if (!rc) {
break;
} else if (rc < 0) {
return; return;
} }
coll->nreported += distance;
orte_grpcomm_base_mark_distance_recv(coll, distance); ++distance;
OBJ_RELEASE(coll->buffers[distance - 1]);
coll->buffers[distance - 1] = NULL;
distance = distance << 1;
continue;
} }
break;
if (distance == log2ndmns) {
if (distance == last_round) {
/* need to send the final round now */
nv = (coll->ndmns + coll->my_rank - (1 << log2ndmns)) % coll->ndmns;
peer.vpid = coll->dmns[nv];
brks_allgather_send_dist(coll, &peer, log2ndmns);
} }
/* check if the final message is already queued */
rc = brks_allgather_process_buffered (coll, distance);
if (rc < 0) {
return;
}
}
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks reported %lu process from %lu", "%s grpcomm:coll:brks reported %lu process from %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)coll->nreported, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)coll->nreported,
(unsigned long)coll->ndmns)); (unsigned long)coll->ndmns));
/* if we are done, then complete things */ /* if we are done, then complete things. we may get data from more daemons than expected */
if (coll->nreported >= coll->ndmns){ if (coll->nreported >= coll->ndmns){
brks_finalize_coll(coll, ORTE_SUCCESS); brks_finalize_coll(coll, ORTE_SUCCESS);
} }
return;
} }
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender, static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
@ -253,28 +316,36 @@ static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
assert(0 == orte_grpcomm_base_check_distance_recv(coll, distance)); assert(0 == orte_grpcomm_base_check_distance_recv(coll, distance));
/* Check whether we can process next distance */ /* Check whether we can process next distance */
if (orte_grpcomm_base_check_distance_recv(coll, (distance >> 1))) { if (coll->nreported && (!distance || orte_grpcomm_base_check_distance_recv(coll, distance - 1))) {
size_t nreceived;
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks data from %d distance received, " "%s grpcomm:coll:brks data from %d distance received, "
"Process the next distance.", "Process the next distance.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
/* capture any provided content */ /* capture any provided content */
rc = opal_dss.unpack (buffer, &nreceived, &cnt, OPAL_SIZE);
if (OPAL_SUCCESS != rc) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) { if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc); brks_finalize_coll(coll, rc);
return; return;
} }
coll->nreported += distance; coll->nreported += nreceived;
orte_grpcomm_base_mark_distance_recv(coll, distance); orte_grpcomm_base_mark_distance_recv(coll, distance);
brks_allgather_process_data(coll, (uint32_t)(distance << 1)); brks_allgather_process_data(coll, distance + 1);
} else { } else {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks data from %d distance received, " "%s grpcomm:coll:brks data from %d distance received, "
"still waiting for data.", "still waiting for data.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
if (NULL == coll->buffers) { if (NULL == coll->buffers) {
if (NULL == (coll->buffers = (opal_buffer_t **)calloc(sizeof(opal_buffer_t *), coll->ndmns - 1))) { if (NULL == (coll->buffers = (opal_buffer_t **) calloc ((uint32_t) log2 (coll->ndmns) + 1, sizeof(opal_buffer_t *)))) {
rc = OPAL_ERR_OUT_OF_RESOURCE; rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -282,14 +353,14 @@ static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
return; return;
} }
} }
if (NULL == (coll->buffers[distance - 1] = OBJ_NEW(opal_buffer_t))) { if (NULL == (coll->buffers[distance] = OBJ_NEW(opal_buffer_t))) {
rc = OPAL_ERR_OUT_OF_RESOURCE; rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc); brks_finalize_coll(coll, rc);
return; return;
} }
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance - 1], buffer))) { if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance], buffer))) {
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc); brks_finalize_coll(coll, rc);
@ -298,8 +369,6 @@ static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
} }
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
return;
} }
static int brks_finalize_coll(orte_grpcomm_coll_t *coll, int ret) static int brks_finalize_coll(orte_grpcomm_coll_t *coll, int ret)

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
@ -42,6 +42,7 @@
#include "orte/mca/mca.h" #include "orte/mca/mca.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "opal/class/opal_bitmap.h"
#include "opal/dss/dss_types.h" #include "opal/dss/dss_types.h"
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
@ -72,11 +73,14 @@ typedef struct {
opal_buffer_t bucket; opal_buffer_t bucket;
/* participating daemons */ /* participating daemons */
orte_vpid_t *dmns; orte_vpid_t *dmns;
/** number of participating daemons */
size_t ndmns; size_t ndmns;
/** my index in the dmns array */
unsigned long my_rank;
/* number reported in */ /* number reported in */
size_t nreported; size_t nreported;
/* distance masks for receive */ /* distance masks for receive */
uint32_t *distance_mask_recv; opal_bitmap_t distance_mask_recv;
/* received buckets */ /* received buckets */
opal_buffer_t ** buffers; opal_buffer_t ** buffers;
/* callback function */ /* callback function */

Просмотреть файл

@ -1,9 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2007 The Trustees of Indiana University. * Copyright (c) 2007 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All
* rights reserved. * rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc. * Copyright (c) 2014 Mellanox Technologies, Inc.
@ -76,35 +76,55 @@ static void finalize(void)
{ {
/* cancel the recv */ /* cancel the recv */
orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_RCD); orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_RCD);
return;
} }
static int allgather(orte_grpcomm_coll_t *coll, static int allgather(orte_grpcomm_coll_t *coll,
opal_buffer_t *sendbuf) opal_buffer_t *sendbuf)
{ {
uint32_t log2ndmns;
/* check the number of involved daemons - if it is not a power of two, /* check the number of involved daemons - if it is not a power of two,
* then we cannot do it */ * then we cannot do it */
if (0 == ((coll->ndmns != 0) && !(coll->ndmns & (coll->ndmns - 1)))) { if (0 == ((coll->ndmns != 0) && !(coll->ndmns & (coll->ndmns - 1)))) {
return ORTE_ERR_TAKE_NEXT_OPTION; return ORTE_ERR_TAKE_NEXT_OPTION;
} }
log2ndmns = log2 (coll->ndmns);
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub algo employed for %d daemons", "%s grpcomm:coll:recdub algo employed for %d daemons",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns));
/* record that we contributed */
coll->nreported = 1;
/* mark local data received */ /* mark local data received */
if (coll->ndmns > 1) { if (log2ndmns) {
coll->distance_mask_recv = (uint32_t *)calloc(sizeof(uint32_t), log2(coll->ndmns)); opal_bitmap_init (&coll->distance_mask_recv, log2ndmns);
}
/* get my own rank */
coll->my_rank = ORTE_VPID_INVALID;
for (orte_vpid_t nv = 0 ; nv < coll->ndmns ; ++nv) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
coll->my_rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == coll->my_rank) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"My peer not found in daemons array"));
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rcd_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
} }
/* start by seeding the collection with our own data */ /* start by seeding the collection with our own data */
opal_dss.copy_payload(&coll->bucket, sendbuf); opal_dss.copy_payload(&coll->bucket, sendbuf);
coll->nreported = 1;
/* process data */ /* process data */
rcd_allgather_process_data(coll, 1); rcd_allgather_process_data (coll, 0);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -154,71 +174,54 @@ static void rcd_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t dista
At every step i, rank r: At every step i, rank r:
- exchanges message containing all data collected so far with rank peer = (r ^ 2^i). - exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
*/ */
uint32_t log2ndmns = log2(coll->ndmns);
orte_process_name_t peer; orte_process_name_t peer;
orte_vpid_t nv, rank; orte_vpid_t nv;
uint32_t distance_index;
int rc; int rc;
peer.jobid = ORTE_PROC_MY_NAME->jobid; peer.jobid = ORTE_PROC_MY_NAME->jobid;
/* get my own rank */ while (distance < log2ndmns) {
rank = ORTE_VPID_INVALID;
for (orte_vpid_t nv = 0; nv < coll->ndmns; nv++) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == rank) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"Peer not found"));
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rcd_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
return;
}
while (distance < coll->ndmns) {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub process distance %u", "%s grpcomm:coll:recdub process distance %u",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
/* first send my current contents */ /* first send my current contents */
nv = rank ^ distance; nv = coll->my_rank ^ (1 << distance);
assert (nv < coll->ndmns);
peer.vpid = coll->dmns[nv]; peer.vpid = coll->dmns[nv];
rcd_allgather_send_dist(coll, &peer, distance); rcd_allgather_send_dist(coll, &peer, distance);
/* check whether data for next distance is available*/ /* check whether data for next distance is available */
distance_index = log2(distance); if (NULL == coll->buffers || NULL == coll->buffers[distance]) {
if ((NULL != coll->buffers) && (NULL != coll->buffers[distance_index])) { break;
}
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub %u distance data found", "%s grpcomm:coll:recdub %u distance data found",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, coll->buffers[distance_index]))) { if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, coll->buffers[distance]))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc); rcd_finalize_coll(coll, rc);
return; return;
} }
coll->nreported += distance; coll->nreported += 1 << distance;
orte_grpcomm_base_mark_distance_recv(coll, distance); orte_grpcomm_base_mark_distance_recv(coll, distance);
OBJ_RELEASE(coll->buffers[distance_index]); OBJ_RELEASE(coll->buffers[distance]);
coll->buffers[distance_index] = NULL; coll->buffers[distance] = NULL;
distance = distance << 1; ++distance;
continue;
}
break;
} }
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub reported %lu process from %lu", "%s grpcomm:coll:recdub reported %lu process from %lu",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)coll->nreported, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)coll->nreported,
(unsigned long)coll->ndmns)); (unsigned long)coll->ndmns));
/* if we are done, then complete things */ /* if we are done, then complete things */
if (coll->nreported >= coll->ndmns){ if (coll->nreported == coll->ndmns) {
rcd_finalize_coll(coll, ORTE_SUCCESS); rcd_finalize_coll(coll, ORTE_SUCCESS);
} }
return;
} }
static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender, static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
@ -226,7 +229,7 @@ static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
void* cbdata) void* cbdata)
{ {
int32_t cnt; int32_t cnt;
uint32_t distance, distance_index; uint32_t distance;
int rc; int rc;
orte_grpcomm_signature_t *sig; orte_grpcomm_signature_t *sig;
orte_grpcomm_coll_t *coll; orte_grpcomm_coll_t *coll;
@ -250,17 +253,17 @@ static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
return; return;
} }
/* unpack the distance */ /* unpack the distance */
distance = 0; distance = -1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_UINT32))) { if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_UINT32))) {
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc); rcd_finalize_coll(coll, rc);
return; return;
} }
assert(0 == orte_grpcomm_base_check_distance_recv(coll, distance)); assert(distance >= 0 && 0 == orte_grpcomm_base_check_distance_recv(coll, distance));
/* Check whether we can process next distance */ /* Check whether we can process next distance */
if (orte_grpcomm_base_check_distance_recv(coll, (distance >> 1))) { if (coll->nreported && (!distance || orte_grpcomm_base_check_distance_recv(coll, (distance - 1)))) {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub data from %d distance received, " "%s grpcomm:coll:recdub data from %d distance received, "
"Process the next distance.", "Process the next distance.",
@ -272,32 +275,30 @@ static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
rcd_finalize_coll(coll, rc); rcd_finalize_coll(coll, rc);
return; return;
} }
coll->nreported += distance; coll->nreported += (1 << distance);
orte_grpcomm_base_mark_distance_recv(coll, distance); orte_grpcomm_base_mark_distance_recv (coll, distance);
rcd_allgather_process_data(coll, (uint32_t)(distance << 1)); rcd_allgather_process_data (coll, distance + 1);
} else { } else {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub data from %d distance received, " "%s grpcomm:coll:recdub data from %d distance received, "
"still waiting for data.", "still waiting for data.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
if (NULL == coll->buffers) { if (NULL == coll->buffers) {
if (NULL == (coll->buffers = (opal_buffer_t **)calloc(sizeof(opal_buffer_t *), log2(coll->ndmns)))) { coll->buffers = (opal_buffer_t **) calloc (log2 (coll->ndmns), sizeof (coll->buffers[0]));
rc = OPAL_ERR_OUT_OF_RESOURCE; if (NULL == coll->buffers) {
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
rcd_finalize_coll(coll, rc); rcd_finalize_coll(coll, OPAL_ERR_OUT_OF_RESOURCE);
return; return;
} }
} }
distance_index = log2(distance); if (NULL == (coll->buffers[distance] = OBJ_NEW(opal_buffer_t))) {
if (NULL == (coll->buffers[distance_index] = OBJ_NEW(opal_buffer_t))) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE);
rcd_finalize_coll(coll, rc); rcd_finalize_coll(coll, OPAL_ERR_OUT_OF_RESOURCE);
return; return;
} }
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance_index], buffer))) { if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance], buffer))) {
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc); rcd_finalize_coll(coll, rc);
@ -306,8 +307,6 @@ static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
} }
OBJ_RELEASE(sig); OBJ_RELEASE(sig);
return;
} }
static int rcd_finalize_coll(orte_grpcomm_coll_t *coll, int ret) static int rcd_finalize_coll(orte_grpcomm_coll_t *coll, int ret)