1
1

fixes for grpcomm rcd/brucks algorithms

Этот коммит содержится в:
Elena 2014-10-09 06:12:26 +02:00
родитель 9947758d98
Коммит e319c95267
11 изменённых файлов: 606 добавлений и 436 удалений

Просмотреть файл

@ -30,6 +30,7 @@
#include "orte_config.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/dss/dss_types.h"
#include "opal/mca/mca.h"
#include "opal/mca/hwloc/hwloc.h"
@ -67,6 +68,7 @@ OBJ_CLASS_DECLARATION(orte_grpcomm_base_active_t);
/* Framework-global state of the grpcomm base. All three members are
 * constructed when the framework is opened and destructed when it is closed. */
typedef struct {
/* NOTE(review): presumably the list of active grpcomm modules - confirm against the select code */
opal_list_t actives;
/* collective trackers currently in progress; newly created trackers are appended here */
opal_list_t ongoing;
/* hash table keyed by the raw bytes of a signature's process-name array; the value is a
 * pointer to the sequence number last assigned to a collective with that signature */
opal_hash_table_t sig_table;
} orte_grpcomm_base_t;
ORTE_DECLSPEC extern orte_grpcomm_base_t orte_grpcomm_base;

Просмотреть файл

@ -70,6 +70,7 @@ static int orte_grpcomm_base_close(void)
}
OPAL_LIST_DESTRUCT(&orte_grpcomm_base.actives);
OPAL_LIST_DESTRUCT(&orte_grpcomm_base.ongoing);
OBJ_DESTRUCT(&orte_grpcomm_base.sig_table);
return mca_base_framework_components_close(&orte_grpcomm_base_framework, NULL);
}
@ -82,6 +83,8 @@ static int orte_grpcomm_base_open(mca_base_open_flag_t flags)
{
OBJ_CONSTRUCT(&orte_grpcomm_base.actives, opal_list_t);
OBJ_CONSTRUCT(&orte_grpcomm_base.ongoing, opal_list_t);
OBJ_CONSTRUCT(&orte_grpcomm_base.sig_table, opal_hash_table_t);
opal_hash_table_init(&orte_grpcomm_base.sig_table, 128);
return mca_base_framework_components_open(&orte_grpcomm_base_framework, flags);
}
@ -97,6 +100,7 @@ static void scon(orte_grpcomm_signature_t *p)
{
p->signature = NULL;
p->sz = 0;
p->seq_num = 0;
}
static void sdes(orte_grpcomm_signature_t *p)
{
@ -115,8 +119,10 @@ static void ccon(orte_grpcomm_coll_t *p)
p->dmns = NULL;
p->ndmns = 0;
p->nreported = 0;
p->distance_mask_recv = 0;
p->cbfunc = NULL;
p->cbdata = NULL;
p->buffers = NULL;
}
static void cdes(orte_grpcomm_coll_t *p)
{
@ -127,6 +133,7 @@ static void cdes(orte_grpcomm_coll_t *p)
if (NULL != p->dmns) {
free(p->dmns);
}
free(p->buffers);
}
OBJ_CLASS_INSTANCE(orte_grpcomm_coll_t,
opal_list_item_t,

Просмотреть файл

@ -128,9 +128,11 @@ int orte_grpcomm_API_xcast(orte_grpcomm_signature_t *sig,
static void allgather_stub(int fd, short args, void *cbdata)
{
orte_grpcomm_caddy_t *cd = (orte_grpcomm_caddy_t*)cbdata;
int ret = OPAL_SUCCESS;
int rc;
orte_grpcomm_base_active_t *active;
orte_grpcomm_coll_t *coll;
void *seq_number;
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:base:allgather stub",
@ -139,6 +141,28 @@ static void allgather_stub(int fd, short args, void *cbdata)
/* retrieve an existing tracker, create it if not
* already found. The allgather module is responsible
* for releasing it upon completion of the collective */
ret = opal_hash_table_get_value_ptr(&orte_grpcomm_base.sig_table, (void *)cd->sig->signature, cd->sig->sz * sizeof(orte_process_name_t), &seq_number);
if (OPAL_ERR_NOT_FOUND == ret) {
cd->sig->seq_num = 0;
} else if (OPAL_SUCCESS == ret) {
cd->sig->seq_num = *((uint32_t *)(seq_number)) + 1;
} else {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"%s rpcomm:base:allgather can't not get signature from hash table",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cd);
return;
}
ret = opal_hash_table_set_value_ptr(&orte_grpcomm_base.sig_table, (void *)cd->sig->signature, cd->sig->sz * sizeof(orte_process_name_t), (void *)&cd->sig->seq_num);
if (OPAL_SUCCESS != ret) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"%s rpcomm:base:allgather can't not add new signature to hash table",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(cd);
return;
}
coll = orte_grpcomm_base_get_tracker(cd->sig, true);
coll->cbfunc = cd->cbfunc;
coll->cbdata = cd->cbdata;
@ -169,9 +193,8 @@ int orte_grpcomm_API_allgather(orte_grpcomm_signature_t *sig,
* access framework-global data safely */
cd = OBJ_NEW(orte_grpcomm_caddy_t);
/* ensure the data doesn't go away */
OBJ_RETAIN(sig);
OBJ_RETAIN(buf);
cd->sig = sig;
opal_dss.copy((void **)&cd->sig, (void *)sig, ORTE_SIGNATURE);
cd->buf = buf;
cd->cbfunc = cbfunc;
cd->cbdata = cbdata;
@ -197,7 +220,7 @@ orte_grpcomm_coll_t* orte_grpcomm_base_get_tracker(orte_grpcomm_signature_t *sig
/* if only one is NULL, then we can't possibly match */
break;
}
if (OPAL_EQUAL == opal_dss.compare(sig, coll->sig, ORTE_SIGNATURE)) {
if (OPAL_EQUAL == (rc = opal_dss.compare(sig, coll->sig, ORTE_SIGNATURE))) {
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:base:returning existing collective",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -213,16 +236,17 @@ orte_grpcomm_coll_t* orte_grpcomm_base_get_tracker(orte_grpcomm_signature_t *sig
return NULL;
}
coll = OBJ_NEW(orte_grpcomm_coll_t);
opal_dss.copy((void **)&coll->sig, (void *)sig, ORTE_SIGNATURE);
if (1 < opal_output_get_verbosity(orte_grpcomm_base_framework.framework_output)) {
char *tmp=NULL;
(void)opal_dss.print(&tmp, NULL, sig, ORTE_SIGNATURE);
(void)opal_dss.print(&tmp, NULL, coll->sig, ORTE_SIGNATURE);
opal_output(0, "%s grpcomm:base: creating new coll for procs %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
free(tmp);
}
coll = OBJ_NEW(orte_grpcomm_coll_t);
OBJ_RETAIN(sig);
coll->sig = sig;
opal_list_append(&orte_grpcomm_base.ongoing, &coll->super);
/* now get the daemons involved */

Просмотреть файл

@ -6,6 +6,8 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -39,7 +41,8 @@ static int xcast(orte_vpid_t *vpids,
opal_buffer_t *msg);
static int allgather(orte_grpcomm_coll_t *coll,
opal_buffer_t *buf);
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance);
static void brks_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance);
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name_t *peer, uint32_t distance);
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
@ -91,65 +94,28 @@ static int allgather(orte_grpcomm_coll_t *coll,
"%s grpcomm:coll:bruck algo employed for %d processes",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns));
/* if we only have one proc participating, just copy the data across and return */
if ((coll->ndmns != 0) && ((coll->ndmns & (coll->ndmns - 1)) == 0)) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck number of participating daemons (%d) is power 2",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int) coll->ndmns ));
return ORTE_ERROR;
}
/* start by seeding the collection with our own data */
opal_dss.copy_payload(&coll->bucket, sendbuf);
/* record that we contributed */
coll->nreported += 1;
/* mark local data received */
coll->distance_mask_recv |= 1;
/* start by seeding the collection with our own data */
opal_dss.copy_payload(&coll->bucket, sendbuf);
/* Communication step:
At every step i, rank r:
- doubles the distance
- sends message containing all data collected so far to rank r - distance
- receives message containing all data collected so far from rank (r + distance)
*/
/* find my position in the group of participants. This
* value is the "rank" we will use in the algo
*/
brks_allgather_send_dist(coll, 1);
/* process data */
brks_allgather_process_data(coll, 1);
return ORTE_SUCCESS;
}
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance) {
orte_process_name_t peer_send, peer_recv;
static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name_t *peer, uint32_t distance) {
opal_buffer_t *send_buf;
int rc;
peer_send.jobid = ORTE_PROC_MY_NAME->jobid;
peer_recv.jobid = ORTE_PROC_MY_NAME->jobid;
if (1 == coll->ndmns) {
peer_send.vpid = ORTE_PROC_MY_NAME->vpid;
peer_recv.vpid = ORTE_PROC_MY_NAME->vpid;
} else {
orte_vpid_t nv, rank;
rank = ORTE_VPID_INVALID;
for (nv = 0; nv < coll->ndmns; nv++) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == rank) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* first send my current contents */
nv = (coll->ndmns + rank - distance) % coll->ndmns;
peer_send.vpid = coll->dmns[nv];
/* now setup to recv from my other partner */
nv = (rank + distance) % coll->ndmns;
peer_recv.vpid = coll->dmns[nv];
}
send_buf = OBJ_NEW(opal_buffer_t);
/* pack the signature */
@ -164,12 +130,6 @@ static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t dista
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the number of reported processes */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &coll->nreported, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the data */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(send_buf, &coll->bucket))) {
ORTE_ERROR_LOG(rc);
@ -178,11 +138,12 @@ static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t dista
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck sending to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer_send)));
"%s grpcomm:coll:brks SENDING TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer)));
if (0 > (rc = orte_rml.send_buffer_nb(&peer_send, send_buf,
if (0 > (rc = orte_rml.send_buffer_nb(peer, send_buf,
ORTE_RML_TAG_ALLGATHER_BRKS,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
@ -190,27 +151,95 @@ static int brks_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t dista
return rc;
};
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:bruck receiving from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer_recv)));
return ORTE_SUCCESS;
}
/*
 * Advance the Bruck allgather of one collective, starting at the given distance.
 *
 * Communication step - at every step i, rank r:
 *   - sends a message containing all data collected so far to rank (r - distance)
 *   - receives a message containing all data collected so far from rank (r + distance)
 *   - doubles the distance
 *
 * The receive side is asynchronous: if the message for the current distance
 * has already been parked in coll->buffers it is merged into the bucket and
 * the loop moves on to the next distance; otherwise the loop stops and the
 * receive callback re-enters this function once that message arrives.
 *
 * The collective is finalized when every daemon has been accounted for, or
 * as soon as an error occurs (rank lookup, send or payload copy failure).
 *
 * @param coll      tracker of the collective being progressed
 * @param distance  distance to start from (1, 2, 4, ...)
 */
static void brks_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance) {
    orte_process_name_t peer;
    orte_vpid_t nv, rank;
    int rc;

    peer.jobid = ORTE_PROC_MY_NAME->jobid;

    /* get my own rank, i.e. my position in the array of participating daemons */
    rank = ORTE_VPID_INVALID;
    for (nv = 0; nv < coll->ndmns; nv++) {
        if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
            rank = nv;
            break;
        }
    }
    /* check for bozo case */
    if (ORTE_VPID_INVALID == rank) {
        OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
                     "Peer not found"));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        brks_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
        return;
    }

    while (distance < coll->ndmns) {
        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks process distance %u (mask recv: 0x%x)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance, coll->distance_mask_recv));

        /* first send my current contents */
        nv = (coll->ndmns + rank - distance) % coll->ndmns;
        peer.vpid = coll->dmns[nv];
        /* a failed send can never be completed by the peer, so fail the
         * collective instead of waiting forever (the error is already logged
         * by the send routine) */
        if (ORTE_SUCCESS != (rc = brks_allgather_send_dist(coll, &peer, distance))) {
            brks_finalize_coll(coll, rc);
            return;
        }

        /* check whether data for this distance has already been received */
        if ((NULL == coll->buffers) || (NULL == coll->buffers[distance - 1])) {
            /* not yet - the receive callback will resume the processing */
            break;
        }

        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:brks %u distance data found",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, coll->buffers[distance - 1]))) {
            ORTE_ERROR_LOG(rc);
            brks_finalize_coll(coll, rc);
            return;
        }
        coll->nreported += distance;
        /* shift an unsigned 1 so that distance 31 does not overflow a signed int.
         * NOTE(review): the mask is indexed by the distance itself, so a
         * distance >= 32 (more than 32 daemons) still exceeds the 32-bit mask;
         * fixing that requires changing the receive callback's test as well. */
        coll->distance_mask_recv |= ((uint32_t)1 << distance);
        OBJ_RELEASE(coll->buffers[distance - 1]);
        coll->buffers[distance - 1] = NULL;
        distance = distance << 1;
    }

    OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:brks reported %lu process from %lu",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)coll->nreported,
                         (unsigned long)coll->ndmns));

    /* if we are done, then complete things */
    if (coll->nreported >= coll->ndmns) {
        brks_finalize_coll(coll, ORTE_SUCCESS);
    }
}
static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
int32_t cnt, num_remote;
int32_t cnt;
int rc;
orte_grpcomm_signature_t *sig;
orte_grpcomm_coll_t *coll;
orte_vpid_t distance, new_distance;
uint32_t distance;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub received data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
"%s grpcomm:coll:brks RECEIVING FROM %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* unpack the signature */
cnt = 1;
@ -225,7 +254,6 @@ static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
OBJ_RELEASE(sig);
return;
}
/* unpack the distance */
distance = 1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
@ -234,31 +262,51 @@ static void brks_allgather_recv_dist(int status, orte_process_name_t* sender,
brks_finalize_coll(coll, rc);
return;
}
assert(0 == (coll->distance_mask_recv & (uint32_t)(1 << distance)));
/* unpack number of reported processes */
num_remote = 0;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &num_remote, &cnt, OPAL_INT32))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
coll->nreported += num_remote;
/* capture any provided content */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
//update distance and send
new_distance = distance <<= 1;
if (new_distance < coll->ndmns) {
brks_allgather_send_dist(coll, new_distance);
/* Check whether we can process next distance */
if (coll->distance_mask_recv & ((uint32_t)(1 << (distance >> 1)))) {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks data from %d distance received, "
"Process the next distance (mask recv: 0x%x).",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance, coll->distance_mask_recv));
/* capture any provided content */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
coll->nreported += distance;
coll->distance_mask_recv |= (uint32_t)(1 << distance);
brks_allgather_process_data(coll, (uint32_t)(distance << 1));
} else {
brks_finalize_coll(coll, ORTE_SUCCESS);
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks data from %d distance received, "
"still waiting for data (mask recv: 0x%x).",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance, coll->distance_mask_recv));
if (NULL == coll->buffers) {
if (NULL == (coll->buffers = (opal_buffer_t **)calloc(sizeof(opal_buffer_t *), coll->ndmns - 1))) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
}
if (NULL == (coll->buffers[distance - 1] = OBJ_NEW(opal_buffer_t))) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance - 1], buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
brks_finalize_coll(coll, rc);
return;
}
}
OBJ_RELEASE(sig);
@ -270,6 +318,10 @@ static int brks_finalize_coll(orte_grpcomm_coll_t *coll, int ret) {
opal_buffer_t *reply;
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:brks declared collective complete",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
reply = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &coll->nreported, 1, OPAL_UINT64))) {

Просмотреть файл

@ -57,6 +57,7 @@ typedef struct {
opal_object_t super;
orte_process_name_t *signature;
size_t sz;
uint32_t seq_num;
} orte_grpcomm_signature_t;
OBJ_CLASS_DECLARATION(orte_grpcomm_signature_t);
@ -73,6 +74,10 @@ typedef struct {
size_t ndmns;
/* number reported in */
size_t nreported;
/* distance masks for receive */
uint32_t distance_mask_recv;
/* received buckets */
opal_buffer_t ** buffers;
/* callback function */
orte_grpcomm_cbfunc_t cbfunc;
/* user-provided callback data */

Просмотреть файл

@ -6,6 +6,8 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -18,6 +20,7 @@
#include "orte/types.h"
#include "orte/runtime/orte_wait.h"
#include <math.h>
#include <string.h>
#include "opal/dss/dss.h"
@ -39,11 +42,13 @@ static int xcast(orte_vpid_t *vpids,
opal_buffer_t *msg);
static int allgather(orte_grpcomm_coll_t *coll,
opal_buffer_t *buf);
static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance);
static void rcd_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance);
static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name_t *peer, uint32_t distance);
static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata);
static int rcd_finalize_coll(orte_grpcomm_coll_t *coll, int ret);
/* Module def */
orte_grpcomm_base_module_t orte_grpcomm_rcd_module = {
init,
@ -85,58 +90,35 @@ static int xcast(orte_vpid_t *vpids,
static int allgather(orte_grpcomm_coll_t *coll,
opal_buffer_t *sendbuf)
{
/* check the number of involved daemons - if it is not a power of two,
* then we cannot do it */
if (0 == ((coll->ndmns != 0) && !(coll->ndmns & (coll->ndmns - 1)))) {
return ORTE_ERR_TAKE_NEXT_OPTION;
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub algo employed for %d processes",
"%s grpcomm:coll:recdub algo employed for %d daemons",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns));
/* if we only have one proc participating, just copy the data across and return */
if (!((coll->ndmns != 0) && ((coll->ndmns & (coll->ndmns - 1)) == 0))) {
OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub number of participating daemons (%d) is not power 2",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns ));
return ORTE_ERROR;
}
/* record that we contributed */
coll->nreported += 1;
/* mark local data received */
coll->distance_mask_recv |= 1;
/* start by seeding the collection with our own data */
opal_dss.copy_payload(&coll->bucket, sendbuf);
/* Communication step:
At every step i, rank r:
- exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
*/
rcd_allgather_send_dist(coll, 1);
/* process data */
rcd_allgather_process_data(coll, 1);
return ORTE_SUCCESS;
}
static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distance) {
orte_process_name_t peer;
static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_process_name_t *peer, uint32_t distance) {
opal_buffer_t *send_buf;
int rc;
peer.jobid = ORTE_PROC_MY_NAME->jobid;
if (1 == coll->ndmns) {
peer.vpid = ORTE_PROC_MY_NAME->vpid;
} else {
orte_vpid_t nv, rank;
rank = ORTE_VPID_INVALID;
for (nv = 0; nv < coll->ndmns; nv++) {
if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
rank = nv;
break;
}
}
/* check for bozo case */
if (ORTE_VPID_INVALID == rank) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* first send my current contents */
nv = rank ^ distance;
peer.vpid = coll->dmns[nv];
}
send_buf = OBJ_NEW(opal_buffer_t);
/* pack the signature */
@ -145,14 +127,8 @@ static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distan
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the current distance */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &distance, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
}
/* pack the number of reported processes */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &coll->nreported, 1, OPAL_INT32))) {
/* pack the distance */
if (OPAL_SUCCESS != (rc = opal_dss.pack(send_buf, &distance, 1, OPAL_UINT32))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
@ -165,40 +141,106 @@ static int rcd_allgather_send_dist(orte_grpcomm_coll_t *coll, orte_vpid_t distan
}
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub sending to %s",
"%s grpcomm:coll:recdub SENDING TO %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer)));
ORTE_NAME_PRINT(peer)));
if (0 > (rc = orte_rml.send_buffer_nb(&peer, send_buf,
if (0 > (rc = orte_rml.send_buffer_nb(peer, send_buf,
ORTE_RML_TAG_ALLGATHER_RCD,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(send_buf);
return rc;
};
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub receiving from %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer)));
return ORTE_SUCCESS;
}
/*
 * Advance the recursive-doubling allgather of one collective, starting at
 * the given distance.
 *
 * Communication step - at every step i, rank r exchanges a message containing
 * all data collected so far with rank peer = (r ^ 2^i).
 *
 * The receive side is asynchronous: if the peer's message for the current
 * distance has already been parked in coll->buffers it is merged into the
 * bucket and the loop moves on to the next distance; otherwise the loop stops
 * and the receive callback re-enters this function once that message arrives.
 *
 * The collective is finalized when every daemon has been accounted for, or
 * as soon as an error occurs (rank lookup, send or payload copy failure).
 *
 * @param coll      tracker of the collective being progressed
 * @param distance  distance to start from (1, 2, 4, ...)
 */
static void rcd_allgather_process_data(orte_grpcomm_coll_t *coll, uint32_t distance) {
    orte_process_name_t peer;
    orte_vpid_t nv, rank;
    uint32_t distance_index, tmp;
    int rc;

    peer.jobid = ORTE_PROC_MY_NAME->jobid;

    /* get my own rank, i.e. my position in the array of participating daemons */
    rank = ORTE_VPID_INVALID;
    for (nv = 0; nv < coll->ndmns; nv++) {
        if (coll->dmns[nv] == ORTE_PROC_MY_NAME->vpid) {
            rank = nv;
            break;
        }
    }
    /* check for bozo case */
    if (ORTE_VPID_INVALID == rank) {
        OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
                     "Peer not found"));
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rcd_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
        return;
    }

    while (distance < coll->ndmns) {
        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:recdub process distance %u (mask recv: 0x%x)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance, coll->distance_mask_recv));

        /* first send my current contents */
        nv = rank ^ distance;
        peer.vpid = coll->dmns[nv];
        /* a failed send can never be completed by the peer, so fail the
         * collective instead of waiting forever (the error is already logged
         * by the send routine) */
        if (ORTE_SUCCESS != (rc = rcd_allgather_send_dist(coll, &peer, distance))) {
            rcd_finalize_coll(coll, rc);
            return;
        }

        /* slot of this distance in coll->buffers: floor(log2(distance)),
         * computed with integers to avoid a floating-point round trip */
        distance_index = 0;
        for (tmp = distance; tmp > 1; tmp >>= 1) {
            distance_index++;
        }

        /* check whether data for this distance has already been received */
        if ((NULL == coll->buffers) || (NULL == coll->buffers[distance_index])) {
            /* not yet - the receive callback will resume the processing */
            break;
        }

        OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:recdub %u distance data found",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance));
        if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, coll->buffers[distance_index]))) {
            ORTE_ERROR_LOG(rc);
            rcd_finalize_coll(coll, rc);
            return;
        }
        coll->nreported += distance;
        /* shift an unsigned 1 so that distance 31 does not overflow a signed int.
         * NOTE(review): the mask is indexed by the distance itself, so a
         * distance >= 32 (more than 32 daemons) still exceeds the 32-bit mask;
         * fixing that requires changing the receive callback's test as well. */
        coll->distance_mask_recv |= ((uint32_t)1 << distance);
        OBJ_RELEASE(coll->buffers[distance_index]);
        coll->buffers[distance_index] = NULL;
        distance = distance << 1;
    }

    OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:recdub reported %lu process from %lu",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)coll->nreported,
                         (unsigned long)coll->ndmns));

    /* if we are done, then complete things */
    if (coll->nreported >= coll->ndmns) {
        rcd_finalize_coll(coll, ORTE_SUCCESS);
    }
}
static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
opal_buffer_t* buffer, orte_rml_tag_t tag,
void* cbdata)
{
int32_t cnt, num_remote;
int32_t cnt;
uint32_t distance, distance_index;
int rc;
orte_grpcomm_signature_t *sig;
orte_grpcomm_coll_t *coll;
orte_vpid_t distance, new_distance;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub received data",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
"%s grpcomm:coll:recdub RECEIVING FROM %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender)));
/* unpack the signature */
cnt = 1;
@ -213,40 +255,60 @@ static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
OBJ_RELEASE(sig);
return;
}
/* unpack the distance */
distance = 1;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
distance = 0;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_UINT32))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
assert(0 == (coll->distance_mask_recv & (uint32_t)(1 << distance)));
/* unpack number of reported */
num_remote = 0;
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &num_remote, &cnt, OPAL_INT32))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
coll->nreported += num_remote;
/* capture any provided content */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
//update distance and send
new_distance = distance <<= 1;
if (new_distance < coll->ndmns) {
rcd_allgather_send_dist(coll, new_distance);
/* Check whether we can process next distance */
if (coll->distance_mask_recv & ((uint32_t)(1 << (distance >> 1)))) {
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub data from %d distance received, "
"Process the next distance (mask recv: 0x%x).",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance, coll->distance_mask_recv));
/* capture any provided content */
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
coll->nreported += distance;
coll->distance_mask_recv |= (uint32_t)(1 << distance);
rcd_allgather_process_data(coll, (uint32_t)(distance << 1));
} else {
rcd_finalize_coll(coll, ORTE_SUCCESS);
OPAL_OUTPUT_VERBOSE((80, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub data from %d distance received, "
"still waiting for data (mask recv: 0x%x).",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), distance, coll->distance_mask_recv));
if (NULL == coll->buffers) {
if (NULL == (coll->buffers = (opal_buffer_t **)calloc(sizeof(opal_buffer_t *), log2(coll->ndmns)))) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
}
distance_index = log2(distance);
if (NULL == (coll->buffers[distance_index] = OBJ_NEW(opal_buffer_t))) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(coll->buffers[distance_index], buffer))) {
OBJ_RELEASE(sig);
ORTE_ERROR_LOG(rc);
rcd_finalize_coll(coll, rc);
return;
}
}
OBJ_RELEASE(sig);
@ -258,6 +320,10 @@ static int rcd_finalize_coll(orte_grpcomm_coll_t *coll, int ret) {
opal_buffer_t *reply;
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
"%s grpcomm:coll:recdub declared collective complete",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
reply = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != (rc = opal_dss.pack(reply, &coll->nreported, 1, OPAL_UINT64))) {

Просмотреть файл

@ -49,14 +49,14 @@ int orte_dt_compare_name(orte_process_name_t *value1,
} else if (NULL == value2) {
return OPAL_VALUE1_GREATER;
}
/* If any of the fields are wildcard,
* then we want to just ignore that one field. In the case
* of ORTE_NAME_WILDCARD (where ALL of the fields are wildcard), this
* will automatically result in OPAL_EQUAL for any name in the other
* value - a totally useless result, but consistent in behavior.
*/
/** check the jobids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
@ -68,7 +68,7 @@ int orte_dt_compare_name(orte_process_name_t *value1,
return OPAL_VALUE1_GREATER;
}
}
/** check the vpids - if one of them is WILDCARD, then ignore
* this field since anything is okay
*/
@ -92,11 +92,11 @@ int orte_dt_compare_vpid(orte_vpid_t *value1,
/** if either value is WILDCARD, then return equal */
if (*value1 == ORTE_VPID_WILDCARD ||
*value2 == ORTE_VPID_WILDCARD) return OPAL_EQUAL;
if (*value1 > *value2) return OPAL_VALUE1_GREATER;
if (*value2 > *value1) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -107,11 +107,11 @@ int orte_dt_compare_jobid(orte_jobid_t *value1,
/** if either value is WILDCARD, then return equal */
if (*value1 == ORTE_JOBID_WILDCARD ||
*value2 == ORTE_JOBID_WILDCARD) return OPAL_EQUAL;
if (*value1 > *value2) return OPAL_VALUE1_GREATER;
if (*value2 > *value1) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -133,12 +133,12 @@ int orte_dt_compare_job(orte_job_t *value1, orte_job_t *value2, opal_data_type_t
/**
 * Compare two node objects by name.
 *
 * @param value1  first node
 * @param value2  second node
 * @param type    data type identifier (not used by the comparison)
 * @return OPAL_EQUAL when the names are identical, OPAL_VALUE1_GREATER when
 *         value1's name sorts after value2's, OPAL_VALUE2_GREATER otherwise
 */
int orte_dt_compare_node(orte_node_t *value1, orte_node_t *value2, opal_data_type_t type)
{
    int test;

    /** check node names */
    test = strcmp(value1->name, value2->name);
    if (0 == test) return OPAL_EQUAL;
    /* strcmp() > 0 means the first string is the greater one, so value1 is
     * greater (the result used to be reported the wrong way round) */
    if (0 < test) return OPAL_VALUE1_GREATER;
    return OPAL_VALUE2_GREATER;
}
@ -162,7 +162,7 @@ int orte_dt_compare_app_context(orte_app_context_t *value1, orte_app_context_t *
{
if (value1->idx > value2->idx) return OPAL_VALUE1_GREATER;
if (value2->idx > value1->idx) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -174,9 +174,9 @@ int orte_dt_compare_exit_code(orte_exit_code_t *value1,
opal_data_type_t type)
{
if (*value1 > *value2) return OPAL_VALUE1_GREATER;
if (*value2 > *value1) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -188,9 +188,9 @@ int orte_dt_compare_node_state(orte_node_state_t *value1,
orte_node_state_t type)
{
if (*value1 > *value2) return OPAL_VALUE1_GREATER;
if (*value2 > *value1) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -202,9 +202,9 @@ int orte_dt_compare_proc_state(orte_proc_state_t *value1,
orte_proc_state_t type)
{
if (*value1 > *value2) return OPAL_VALUE1_GREATER;
if (*value2 > *value1) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -216,9 +216,9 @@ int orte_dt_compare_job_state(orte_job_state_t *value1,
orte_job_state_t type)
{
if (*value1 > *value2) return OPAL_VALUE1_GREATER;
if (*value2 > *value1) return OPAL_VALUE2_GREATER;
return OPAL_EQUAL;
}
@ -248,9 +248,9 @@ int orte_dt_compare_tags(orte_rml_tag_t *value1, orte_rml_tag_t *value2, opal_da
/**
 * Compare two daemon command flags by numeric value.
 *
 * @return OPAL_EQUAL, OPAL_VALUE1_GREATER or OPAL_VALUE2_GREATER
 */
int orte_dt_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_flag_t *value2, opal_data_type_t type)
{
    if (*value1 == *value2) {
        return OPAL_EQUAL;
    }
    return (*value1 > *value2) ? OPAL_VALUE1_GREATER : OPAL_VALUE2_GREATER;
}
@ -258,9 +258,9 @@ int orte_dt_compare_daemon_cmd(orte_daemon_cmd_flag_t *value1, orte_daemon_cmd_f
/**
 * Compare two IOF tags by numeric value.
 *
 * @return OPAL_EQUAL, OPAL_VALUE1_GREATER or OPAL_VALUE2_GREATER
 */
int orte_dt_compare_iof_tag(orte_iof_tag_t *value1, orte_iof_tag_t *value2, opal_data_type_t type)
{
    int result = OPAL_EQUAL;

    if (*value1 > *value2) {
        result = OPAL_VALUE1_GREATER;
    } else if (*value2 > *value1) {
        result = OPAL_VALUE2_GREATER;
    }
    return result;
}
@ -285,11 +285,16 @@ int orte_dt_compare_sig(orte_grpcomm_signature_t *value1, orte_grpcomm_signature
}
if (value2->sz > value1->sz) {
return OPAL_VALUE2_GREATER;
}
if (value1->seq_num > value2->seq_num) {
return OPAL_VALUE1_GREATER;
}
if (value2->seq_num > value1->seq_num) {
return OPAL_VALUE2_GREATER;
}
/* same size - check contents */
if (0 == memcmp(value1->signature, value2->signature, value1->sz*sizeof(orte_process_name_t))) {
return OPAL_EQUAL;
}
return OPAL_VALUE2_GREATER;
}

Просмотреть файл

@ -36,19 +36,19 @@
#include "orte/runtime/data_type_support/orte_dt_support.h"
/* ORTE_STD_CNTR */
int orte_dt_copy_std_cntr(orte_std_cntr_t **dest, orte_std_cntr_t *src, opal_data_type_t type)
/**
 * Duplicate an orte_std_cntr_t into freshly malloc'ed storage.
 *
 * @param dest  receives the pointer to the new copy; left untouched on failure
 * @param src   value to copy
 * @param type  data type identifier (unused)
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if the allocation fails
 */
int orte_dt_copy_std_cntr(orte_std_cntr_t **dest, orte_std_cntr_t *src, opal_data_type_t type)
{
    orte_std_cntr_t *copy = (orte_std_cntr_t*)malloc(sizeof(*copy));

    if (NULL != copy) {
        *copy = *src;
        *dest = copy;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -56,16 +56,16 @@ int orte_dt_copy_std_cntr(orte_std_cntr_t **dest, orte_std_cntr_t *src, opal_dat
/**
 * Duplicate a process name (jobid and vpid) into freshly malloc'ed storage.
 *
 * @param dest  receives the pointer to the new copy; left untouched on failure
 * @param src   name to copy
 * @param type  data type identifier (unused)
 * @return ORTE_SUCCESS, or ORTE_ERR_OUT_OF_RESOURCE if the allocation fails
 */
int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, opal_data_type_t type)
{
    orte_process_name_t *name_copy;

    name_copy = (orte_process_name_t*)malloc(sizeof(*name_copy));
    if (NULL == name_copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* copy the two fields individually, then hand the result to the caller */
    name_copy->vpid = src->vpid;
    name_copy->jobid = src->jobid;
    *dest = name_copy;

    return ORTE_SUCCESS;
}
@ -76,16 +76,16 @@ int orte_dt_copy_name(orte_process_name_t **dest, orte_process_name_t *src, opal
/*
 * Duplicate a jobid.
 *
 * Stores in *dest a malloc'd orte_jobid_t equal to *src and returns
 * ORTE_SUCCESS. When malloc fails the error is logged, *dest is not
 * written, and ORTE_ERR_OUT_OF_RESOURCE is returned. The type argument
 * is unused.
 */
int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, opal_data_type_t type)
{
    orte_jobid_t *copy = (orte_jobid_t*)malloc(sizeof(orte_jobid_t));

    if (NULL != copy) {
        *copy = *src;
        *dest = copy;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -95,16 +95,16 @@ int orte_dt_copy_jobid(orte_jobid_t **dest, orte_jobid_t *src, opal_data_type_t
/*
 * Duplicate a vpid.
 *
 * Stores in *dest a malloc'd orte_vpid_t equal to *src and returns
 * ORTE_SUCCESS. When malloc fails the error is logged, *dest is not
 * written, and ORTE_ERR_OUT_OF_RESOURCE is returned. The type argument
 * is unused.
 */
int orte_dt_copy_vpid(orte_vpid_t **dest, orte_vpid_t *src, opal_data_type_t type)
{
    orte_vpid_t *copy = (orte_vpid_t*)malloc(sizeof(orte_vpid_t));

    if (NULL != copy) {
        *copy = *src;
        *dest = copy;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -115,7 +115,7 @@ int orte_dt_copy_job(orte_job_t **dest, orte_job_t *src, opal_data_type_t type)
{
(*dest) = src;
OBJ_RETAIN(src);
return ORTE_SUCCESS;
}
@ -152,7 +152,7 @@ int orte_dt_copy_app_context(orte_app_context_t **dest, orte_app_context_t *src,
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* copy data into it */
(*dest)->idx = src->idx;
if (NULL != src->app) {
@ -176,64 +176,64 @@ int orte_dt_copy_app_context(orte_app_context_t **dest, orte_app_context_t *src,
/*
 * Duplicate a process state value.
 *
 * A new orte_proc_state_t is malloc'd, set to *src and returned through
 * *dest with ORTE_SUCCESS. On allocation failure the error is logged
 * and ORTE_ERR_OUT_OF_RESOURCE is returned without touching *dest.
 * The type argument is unused.
 */
int orte_dt_copy_proc_state(orte_proc_state_t **dest, orte_proc_state_t *src, opal_data_type_t type)
{
    orte_proc_state_t *state = (orte_proc_state_t*)malloc(sizeof(orte_proc_state_t));

    if (NULL != state) {
        *state = *src;
        *dest = state;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
 * Duplicate a job state value.
 *
 * A new orte_job_state_t is malloc'd, set to *src and returned through
 * *dest with ORTE_SUCCESS. On allocation failure the error is logged
 * and ORTE_ERR_OUT_OF_RESOURCE is returned without touching *dest.
 * The type argument is unused.
 */
int orte_dt_copy_job_state(orte_job_state_t **dest, orte_job_state_t *src, opal_data_type_t type)
{
    orte_job_state_t *state = (orte_job_state_t*)malloc(sizeof(orte_job_state_t));

    if (NULL != state) {
        *state = *src;
        *dest = state;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
 * Duplicate a node state value.
 *
 * A new orte_node_state_t is malloc'd, set to *src and returned through
 * *dest with ORTE_SUCCESS. On allocation failure the error is logged
 * and ORTE_ERR_OUT_OF_RESOURCE is returned without touching *dest.
 * The type argument is unused.
 */
int orte_dt_copy_node_state(orte_node_state_t **dest, orte_node_state_t *src, opal_data_type_t type)
{
    orte_node_state_t *state = (orte_node_state_t*)malloc(sizeof(orte_node_state_t));

    if (NULL != state) {
        *state = *src;
        *dest = state;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
/*
 * Duplicate an exit code.
 *
 * A new orte_exit_code_t is malloc'd, set to *src and returned through
 * *dest with ORTE_SUCCESS. On allocation failure the error is logged
 * and ORTE_ERR_OUT_OF_RESOURCE is returned without touching *dest.
 * The type argument is unused.
 */
int orte_dt_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, opal_data_type_t type)
{
    orte_exit_code_t *code = (orte_exit_code_t*)malloc(sizeof(orte_exit_code_t));

    if (NULL != code) {
        *code = *src;
        *dest = code;
        return ORTE_SUCCESS;
    }

    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
    return ORTE_ERR_OUT_OF_RESOURCE;
}
@ -243,19 +243,19 @@ int orte_dt_copy_exit_code(orte_exit_code_t **dest, orte_exit_code_t *src, opal_
int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_t type)
{
orte_std_cntr_t i;
if (NULL == src) {
*dest = NULL;
return ORTE_SUCCESS;
}
/* create the new object */
*dest = OBJ_NEW(orte_job_map_t);
if (NULL == *dest) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* copy data into it */
(*dest)->mapping = src->mapping;
(*dest)->ranking = src->ranking;
@ -269,7 +269,7 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
(*dest)->num_new_daemons = src->num_new_daemons;
(*dest)->daemon_vpid_start = src->daemon_vpid_start;
(*dest)->num_nodes = src->num_nodes;
/* copy the pointer array - have to do this manually
* as no dss.copy function is setup for that object
*/
@ -281,7 +281,7 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
for (i=0; i < src->nodes->size; i++) {
(*dest)->nodes->addr[i] = src->nodes->addr[i];
}
return ORTE_SUCCESS;
}
@ -291,57 +291,57 @@ int orte_dt_copy_map(orte_job_map_t **dest, orte_job_map_t *src, opal_data_type_
/*
 * Duplicate an RML tag.
 *
 * A NULL src yields a NULL *dest and ORTE_SUCCESS. Otherwise a new
 * orte_rml_tag_t is malloc'd, set to *src and returned through *dest.
 * If the allocation fails the error is logged, *dest is left untouched
 * and ORTE_ERR_OUT_OF_RESOURCE is returned. The type argument is unused.
 */
int orte_dt_copy_tag(orte_rml_tag_t **dest, orte_rml_tag_t *src, opal_data_type_t type)
{
    orte_rml_tag_t *copy;

    if (NULL != src) {
        /* obtain storage for the duplicate */
        copy = (orte_rml_tag_t*)malloc(sizeof(orte_rml_tag_t));
        if (NULL == copy) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* fill it in and hand it back */
        *copy = *src;
        *dest = copy;
    } else {
        /* nothing to duplicate */
        *dest = NULL;
    }

    return ORTE_SUCCESS;
}
/*
 * Duplicate a daemon command flag.
 *
 * *dest is set to a malloc'd orte_daemon_cmd_flag_t and the bytes of
 * *src are copied into it. If malloc fails, *dest ends up NULL, the
 * error is logged and ORTE_ERR_OUT_OF_RESOURCE is returned; otherwise
 * ORTE_SUCCESS. The type argument is unused.
 */
int orte_dt_copy_daemon_cmd(orte_daemon_cmd_flag_t **dest, orte_daemon_cmd_flag_t *src, opal_data_type_t type)
{
    orte_daemon_cmd_flag_t *copy = (orte_daemon_cmd_flag_t*)malloc(sizeof(orte_daemon_cmd_flag_t));

    /* publish the pointer first so a failed allocation leaves *dest NULL */
    *dest = copy;
    if (NULL == copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    memcpy(copy, src, sizeof(orte_daemon_cmd_flag_t));
    return ORTE_SUCCESS;
}
/*
 * Duplicate an IOF tag.
 *
 * *dest is set to a malloc'd orte_iof_tag_t and the bytes of *src are
 * copied into it. If malloc fails, *dest ends up NULL, the error is
 * logged and ORTE_ERR_OUT_OF_RESOURCE is returned; otherwise
 * ORTE_SUCCESS. The type argument is unused.
 */
int orte_dt_copy_iof_tag(orte_iof_tag_t **dest, orte_iof_tag_t *src, opal_data_type_t type)
{
    orte_iof_tag_t *copy = (orte_iof_tag_t*)malloc(sizeof(orte_iof_tag_t));

    /* publish the pointer first so a failed allocation leaves *dest NULL */
    *dest = copy;
    if (NULL == copy) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    memcpy(copy, src, sizeof(orte_iof_tag_t));
    return ORTE_SUCCESS;
}
@ -356,7 +356,7 @@ int orte_dt_copy_attr(orte_attribute_t **dest, orte_attribute_t *src, opal_data_
(*dest)->type = src->type;
memcpy(&(*dest)->data, &src->data, sizeof(src->data));
return ORTE_SUCCESS;
}
@ -369,12 +369,12 @@ int orte_dt_copy_sig(orte_grpcomm_signature_t **dest, orte_grpcomm_signature_t *
}
(*dest)->sz = src->sz;
(*dest)->signature = (orte_process_name_t*)malloc(src->sz * sizeof(orte_process_name_t));
(*dest)->seq_num = src->seq_num;
if (NULL == (*dest)->signature) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
OBJ_RELEASE(*dest);
return ORTE_ERR_OUT_OF_RESOURCE;
}
memcpy(&(*dest)->signature, &src->signature, src->sz * sizeof(orte_process_name_t));
memcpy((*dest)->signature, src->signature, src->sz * sizeof(orte_process_name_t));
return ORTE_SUCCESS;
}

Просмотреть файл

@ -43,13 +43,13 @@ int orte_dt_pack_std_cntr(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int ret;
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (
ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_STD_CNTR_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -64,7 +64,7 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src,
orte_process_name_t* proc;
orte_jobid_t *jobid;
orte_vpid_t *vpid;
/* collect all the jobids in a contiguous array */
jobid = (orte_jobid_t*)malloc(num_vals * sizeof(orte_jobid_t));
if (NULL == jobid) {
@ -84,7 +84,7 @@ int orte_dt_pack_name(opal_buffer_t *buffer, const void *src,
return rc;
}
free(jobid);
/* collect all the vpids in a contiguous array */
vpid = (orte_vpid_t*)malloc(num_vals * sizeof(orte_vpid_t));
if (NULL == vpid) {
@ -115,13 +115,13 @@ int orte_dt_pack_jobid(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int ret;
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (
ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_JOBID_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -132,13 +132,13 @@ int orte_dt_pack_vpid(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int ret;
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (
ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_VPID_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -176,7 +176,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc);
return rc;
}
/* if there are apps, pack the app_contexts */
if (0 < jobs[i]->num_apps) {
for (j=0; j < jobs[i]->apps->size; j++) {
@ -189,7 +189,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
}
}
}
/* pack the number of procs and offset */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->num_procs)), 1, ORTE_VPID))) {
@ -220,14 +220,14 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the total slots allocated to the job */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->total_slots_alloc)), 1, ORTE_STD_CNTR))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if the map is NULL, then we cannot pack it as there is
* nothing to pack. However, we have to flag whether or not
* the map is included so the unpacking routine can know
@ -244,7 +244,7 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the map - this will only pack the fields that control
* HOW a job is to be mapped. We do -not- pack the mapped procs
* or nodes as this info does not need to be transmitted
@ -256,23 +256,23 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src,
return rc;
}
}
/* do not pack the bookmark or oversubscribe_override flags */
/* pack the job state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->state)), 1, ORTE_JOB_STATE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the flags */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(jobs[i]->flags)), 1, ORTE_JOB_FLAGS_T))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the attributes that need to be sent */
count = 0;
OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) {
@ -310,33 +310,33 @@ int orte_dt_pack_node(opal_buffer_t *buffer, const void *src,
/* array of pointers to orte_node_t objects - need to pack the objects a set of fields at a time */
nodes = (orte_node_t**) src;
for (i=0; i < num_vals; i++) {
/* do not pack the index - it is meaningless on the other end */
/* pack the node name */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(nodes[i]->name)), 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not pack the daemon name or launch id */
/* pack the number of procs on the node */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(nodes[i]->num_procs)), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not pack the procs */
/* pack whether we are oversubscribed or not */
flag = ORTE_FLAG_TEST(nodes[i], ORTE_NODE_FLAG_OVERSUBSCRIBED);
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&flag), 1, OPAL_UINT8))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&(nodes[i]->state)), 1, ORTE_NODE_STATE))) {
ORTE_ERROR_LOG(rc);
@ -379,7 +379,7 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
/* array of pointers to orte_proc_t objects - need to pack the objects a set of fields at a time */
procs = (orte_proc_t**) src;
for (i=0; i < num_vals; i++) {
/* pack the name */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
@ -387,7 +387,7 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the daemon/node it is on */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->parent)), 1, ORTE_VPID))) {
@ -401,21 +401,21 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the node rank */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->node_rank)), 1, ORTE_NODE_RANK))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the state */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->state)), 1, ORTE_PROC_STATE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the app context index */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer,
(void*)(&(procs[i]->app_idx)), 1, ORTE_STD_CNTR))) {
@ -443,7 +443,7 @@ int orte_dt_pack_proc(opal_buffer_t *buffer, const void *src,
}
}
}
return ORTE_SUCCESS;
}
@ -549,7 +549,7 @@ int orte_dt_pack_app_context(opal_buffer_t *buffer, const void *src,
}
}
}
return ORTE_SUCCESS;
}
@ -560,11 +560,11 @@ int orte_dt_pack_exit_code(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_EXIT_CODE_T))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -575,11 +575,11 @@ int orte_dt_pack_node_state(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_NODE_STATE_T))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -590,11 +590,11 @@ int orte_dt_pack_proc_state(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_PROC_STATE_T))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -605,11 +605,11 @@ int orte_dt_pack_job_state(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_JOB_STATE_T))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -624,10 +624,10 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
int rc;
int32_t i;
orte_job_map_t **maps;
/* array of pointers to orte_job_map_t objects - need to pack the objects a set of fields at a time */
maps = (orte_job_map_t**) src;
for (i=0; i < num_vals; i++) {
/* pack the requested mapper */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->req_mapper), 1, OPAL_STRING))) {
@ -655,7 +655,7 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the display map flag */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->display_map), 1, OPAL_BOOL))) {
ORTE_ERROR_LOG(rc);
@ -668,7 +668,7 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src,
}
}
return ORTE_SUCCESS;
}
@ -679,12 +679,12 @@ int orte_dt_pack_tag(opal_buffer_t *buffer, const void *src,
int32_t num_vals, opal_data_type_t type)
{
int rc;
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_RML_TAG_T))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
@ -695,12 +695,12 @@ int orte_dt_pack_daemon_cmd(opal_buffer_t *buffer, const void *src, int32_t num_
opal_data_type_t type)
{
int ret;
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_DAEMON_CMD_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -711,12 +711,12 @@ int orte_dt_pack_iof_tag(opal_buffer_t *buffer, const void *src, int32_t num_val
opal_data_type_t type)
{
int ret;
/* Turn around and pack the real type */
if (ORTE_SUCCESS != (ret = opal_dss_pack_buffer(buffer, src, num_vals, ORTE_IOF_TAG_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -732,7 +732,7 @@ int orte_dt_pack_attr(opal_buffer_t *buffer, const void *src, int32_t num_vals,
int ret;
ptr = (orte_attribute_t **) src;
for (i = 0; i < num_vals; ++i) {
/* pack the key and type */
if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->key, 1, ORTE_ATTR_KEY_T))) {
@ -873,13 +873,18 @@ int orte_dt_pack_sig(opal_buffer_t *buffer, const void *src, int32_t num_vals,
int rc;
ptr = (orte_grpcomm_signature_t **) src;
for (i = 0; i < num_vals; ++i) {
/* pack the #procs */
if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &ptr[i]->sz, 1, OPAL_SIZE))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* pack the sequence number */
if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &ptr[i]->seq_num, 1, OPAL_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < ptr[i]->sz) {
/* pack the array */
if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, ptr[i]->signature, ptr[i]->sz, ORTE_NAME))) {

Просмотреть файл

@ -49,24 +49,24 @@ static void orte_dt_quick_print(char **output, char *type_name, char *prefix, vo
uint16_t *ui16;
uint32_t *ui32;
uint64_t *ui64;
/* set default result */
*output = NULL;
/* check for NULL ptr */
if (NULL == src) {
asprintf(output, "%sData type: %s\tData size: 8-bit\tValue: NULL pointer",
(NULL == prefix) ? "" : prefix, type_name);
return;
}
switch(real_type) {
case OPAL_INT8:
i8 = (int8_t*)src;
asprintf(output, "%sData type: %s\tData size: 8-bit\tValue: %d",
(NULL == prefix) ? "" : prefix, type_name, (int) *i8);
break;
case OPAL_UINT8:
ui8 = (uint8_t*)src;
asprintf(output, "%sData type: %s\tData size: 8-bit\tValue: %u",
@ -75,40 +75,40 @@ static void orte_dt_quick_print(char **output, char *type_name, char *prefix, vo
case OPAL_INT16:
i16 = (int16_t*)src;
asprintf(output, "%sData type: %s\tData size: 16-bit\tValue: %d",
asprintf(output, "%sData type: %s\tData size: 16-bit\tValue: %d",
(NULL == prefix) ? "" : prefix, type_name, (int) *i16);
break;
case OPAL_UINT16:
ui16 = (uint16_t*)src;
asprintf(output, "%sData type: %s\tData size: 16-bit\tValue: %u",
asprintf(output, "%sData type: %s\tData size: 16-bit\tValue: %u",
(NULL == prefix) ? "" : prefix, type_name, (unsigned int) *ui16);
break;
case OPAL_INT32:
i32 = (int32_t*)src;
asprintf(output, "%sData type: %s\tData size: 32-bit\tValue: %ld",
(NULL == prefix) ? "" : prefix, type_name, (long) *i32);
break;
case OPAL_UINT32:
ui32 = (uint32_t*)src;
asprintf(output, "%sData type: %s\tData size: 32-bit\tValue: %lu",
(NULL == prefix) ? "" : prefix, type_name, (unsigned long) *ui32);
break;
case OPAL_INT64:
i64 = (int64_t*)src;
asprintf(output, "%sData type: %s\tData size: 64-bit\tValue: %ld",
(NULL == prefix) ? "" : prefix, type_name, (long) *i64);
break;
case OPAL_UINT64:
ui64 = (uint64_t*)src;
asprintf(output, "%sData type: %s\tData size: 64-bit\tValue: %lu",
(NULL == prefix) ? "" : prefix, type_name, (unsigned long) *ui64);
break;
default:
return;
}
@ -123,7 +123,7 @@ int orte_dt_std_print(char **output, char *prefix, void *src, opal_data_type_t t
{
/* set default result */
*output = NULL;
switch(type) {
case ORTE_STD_CNTR:
orte_dt_quick_print(output, "ORTE_STD_CNTR", prefix, src, ORTE_STD_CNTR_T);
@ -132,33 +132,33 @@ int orte_dt_std_print(char **output, char *prefix, void *src, opal_data_type_t t
case ORTE_VPID:
orte_dt_quick_print(output, "ORTE_VPID", prefix, src, ORTE_VPID_T);
break;
case ORTE_JOBID:
asprintf(output, "%sData Type: ORTE_JOBID\tData size: %lu\tValue: %s",
(NULL == prefix) ? "" : prefix, (unsigned long)sizeof(orte_jobid_t),
ORTE_JOBID_PRINT(*(orte_jobid_t*)src));
break;
case ORTE_PROC_STATE:
orte_dt_quick_print(output, "ORTE_PROC_STATE", prefix, src, ORTE_PROC_STATE_T);
break;
case ORTE_JOB_STATE:
orte_dt_quick_print(output, "ORTE_JOB_STATE", prefix, src, ORTE_JOB_STATE_T);
break;
case ORTE_NODE_STATE:
orte_dt_quick_print(output, "ORTE_NODE_STATE", prefix, src, ORTE_NODE_STATE_T);
break;
case ORTE_EXIT_CODE:
orte_dt_quick_print(output, "ORTE_EXIT_CODE", prefix, src, ORTE_EXIT_CODE_T);
break;
case ORTE_RML_TAG:
orte_dt_quick_print(output, "ORTE_RML_TAG", prefix, src, ORTE_RML_TAG_T);
break;
case ORTE_DAEMON_CMD:
orte_dt_quick_print(output, "ORTE_DAEMON_CMD", prefix, src, ORTE_DAEMON_CMD_T);
break;
@ -166,12 +166,12 @@ int orte_dt_std_print(char **output, char *prefix, void *src, opal_data_type_t t
case ORTE_IOF_TAG:
orte_dt_quick_print(output, "ORTE_IOF_TAG", prefix, src, ORTE_IOF_TAG_T);
break;
default:
ORTE_ERROR_LOG(ORTE_ERR_UNKNOWN_DATA_TYPE);
return ORTE_ERR_UNKNOWN_DATA_TYPE;
}
return ORTE_SUCCESS;
}
@ -182,7 +182,7 @@ int orte_dt_print_name(char **output, char *prefix, orte_process_name_t *name, o
{
/* set default result */
*output = NULL;
if (NULL == name) {
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: NULL",
(NULL == prefix ? " " : prefix));
@ -190,7 +190,7 @@ int orte_dt_print_name(char **output, char *prefix, orte_process_name_t *name, o
asprintf(output, "%sData type: ORTE_PROCESS_NAME\tData Value: %s",
(NULL == prefix ? " " : prefix), ORTE_NAME_PRINT(name));
}
return ORTE_SUCCESS;
}
@ -225,7 +225,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
orte_job_state_to_str(src->state), (ORTE_FLAG_TEST(src, ORTE_JOB_FLAG_ABORTED)) ? "True" : "False");
asprintf(&pfx, "%s\t", pfx2);
free(pfx2);
for (i=0; i < src->apps->size; i++) {
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(src->apps, i))) {
continue;
@ -236,7 +236,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
free(tmp2);
tmp = tmp3;
}
if (NULL != src->map) {
if (ORTE_SUCCESS != (rc = opal_dss.print(&tmp2, pfx, src->map, ORTE_JOB_MAP))) {
ORTE_ERROR_LOG(rc);
@ -251,7 +251,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
free(tmp);
tmp = tmp2;
}
asprintf(&tmp2, "%s\n%sNum procs: %ld\tOffset: %ld", tmp, pfx, (long)src->num_procs, (long)src->offset);
free(tmp);
tmp = tmp2;
@ -275,7 +275,7 @@ int orte_dt_print_job(char **output, char *prefix, orte_job_t *src, opal_data_ty
(long)src->num_terminated);
free(tmp);
tmp = tmp2;
/* set the return */
*output = tmp;
free(pfx);
@ -295,7 +295,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
char **alias;
/* set default result */
*output = NULL;
/* protect against NULL prefix */
if (NULL == prefix) {
asprintf(&pfx2, " ");
@ -326,7 +326,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
free(pfx2);
return ORTE_SUCCESS;
}
if (!orte_devel_level_output) {
/* just provide a simple output for users */
if (0 == src->num_procs) {
@ -371,7 +371,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
}
goto PRINT_PROCS;
}
asprintf(&tmp, "\n%sData for node: %s\tState: %0x",
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state);
/* does this node have any aliases? */
@ -388,7 +388,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
if (NULL != tmp3) {
free(tmp3);
}
if (NULL == src->daemon) {
asprintf(&tmp2, "%s\n%s\tDaemon: %s\tDaemon launched: %s", tmp, pfx2,
"Not defined", ORTE_FLAG_TEST(src, ORTE_NODE_FLAG_DAEMON_LAUNCHED) ? "True" : "False");
@ -399,18 +399,18 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
}
free(tmp);
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tNum slots: %ld\tSlots in use: %ld\tOversubscribed: %s", tmp, pfx2,
(long)src->slots, (long)src->slots_inuse,
ORTE_FLAG_TEST(src, ORTE_NODE_FLAG_OVERSUBSCRIBED) ? "TRUE" : "FALSE");
free(tmp);
tmp = tmp2;
asprintf(&tmp2, "%s\n%s\tNum slots allocated: %ld\tMax slots: %ld", tmp, pfx2,
(long)src->slots, (long)src->slots_max);
free(tmp);
tmp = tmp2;
tmp3 = NULL;
if (orte_get_attribute(&src->attributes, ORTE_NODE_USERNAME, (void**)&tmp3, OPAL_STRING)) {
asprintf(&tmp2, "%s\n%s\tUsername on node: %s", tmp, pfx2, tmp3);
@ -418,14 +418,14 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
free(tmp);
tmp = tmp2;
}
#if OPAL_HAVE_HWLOC
if (orte_display_topo_with_map && NULL != src->topology) {
char *pfx3;
asprintf(&tmp2, "%s\n%s\tDetected Resources:\n", tmp, pfx2);
free(tmp);
tmp = tmp2;
tmp2 = NULL;
asprintf(&pfx3, "%s\t\t", pfx2);
opal_dss.print(&tmp2, pfx3, src->topology, OPAL_HWLOC_TOPO);
@ -433,7 +433,7 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
asprintf(&tmp3, "%s%s", tmp, tmp2);
free(tmp);
free(tmp2);
tmp = tmp3;
tmp = tmp3;
}
#endif
@ -441,11 +441,11 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
(long)src->num_procs, (long)src->next_node_rank);
free(tmp);
tmp = tmp2;
PRINT_PROCS:
asprintf(&pfx, "%s\t", pfx2);
free(pfx2);
for (i=0; i < src->procs->size; i++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(src->procs, i))) {
continue;
@ -460,10 +460,10 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
tmp = tmp3;
}
free(pfx);
/* set the return */
*output = tmp;
return ORTE_SUCCESS;
}
@ -476,14 +476,14 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
/* set default result */
*output = NULL;
/* protect against NULL prefix */
if (NULL == prefix) {
asprintf(&pfx2, " ");
} else {
asprintf(&pfx2, "%s", prefix);
}
if (orte_xml_output) {
/* need to create the output in XML format */
if (0 == src->pid) {
@ -496,7 +496,7 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
free(pfx2);
return ORTE_SUCCESS;
}
if (!orte_devel_level_output) {
#if OPAL_HAVE_HWLOC
{
@ -544,12 +544,12 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
}
asprintf(&tmp, "\n%sData for proc: %s", pfx2, ORTE_NAME_PRINT(&src->name));
asprintf(&tmp2, "%s\n%s\tPid: %ld\tLocal rank: %lu\tNode rank: %lu\tApp rank: %d", tmp, pfx2,
(long)src->pid, (unsigned long)src->local_rank, (unsigned long)src->node_rank, src->app_rank);
free(tmp);
tmp = tmp2;
#if OPAL_HAVE_HWLOC
{
char *locale=NULL;
@ -588,10 +588,10 @@ int orte_dt_print_proc(char **output, char *prefix, orte_proc_t *src, opal_data_
orte_proc_state_to_str(src->state), (long)src->app_idx);
#endif
free(tmp);
/* set the return */
*output = tmp2;
free(pfx2);
return ORTE_SUCCESS;
}
@ -607,34 +607,34 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
/* set default result */
*output = NULL;
/* protect against NULL prefix */
if (NULL == prefix) {
asprintf(&pfx2, " ");
} else {
asprintf(&pfx2, "%s", prefix);
}
asprintf(&tmp, "\n%sData for app_context: index %lu\tapp: %s\n%s\tNum procs: %lu\tFirstRank: %s",
pfx2, (unsigned long)src->idx,
(NULL == src->app) ? "NULL" : src->app,
pfx2, (unsigned long)src->num_procs,
ORTE_VPID_PRINT(src->first_rank));
count = opal_argv_count(src->argv);
for (i=0; i < count; i++) {
asprintf(&tmp2, "%s\n%s\tArgv[%d]: %s", tmp, pfx2, i, src->argv[i]);
free(tmp);
tmp = tmp2;
}
count = opal_argv_count(src->env);
for (i=0; i < count; i++) {
asprintf(&tmp2, "%s\n%s\tEnv[%lu]: %s", tmp, pfx2, (unsigned long)i, src->env[i]);
free(tmp);
tmp = tmp2;
}
tmp3 = NULL;
orte_get_attribute(&src->attributes, ORTE_APP_PREFIX_DIR, (void**)&tmp3, OPAL_STRING);
asprintf(&tmp2, "%s\n%s\tWorking dir: %s\n%s\tPrefix: %s\n%s\tUsed on node: %s", tmp,
@ -643,7 +643,7 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
pfx2, ORTE_FLAG_TEST(src, ORTE_APP_FLAG_USED_ON_NODE) ? "TRUE" : "FALSE");
free(tmp);
tmp = tmp2;
OPAL_LIST_FOREACH(kv, &src->attributes, opal_value_t) {
opal_dss.print(&tmp2, pfx2, kv, ORTE_ATTRIBUTE);
asprintf(&tmp3, "%s\n%s", tmp, tmp2);
@ -654,7 +654,7 @@ int orte_dt_print_app_context(char **output, char *prefix, orte_app_context_t *s
/* set the return */
*output = tmp;
free(pfx2);
return ORTE_SUCCESS;
}
@ -669,17 +669,17 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
int rc;
orte_node_t *node;
orte_proc_t *proc;
/* set default result */
*output = NULL;
/* protect against NULL prefix */
if (NULL == prefix) {
asprintf(&pfx2, " ");
} else {
asprintf(&pfx2, "%s", prefix);
}
if (orte_xml_output) {
/* need to create the output in XML format */
asprintf(&tmp, "<map>\n");
@ -713,11 +713,11 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
free(pfx2);
*output = tmp2;
return ORTE_SUCCESS;
}
asprintf(&pfx, "%s\t", pfx2);
if (orte_devel_level_output) {
#if OPAL_HAVE_HWLOC
asprintf(&tmp, "\n%sMapper requested: %s Last mapper: %s Mapping policy: %s Ranking policy: %s\n%sBinding policy: %s Cpu set: %s PPR: %s Cpus-per-rank: %d",
@ -753,8 +753,8 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
/* this is being printed for a user, so let's make it easier to see */
asprintf(&tmp, "\n%s======================== JOB MAP ========================", pfx2);
}
for (i=0; i < src->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(src->nodes, i))) {
continue;
@ -770,7 +770,7 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
free(tmp2);
tmp = tmp3;
}
if (!orte_devel_level_output) {
/* this is being printed for a user, so let's make it easier to see */
asprintf(&tmp2, "%s\n\n%s=============================================================\n", tmp, pfx2);
@ -778,10 +778,10 @@ int orte_dt_print_map(char **output, char *prefix, orte_job_map_t *src, opal_dat
tmp = tmp2;
}
free(pfx2);
/* set the return */
*output = tmp;
free(pfx);
return ORTE_SUCCESS;
}
@ -795,14 +795,14 @@ int orte_dt_print_attr(char **output, char *prefix,
/* deal with NULL prefix */
if (NULL == prefix) asprintf(&prefx, " ");
else prefx = strdup(prefix);
/* if src is NULL, just print data type and return */
if (NULL == src) {
asprintf(output, "%sData type: ORTE_ATTR\tValue: NULL pointer", prefx);
free(prefx);
return OPAL_SUCCESS;
}
switch (src->type) {
case OPAL_STRING:
asprintf(output, "%sORTE_ATTR: Local: %s Data type: OPAL_STRING\tKey: %s\tValue: %s",
@ -902,7 +902,7 @@ int orte_dt_print_sig(char **output, char *prefix, orte_grpcomm_signature_t *src
/* deal with NULL prefix */
if (NULL == prefix) asprintf(&prefx, " ");
else prefx = strdup(prefix);
/* if src is NULL, just print data type and return */
if (NULL == src) {
asprintf(output, "%sData type: ORTE_SIG\tValue: NULL pointer", prefx);
@ -917,7 +917,7 @@ int orte_dt_print_sig(char **output, char *prefix, orte_grpcomm_signature_t *src
}
/* there must be at least one */
asprintf(&tmp, "%sORTE_SIG\tValue: ", prefx);
asprintf(&tmp, "%sORTE_SIG\tSeqNumber:%d\tValue: ", prefx, src->seq_num);
for (i=0; i < src->sz; i++) {
asprintf(&tmp2, "%s%s", tmp, ORTE_NAME_PRINT(&src->signature[i]));

Просмотреть файл

@ -39,12 +39,12 @@ int orte_dt_unpack_std_cntr(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_STD_CNTR_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -59,9 +59,9 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
orte_process_name_t* proc;
orte_jobid_t *jobid;
orte_vpid_t *vpid;
num = *num_vals;
/* allocate space for all the jobids in a contiguous array */
jobid = (orte_jobid_t*)malloc(num * sizeof(orte_jobid_t));
if (NULL == jobid) {
@ -77,7 +77,7 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
free(jobid);
return rc;
}
/* collect all the vpids in a contiguous array */
vpid = (orte_vpid_t*)malloc(num * sizeof(orte_vpid_t));
if (NULL == vpid) {
@ -95,7 +95,7 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
free(jobid);
return rc;
}
/* build the names from the jobid/vpid arrays */
proc = (orte_process_name_t*)dest;
for (i=0; i < num; i++) {
@ -103,11 +103,11 @@ int orte_dt_unpack_name(opal_buffer_t *buffer, void *dest,
proc->vpid = vpid[i];
proc++;
}
/* cleanup */
free(vpid);
free(jobid);
return ORTE_SUCCESS;
}
@ -118,12 +118,12 @@ int orte_dt_unpack_jobid(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_JOBID_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -134,12 +134,12 @@ int orte_dt_unpack_vpid(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_VPID_T))) {
ORTE_ERROR_LOG(ret);
}
return ret;
}
@ -186,7 +186,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* if there are apps, unpack them */
if (0 < jobs[i]->num_apps) {
orte_app_context_t *app;
@ -200,7 +200,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
opal_pointer_array_add(jobs[i]->apps, app);
}
}
/* unpack num procs and offset */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -235,7 +235,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the total slots allocated to the job */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -243,7 +243,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* if the map is NULL, then we didn't pack it as there was
* nothing to pack. Instead, we packed a flag to indicate whether or not
* the map is included */
@ -262,9 +262,9 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
return rc;
}
}
/* no bookmark or oversubscribe_override flags to unpack */
/* unpack the job state */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -272,7 +272,7 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the flags */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -318,16 +318,16 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
/* unpack into array of orte_node_t objects */
nodes = (orte_node_t**) dest;
for (i=0; i < *num_vals; i++) {
/* create the node object */
nodes[i] = OBJ_NEW(orte_node_t);
if (NULL == nodes[i]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* do not unpack the index - meaningless here */
/* unpack the node name */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -335,9 +335,9 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not unpack the daemon name or launch id */
/* unpack the number of procs on the node */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -345,9 +345,9 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* do not unpack the proc info */
/* unpack whether we are oversubscribed */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -366,7 +366,7 @@ int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the attributes */
n=1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
@ -398,18 +398,18 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
int32_t i, n, count, k;
orte_attribute_t *kv;;
orte_proc_t **procs;
/* unpack into array of orte_proc_t objects */
procs = (orte_proc_t**) dest;
for (i=0; i < *num_vals; i++) {
/* create the orte_proc_t object */
procs[i] = OBJ_NEW(orte_proc_t);
if (NULL == procs[i]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* unpack the name */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -417,7 +417,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the node it is on */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -425,7 +425,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the local rank */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -433,7 +433,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the node rank */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -441,7 +441,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the state */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -449,7 +449,7 @@ int orte_dt_unpack_proc(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the app context index */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -619,11 +619,11 @@ int orte_dt_unpack_exit_code(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_EXIT_CODE_T.
 * The incoming "type" argument is not consulted in this body. */
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_EXIT_CODE_T))) {
/* log the failure; execution still falls through to the return below */
ORTE_ERROR_LOG(rc);
}
/* hand back whatever opal_dss_unpack_buffer() returned */
return rc;
}
@ -634,11 +634,11 @@ int orte_dt_unpack_node_state(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_NODE_STATE_T.
 * The incoming "type" argument is not consulted in this body. */
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_NODE_STATE_T))) {
/* log the failure; execution still falls through to the return below */
ORTE_ERROR_LOG(rc);
}
/* hand back whatever opal_dss_unpack_buffer() returned */
return rc;
}
@ -649,11 +649,11 @@ int orte_dt_unpack_proc_state(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_PROC_STATE_T.
 * The incoming "type" argument is not consulted in this body. */
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_PROC_STATE_T))) {
/* log the failure; execution still falls through to the return below */
ORTE_ERROR_LOG(rc);
}
/* hand back whatever opal_dss_unpack_buffer() returned */
return rc;
}
@ -664,11 +664,11 @@ int orte_dt_unpack_job_state(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_JOB_STATE_T.
 * The incoming "type" argument is not consulted in this body. */
int rc;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_JOB_STATE_T))) {
/* log the failure; execution still falls through to the return below */
ORTE_ERROR_LOG(rc);
}
/* hand back whatever opal_dss_unpack_buffer() returned */
return rc;
}
@ -683,18 +683,18 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
int rc;
int32_t i, n;
orte_job_map_t **maps;
/* unpack into array of orte_job_map_t objects */
maps = (orte_job_map_t**) dest;
for (i=0; i < *num_vals; i++) {
/* create the orte_job_map_t object */
maps[i] = OBJ_NEW(orte_job_map_t);
if (NULL == maps[i]) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
/* unpack the requested mapper */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -702,7 +702,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the policies */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -731,7 +731,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
ORTE_ERROR_LOG(rc);
return rc;
}
/* unpack the display map flag */
n = 1;
if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
@ -747,7 +747,7 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest,
return rc;
}
}
return ORTE_SUCCESS;
}
@ -758,12 +758,12 @@ int orte_dt_unpack_tag(opal_buffer_t *buffer, void *dest,
int32_t *num_vals, opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_RML_TAG_T.
 * The incoming "type" argument is not consulted in this body. */
int ret;
/* Turn around and unpack the real type */
if (ORTE_SUCCESS != (ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_RML_TAG_T))) {
/* log the failure; execution still falls through to the return below */
ORTE_ERROR_LOG(ret);
}
/* hand back whatever opal_dss_unpack_buffer() returned */
return ret;
}
@ -774,10 +774,10 @@ int orte_dt_unpack_daemon_cmd(opal_buffer_t *buffer, void *dest, int32_t *num_va
opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_DAEMON_CMD_T.
 * The incoming "type" argument is not consulted in this body, and no
 * ORTE_ERROR_LOG call is made here: the result is returned as-is. */
int ret;
/* turn around and unpack the real type */
ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_DAEMON_CMD_T);
/* hand back whatever opal_dss_unpack_buffer() returned */
return ret;
}
@ -788,10 +788,10 @@ int orte_dt_unpack_iof_tag(opal_buffer_t *buffer, void *dest, int32_t *num_vals,
opal_data_type_t type)
{
/* Thin wrapper: forwards buffer, dest and num_vals unchanged to
 * opal_dss_unpack_buffer() with the data type fixed to ORTE_IOF_TAG_T.
 * The incoming "type" argument is not consulted in this body, and no
 * ORTE_ERROR_LOG call is made here: the result is returned as-is. */
int ret;
/* turn around and unpack the real type */
ret = opal_dss_unpack_buffer(buffer, dest, num_vals, ORTE_IOF_TAG_T);
/* hand back whatever opal_dss_unpack_buffer() returned */
return ret;
}
@ -808,7 +808,7 @@ int orte_dt_unpack_attr(opal_buffer_t *buffer, void *dest, int32_t *num_vals,
ptr = (orte_attribute_t **) dest;
n = *num_vals;
for (i = 0; i < n; ++i) {
/* allocate the new object */
ptr[i] = OBJ_NEW(orte_attribute_t);
@ -959,7 +959,7 @@ int orte_dt_unpack_sig(opal_buffer_t *buffer, void *dest, int32_t *num_vals,
ptr = (orte_grpcomm_signature_t **) dest;
n = *num_vals;
for (i = 0; i < n; ++i) {
/* allocate the new object */
ptr[i] = OBJ_NEW(orte_grpcomm_signature_t);
@ -972,6 +972,10 @@ int orte_dt_unpack_sig(opal_buffer_t *buffer, void *dest, int32_t *num_vals,
ORTE_ERROR_LOG(rc);
return rc;
}
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ptr[i]->seq_num, &cnt, OPAL_UINT32))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (0 < ptr[i]->sz) {
/* allocate space for the array */
ptr[i]->signature = (orte_process_name_t*)malloc(ptr[i]->sz * sizeof(orte_process_name_t));