
The problem was tracked to use of the grpcomm.onesided_barrier to control daemon/mpirun termination. This relied on messaging -and- required that the program counter jump from the errmgr back to grpcomm. On rare occasions, this jump did not occur, causing mpirun to hang. This patch looks more invasive than it is - most of the affected files simply had one or two lines removed. The essence of the change is:

* pulled the job_complete and quit routines out of orterun and orted_main and put them in a common place
* modified the errmgr to directly call the new routines when termination is detected
* removed the grpcomm.onesided_barrier and its associated RML tag
* added a new "num_routes" API to the routed framework that reports back the number of dependent routes. When route_lost is called, the daemon's list of "children" is checked and adjusted if that route went to a "leaf" in the routing tree
* used connection termination between daemons to track rollup of the daemon tree. Daemons and HNP now terminate once num_routes returns zero (sketched below)

Also picked up in this commit: the addition of a new bool flag to the app_context struct, and an increase of the job_control field from 8 to 16 bits. Both trivial.

This commit was SVN r23429.
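For orientation, here is a minimal sketch of the daemon-side rollup check the bullets describe, assuming the new API is exposed through the routed module interface as orte_routed.num_routes() and returns a count comparable to zero; the helper name and its body are illustrative, not taken from the patch:

/* Illustrative sketch only - not part of the file below. It shows the rollup
 * logic described above; the function name and call site are hypothetical.
 */
static void sketch_check_rollup(void)
{
    /* the OOB reports a lost connection and routed prunes that peer from its
     * "children" list if it was a leaf; once nothing below us depends on
     * this daemon, it is safe to invoke the common quit routine
     */
    if (0 == orte_routed.num_routes()) {
        /* ... call the shared job_complete/quit path here ... */
    }
}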
/*
 * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include <string.h>
#include <fcntl.h>

#include "opal/dss/dss.h"
#include "opal/runtime/opal.h"
#include "opal/threads/mutex.h"
#include "opal/threads/condition.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rmcast/rmcast.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/util/nidmap.h"
#include "orte/orted/orted.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"

#include "orte/mca/grpcomm/base/base.h"
#include "grpcomm_mcast.h"

/* Static APIs */
static int init(void);
static void finalize(void);
static int xcast(orte_jobid_t job,
                 opal_buffer_t *buffer,
                 orte_rml_tag_t tag);
static int mcast_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf);
static int mcast_barrier(void);
static int modex(opal_list_t *procs);
static int get_proc_attr(const orte_process_name_t proc,
                         const char * attribute_name, void **val,
                         size_t *size);

/* Module def */
orte_grpcomm_base_module_t orte_grpcomm_mcast_module = {
    init,
    finalize,
    xcast,
    mcast_allgather,
    orte_grpcomm_base_allgather_list,
    mcast_barrier,
    orte_grpcomm_base_set_proc_attr,
    get_proc_attr,
    modex,
    orte_grpcomm_base_purge_proc_attrs
};
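/* entries above that are prefixed with orte_grpcomm_base_ are the shared
 * implementations provided by the grpcomm base; the rest are the
 * multicast-specific functions defined in this file
 */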

/* Local functions */
static void daemon_recv(int status,
                        orte_rmcast_channel_t channel,
                        orte_rmcast_tag_t tag,
                        orte_process_name_t *sender,
                        opal_buffer_t *buf, void* cbdata);

/* Local variables */
static orte_grpcomm_collective_t barrier, allgather;
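/* the barrier and allgather trackers each hold a lock/condition pair and a
 * "recvd" counter (plus, for allgather, a results buffer); the recv callbacks
 * below signal completion through them while callers block in
 * opal_condition_wait
 */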

/**
 * Initialize the module
 */
static int init(void)
{
    int rc;

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_init())) {
        ORTE_ERROR_LOG(rc);
    }

    /* setup global variables */
    OBJ_CONSTRUCT(&barrier, orte_grpcomm_collective_t);
    OBJ_CONSTRUCT(&allgather, orte_grpcomm_collective_t);

    /* point to our collective function */
    orte_grpcomm_base.daemon_coll = orte_grpcomm_mcast_daemon_coll;

    /* if we are a daemon or the hnp, we need to post a
     * recv to catch any collective operations or cmds
     */
    if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
        if (ORTE_SUCCESS != (rc = orte_rmcast.recv_buffer_nb(ORTE_RMCAST_SYS_CHANNEL,
                                                             ORTE_RMCAST_TAG_WILDCARD,
                                                             ORTE_RMCAST_PERSISTENT,
                                                             daemon_recv,
                                                             NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}

/**
 * Finalize the module
 */
static void finalize(void)
{
    orte_grpcomm_base_modex_finalize();

    /* cancel the recv we posted */
    if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
        orte_rmcast.cancel_recv(ORTE_RMCAST_SYS_CHANNEL, ORTE_RMCAST_TAG_WILDCARD);
    }

    /* destruct the globals */
    OBJ_DESTRUCT(&barrier);
    OBJ_DESTRUCT(&allgather);
}

/**
 * A "broadcast-like" function to a job's processes.
 * @param jobid The job whose processes are to receive the message
 * @param buffer The data to broadcast
 */
static int xcast(orte_jobid_t job,
                 opal_buffer_t *buffer,
                 orte_rml_tag_t tag)
{
    int rc = ORTE_SUCCESS;
    int32_t n;
    opal_buffer_t buf;
    orte_rml_tag_t rmltag;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:xcast sent to job %s tag %ld",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), (long)tag));

    /* if there is no message to send, then just return ok */
    if (NULL == buffer) {
        return ORTE_SUCCESS;
    }

    /* setup a buffer to handle the xcast command to an app */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* insert the target tag */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &tag, 1, ORTE_RML_TAG_T))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* load the std data */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_pack_xcast(ORTE_DAEMON_PROCESS_CMD,
                                                               job, &buf, buffer, tag))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* send it */
    if (ORTE_SUCCESS != (rc = orte_rmcast.send_buffer(ORTE_RMCAST_SYS_CHANNEL,
                                                      ORTE_RMCAST_TAG_MSG, &buf))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* unpack the rml tag so the buffer is in the right place
     * for processing
     */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &rmltag, &n, ORTE_RML_TAG_T))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* multicast will not deliver it to myself, so do it manually */
    ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);

CLEANUP:
    OBJ_DESTRUCT(&buf);
    return rc;
}
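
/* Illustrative use of the xcast entry point through the framework-level
 * interface (a sketch only; the tag and payload shown are examples, not
 * taken from this file):
 *
 *     opal_buffer_t cmd;
 *     OBJ_CONSTRUCT(&cmd, opal_buffer_t);
 *     ...pack a daemon command and its payload into cmd...
 *     orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, &cmd, ORTE_RML_TAG_DAEMON);
 *     OBJ_DESTRUCT(&cmd);
 */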

static void barrier_recv(int status, orte_process_name_t* sender,
                         opal_buffer_t *buffer,
                         orte_rml_tag_t tag, void *cbdata)
{
    orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata;

    OPAL_THREAD_LOCK(&coll->lock);
    /* flag as recvd */
    coll->recvd = 1;
    opal_condition_broadcast(&coll->cond);
    OPAL_THREAD_UNLOCK(&coll->lock);
}
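
/**
 * Barrier operation. Daemons and the HNP multicast a barrier message to their
 * own job (ORTE_RML_TAG_XCAST_BARRIER) and block until every daemon has been
 * accounted for; application procs instead signal their local daemon over RML
 * and wait for the release via barrier_recv() above.
 */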
static int mcast_barrier(void)
{
    int rc;
    opal_buffer_t buf;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:mcast entering barrier",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if I am alone, just return */
    if (1 == orte_process_info.num_procs) {
        return ORTE_SUCCESS;
    }

    /* if I am a daemon, then multicast the barrier to
     * all other daemons and wait to hear them all
     */
    if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
        OPAL_THREAD_LOCK(&barrier.lock);
        barrier.recvd += 1;  /* account for me */
        OPAL_THREAD_UNLOCK(&barrier.lock);

        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        /* send to everyone in my job */
        if (ORTE_SUCCESS != (rc = xcast(ORTE_PROC_MY_NAME->jobid, &buf, ORTE_RML_TAG_XCAST_BARRIER))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return rc;
        }
        OBJ_DESTRUCT(&buf);

        /* wait to complete */
        OPAL_THREAD_LOCK(&barrier.lock);
        while (barrier.recvd < orte_process_info.num_procs) {
            opal_condition_wait(&barrier.cond, &barrier.lock);
        }
        barrier.recvd = 0;  /* reset for next time */
        OPAL_THREAD_UNLOCK(&barrier.lock);

        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                             "%s grpcomm:mcast received barrier release",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        return ORTE_SUCCESS;
    }

    /* if I am an application process, then I must start by sending an RML
     * message to my local daemon. I cannot just multicast to all other procs
     * in my job as this barrier might be occurring during startup - and the
     * other procs might not have started yet, and so will miss my message
     */

    /* setup the recv to get the response */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BARRIER,
                                 ORTE_RML_NON_PERSISTENT, barrier_recv, &barrier);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* send it and wait for the response */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_barrier(ORTE_PROC_MY_DAEMON, &barrier))) {
        ORTE_ERROR_LOG(rc);
    }
    OBJ_DESTRUCT(&buf);

    /* don't need to cancel the recv as it only fires once */

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base.output,
                         "%s grpcomm:mcast received barrier release",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return rc;
}

static void allgather_recv(int status, orte_process_name_t* sender,
                           opal_buffer_t *buffer,
                           orte_rml_tag_t tag, void *cbdata)
{
    orte_grpcomm_collective_t *coll = (orte_grpcomm_collective_t*)cbdata;
    int rc;

    OPAL_THREAD_LOCK(&coll->lock);
    /* xfer the data */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll->results, buffer))) {
        ORTE_ERROR_LOG(rc);
    }
    /* the daemon returns ALL of our recipients in a single message */
    coll->recvd = orte_process_info.num_procs;
    opal_condition_broadcast(&coll->cond);
    OPAL_THREAD_UNLOCK(&coll->lock);
}
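
/**
 * Allgather operation. Each proc posts a recv on ORTE_RML_TAG_ALLGATHER,
 * sends its contribution to its local daemon, and blocks until the daemon
 * returns the collected results in a single message (see allgather_recv).
 */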
static int mcast_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:mcast entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup to receive results */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                 ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* everyone sends data to their local daemon and waits for response */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_app_allgather(ORTE_PROC_MY_DAEMON,
                                                              &allgather, sbuf, rbuf))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* don't need to cancel the recv as it only fires once */

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:mcast allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return rc;
}

/*** MODEX SECTION ***/
static int modex(opal_list_t *procs)
{
    int rc = ORTE_SUCCESS;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:mcast: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we were given a list of procs to modex with, then this is happening
     * as part of a connect/accept operation. In this case, we -must- do the
     * modex for two reasons:
     *
     * (a) the modex could involve procs from different mpiruns. In this case,
     *     there is no way for the two sets of procs to know which node the
     *     other procs are on, so we cannot use the profile_file to determine
     *     their contact info
     *
     * (b) in a comm_spawn, the parent job does not have a pidmap for the
     *     child job. Thus, it cannot know where the child procs are located,
     *     and cannot use the profile_file to determine their contact info
     */
    if (NULL != procs) {
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs, false))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (OPAL_ENABLE_HETEROGENEOUS_SUPPORT) {
        /* decide if we need to add the architecture to the modex. Check
         * first to see if hetero is enabled - if not, then we clearly
         * don't need to exchange arch's as they are all identical
         */
        /* Case 1: If different apps in this job were built differently - e.g., some
         * are built 32-bit while others are built 64-bit - then we need to modex
         * regardless of any other consideration. The user is reqd to tell us via a
         * cmd line option if this situation exists, which will result in an mca param
         * being set for us, so all we need to do is check for the global boolean
         * that corresponds to that param
         *
         * Case 2: the nodes are hetero, but the app binaries were built
         * the same - i.e., either they are both 32-bit, or they are both 64-bit, but
         * no mixing of the two. In this case, we include the info in the modex
         */
        if (orte_hetero_apps || !orte_homogeneous_nodes) {
            OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                                 "%s grpcomm:mcast: modex is required",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            if (ORTE_SUCCESS != (rc = orte_grpcomm_base_peer_modex(false))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }
    }

    /* no modex is required - see if the data was included in the launch message */
    if (orte_send_profile) {
        /* the info was provided in the nidmap - there is nothing more we have to do */
        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                             "%s grpcomm:mcast:modex using nidmap",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:mcast: modex completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return rc;
}
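
/**
 * Retrieve a modex attribute for the given proc. The attribute is looked up
 * in the nidmap entry for the proc's node; on success the caller receives a
 * malloc'd copy of the data in *val and its length in *size, or NULL/0 if
 * the attribute is not present.
 */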
static int get_proc_attr(const orte_process_name_t proc,
                         const char * attribute_name, void **val,
                         size_t *size)
{
    orte_nid_t *nid;
    opal_list_item_t *item;
    orte_attr_t *attr;

    /* find this proc's node in the nidmap */
    if (NULL == (nid = orte_util_lookup_nid((orte_process_name_t*)&proc))) {
        /* proc wasn't found - return error */
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:mcast:get_proc_attr: no modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc)));
        return ORTE_ERR_NOT_FOUND;
    }

    /* look for this attribute */
    for (item = opal_list_get_first(&nid->attrs);
         item != opal_list_get_end(&nid->attrs);
         item = opal_list_get_next(item)) {
        attr = (orte_attr_t*)item;
        if (0 == strcmp(attr->name, attribute_name)) {
            /* copy the data to the caller */
            void *copy = malloc(attr->size);

            if (copy == NULL) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            memcpy(copy, attr->bytes, attr->size);
            *val = copy;
            *size = attr->size;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                                 "%s grpcomm:mcast:get_proc_attr: found %d bytes for attr %s on proc %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)attr->size,
                                 attribute_name, ORTE_NAME_PRINT(&proc)));
            return ORTE_SUCCESS;
        }
    }

    /* get here if attribute isn't found */
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:mcast:get_proc_attr: no attr avail or zero byte size for proc %s attribute %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&proc), attribute_name));
    *val = NULL;
    *size = 0;

    return ORTE_SUCCESS;
}
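
/* recv callback for the multicast posted in init(): every message on the
 * system channel carries an embedded rml tag; daemon cmds are handed to the
 * cmd processor, while barrier/allgather traffic updates the local trackers
 */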
static void daemon_recv(int status,
                        orte_rmcast_channel_t channel,
                        orte_rmcast_tag_t tag,
                        orte_process_name_t *sender,
                        opal_buffer_t *buf, void* cbdata)
{
    int32_t n;
    orte_rml_tag_t rmltag;
    int rc;

    /* unpack the rml tag */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &rmltag, &n, ORTE_RML_TAG_T))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* dispatch on the embedded rml tag - the rmcast tag only identifies
     * the channel stream, not the intended handler
     */
    switch (rmltag) {
    case ORTE_RML_TAG_DAEMON:
        /* this is a cmd, so deliver it */
        ORTE_MESSAGE_EVENT(sender, buf, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
        break;

    case ORTE_RML_TAG_BARRIER:
        OPAL_THREAD_LOCK(&barrier.lock);
        /* the recv is the trigger */
        barrier.recvd = 1;
        opal_condition_broadcast(&barrier.cond);
        OPAL_THREAD_UNLOCK(&barrier.lock);
        break;

    case ORTE_RML_TAG_ALLGATHER:
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd += 1;
        /* xfer the data */
        opal_dss.copy_payload(&allgather.results, buf);
        /* check for completion */
        if (orte_process_info.num_procs <= allgather.recvd) {
            opal_condition_broadcast(&allgather.cond);
        }
        OPAL_THREAD_UNLOCK(&allgather.lock);
        break;

    default:
        break;
    }
}

/* this function gets called when the daemon has received input from all
 * of its local procs
 */
void orte_grpcomm_mcast_daemon_coll(orte_process_name_t* sender, opal_buffer_t* buffer)
{
    opal_buffer_t buf;
    int32_t n;
    orte_jobid_t jobid;
    orte_rml_tag_t rmltag;
    int rc;

    /* we have to partially unpack the provided buffer so it can be
     * reconstructed properly for use here
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* unpack the jobid */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* unpack the target tag */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rmltag, &n, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* pack things in the proper order */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

CLEANUP:
    /* release the locally constructed buffer */
    OBJ_DESTRUCT(&buf);
    return;
}