2008-02-28 04:57:57 +03:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/types.h"
|
|
|
|
#include "orte/constants.h"
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
2009-03-18 01:37:15 +03:00
|
|
|
#include "opal/util/output.h"
|
2010-05-18 03:08:56 +04:00
|
|
|
#include "opal/util/opal_sos.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "opal/threads/tsd.h"
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 22:35:54 +04:00
|
|
|
#include "opal/mca/event/event.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
#include "opal/dss/dss.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
#include "orte/mca/odls/odls_types.h"
|
|
|
|
#include "orte/mca/rml/rml.h"
|
2009-02-14 05:26:12 +03:00
|
|
|
#include "orte/mca/rml/rml_types.h"
|
2009-09-09 09:28:45 +04:00
|
|
|
#include "orte/mca/rml/base/rml_contact.h"
|
|
|
|
#include "orte/mca/routed/routed.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/util/name_fns.h"
|
|
|
|
#include "orte/runtime/orte_globals.h"
|
2009-05-15 17:21:18 +04:00
|
|
|
#include "orte/runtime/orte_wait.h"
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
#include "orte/util/comm/comm.h"
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
/* internal communication handshake */
|
2009-05-15 17:21:18 +04:00
|
|
|
/* quick timeout loop */
|
|
|
|
static bool timer_fired;
|
|
|
|
static opal_buffer_t answer;
|
|
|
|
static opal_event_t *quicktime=NULL;
|
|
|
|
static int error_exit;
|
|
|
|
|
|
|
|
static void quicktime_cb(int fd, short event, void *cbdata)
|
|
|
|
{
|
|
|
|
if (NULL != quicktime) {
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 22:35:54 +04:00
|
|
|
OBJ_RELEASE(quicktime);
|
2009-05-15 17:21:18 +04:00
|
|
|
}
|
|
|
|
error_exit = ORTE_ERR_SILENT;
|
|
|
|
/* declare it fired */
|
|
|
|
timer_fired = true;
|
|
|
|
}
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
static void send_cbfunc(int status, orte_process_name_t* sender,
|
|
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata)
|
2009-05-15 17:21:18 +04:00
|
|
|
{
|
|
|
|
/* cancel the timer */
|
|
|
|
if (NULL != quicktime) {
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 22:35:54 +04:00
|
|
|
opal_event.evtimer_del(quicktime);
|
|
|
|
OBJ_RELEASE(quicktime);
|
2009-05-15 17:21:18 +04:00
|
|
|
}
|
|
|
|
/* declare the work done */
|
|
|
|
timer_fired = true;
|
|
|
|
}
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
static void recv_info(int status, orte_process_name_t* sender,
|
2009-05-21 06:42:21 +04:00
|
|
|
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
|
|
|
void* cbdata)
|
|
|
|
{
|
2009-09-09 09:28:45 +04:00
|
|
|
int rc;
|
|
|
|
|
2009-05-21 06:42:21 +04:00
|
|
|
/* cancel the timer */
|
|
|
|
if (NULL != quicktime) {
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 22:35:54 +04:00
|
|
|
opal_event.evtimer_del(quicktime);
|
|
|
|
OBJ_RELEASE(quicktime);
|
2009-05-21 06:42:21 +04:00
|
|
|
}
|
2009-09-09 09:28:45 +04:00
|
|
|
/* xfer the answer */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&answer, buffer))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
}
|
2009-05-21 06:42:21 +04:00
|
|
|
/* declare the work done */
|
|
|
|
timer_fired = true;
|
|
|
|
}
|
|
|
|
|
2009-09-09 09:28:45 +04:00
|
|
|
|
|
|
|
/* name of attached tool */
|
|
|
|
static orte_process_name_t tool;
|
|
|
|
static bool tool_connected = false;
|
|
|
|
|
|
|
|
/* connect a tool to us so we can send reports */
|
|
|
|
int orte_util_comm_connect_tool(char *uri)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/* set the contact info into the comm hash tables*/
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(uri))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return(rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* extract the tool's name and store it */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(uri, &tool, NULL))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set the route to be direct */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_routed.update_route(&tool, &tool))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
tool_connected = true;
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* whether we are in step mode: when true, orte_util_comm_report_event
 * waits for an answer from the tool after sending each report */
static bool step = false;
|
|
|
|
|
|
|
|
/* report an event to a connected tool */
|
|
|
|
int orte_util_comm_report_event(orte_comm_event_t ev)
|
|
|
|
{
|
|
|
|
int rc, i;
|
|
|
|
opal_buffer_t buf;
|
|
|
|
orte_node_t *node;
|
|
|
|
|
|
|
|
/* if nothing is connected, ignore this */
|
|
|
|
if (!tool_connected) {
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* init a buffer for the data */
|
|
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
|
|
|
/* flag the type of event */
|
|
|
|
opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT);
|
|
|
|
|
|
|
|
switch (ev) {
|
|
|
|
case ORTE_COMM_EVENT_ALLOCATE:
|
|
|
|
/* loop through nodes, storing just node names */
|
|
|
|
for (i=0; i < orte_node_pool->size; i++) {
|
|
|
|
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
opal_dss.pack(&buf, &node->name, 1, OPAL_STRING);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ORTE_COMM_EVENT_MAP:
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ORTE_COMM_EVENT_LAUNCH:
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
ORTE_ERROR_LOG(ORTE_ERROR);
|
|
|
|
OBJ_DESTRUCT(&buf);
|
|
|
|
return ORTE_ERROR;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* do the send */
|
|
|
|
if (0 > (rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_DESTRUCT(&buf);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (step) {
|
|
|
|
/* the caller wants to wait until an ack is received -
|
|
|
|
* define a max time to wait for an answer
|
|
|
|
*/
|
|
|
|
OBJ_CONSTRUCT(&answer, opal_buffer_t);
|
|
|
|
timer_fired = false;
|
|
|
|
error_exit = ORTE_SUCCESS;
|
|
|
|
ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);
|
|
|
|
|
|
|
|
/* get the answer */
|
|
|
|
if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
|
|
|
ORTE_RML_TAG_TOOL,
|
|
|
|
ORTE_RML_NON_PERSISTENT,
|
|
|
|
recv_info,
|
|
|
|
NULL))) {
|
|
|
|
/* cancel the timer */
|
|
|
|
if (NULL != quicktime) {
|
Update libevent to the 2.0 series, currently at 2.0.7rc. We will update to their final release when it becomes available. Currently known errors exist in unused portions of the libevent code. This revision passes the IBM test suite on a Linux machine and on a standalone Mac.
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects.
Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems.
Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework ala installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct.
I have gone thru the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things:
1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new)
2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it.
There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do.
This commit was SVN r23925.
2010-10-24 22:35:54 +04:00
|
|
|
opal_event.evtimer_del(quicktime);
|
|
|
|
OBJ_RELEASE(quicktime);
|
2009-09-09 09:28:45 +04:00
|
|
|
}
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
OBJ_DESTRUCT(&answer);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
|
|
|
|
|
|
|
|
/* cleanup */
|
|
|
|
OBJ_DESTRUCT(&answer);
|
|
|
|
|
|
|
|
if (ORTE_SUCCESS != error_exit) {
|
|
|
|
return error_exit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ORTE_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/*
 * Query the HNP for job information.
 *
 * An ORTE_DAEMON_REPORT_JOB_INFO_CMD carrying "job" is sent to "hnp" on
 * ORTE_RML_TAG_DAEMON; the reply is then awaited on ORTE_RML_TAG_TOOL.
 * Both the send and the wait for the reply are guarded by a timeout.
 *
 * hnp             process the query is sent to
 * job             jobid packed into the query
 * num_jobs        out: number of entries in *job_info_array; 0 unless
 *                 the call succeeds with at least one job
 * job_info_array  out: malloc'd array of unpacked orte_job_t pointers,
 *                 or NULL; this function keeps no reference to it
 *
 * Returns ORTE_SUCCESS, the error code of the failing
 * pack/send/recv/unpack call, ORTE_ERR_OUT_OF_RESOURCE if the result
 * array cannot be allocated, or ORTE_ERR_SILENT if a timeout expired.
 */
int orte_util_comm_query_job_info(const orte_process_name_t *hnp, orte_jobid_t job,
                                  int *num_jobs, orte_job_t ***job_info_array)
{
    int ret;
    int32_t cnt, cnt_jobs, n, m;
    opal_buffer_t *cmd;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_JOB_INFO_CMD;
    orte_job_t **job_info;

    /* set default response */
    *num_jobs = 0;
    *job_info_array = NULL;

    /* send query to HNP */
    cmd = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* define a max time to wait for send to complete */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

    /* do the send */
    if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
                                           send_cbfunc, NULL))) {
        /* send_cbfunc will never run, so cancel the timer here -
         * otherwise it stays armed and quicktime_cb would later disturb
         * an unrelated wait
         */
        if (NULL != quicktime) {
            opal_event.evtimer_del(quicktime);
            OBJ_RELEASE(quicktime);
        }
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* wait for send to complete (or for the timer to fire) */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    /* release the buffer */
    OBJ_RELEASE(cmd);

    /* did it succeed? */
    if (ORTE_SUCCESS != error_exit) {
        return error_exit;
    }

    /* setup for answer */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);

    /* define a max time to wait for an answer */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

    /* get the answer */
    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                       ORTE_RML_TAG_TOOL,
                                                       ORTE_RML_NON_PERSISTENT,
                                                       recv_info,
                                                       NULL))) {
        /* cancel the timer */
        if (NULL != quicktime) {
            opal_event.evtimer_del(quicktime);
            OBJ_RELEASE(quicktime);
        }
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* spin until recv_info or quicktime_cb sets the flag */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    if (ORTE_SUCCESS != error_exit) {
        /* NOTE(review): the receive posted above is presumably still
         * pending here; a late reply would reach recv_info after
         * "answer" has been destructed - confirm whether it needs to be
         * cancelled.
         */
        OBJ_DESTRUCT(&answer);
        return error_exit;
    }

    /* the answer starts with the number of jobs that follow */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_jobs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* allocate the required memory */
    if (0 < cnt_jobs) {
        job_info = (orte_job_t**)calloc(cnt_jobs, sizeof(orte_job_t*));
        if (NULL == job_info) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_DESTRUCT(&answer);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* unpack the job data */
        for (n=0; n < cnt_jobs; n++) {
            cnt = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &job_info[n], &cnt, ORTE_JOB))) {
                ORTE_ERROR_LOG(ret);
                /* give back the jobs that were already unpacked so they
                 * do not leak along with the array
                 */
                for (m=0; m < n; m++) {
                    if (NULL != job_info[m]) {
                        OBJ_RELEASE(job_info[m]);
                    }
                }
                free(job_info);
                OBJ_DESTRUCT(&answer);
                return ret;
            }
        }
        *job_info_array = job_info;
        *num_jobs = cnt_jobs;
    }
    OBJ_DESTRUCT(&answer);

    return ORTE_SUCCESS;
}
|
|
|
|
|
2008-04-30 23:49:53 +04:00
|
|
|
/*
 * Query the HNP for node information.
 *
 * An ORTE_DAEMON_REPORT_NODE_INFO_CMD carrying the string "node" is sent
 * to "hnp" on ORTE_RML_TAG_DAEMON; the reply is then awaited on
 * ORTE_RML_TAG_TOOL. Both the send and the wait for the reply are
 * guarded by a timeout.
 *
 * hnp              process the query is sent to
 * node             string packed into the query
 * num_nodes        out: number of entries in *node_info_array; 0 unless
 *                  the call succeeds with at least one node
 * node_info_array  out: malloc'd array of unpacked orte_node_t pointers,
 *                  or NULL; this function keeps no reference to it
 *
 * Returns ORTE_SUCCESS, the error code of the failing
 * pack/send/recv/unpack call, ORTE_ERR_OUT_OF_RESOURCE if the result
 * array cannot be allocated, or ORTE_ERR_SILENT if a timeout expired.
 */
int orte_util_comm_query_node_info(const orte_process_name_t *hnp, char *node,
                                   int *num_nodes, orte_node_t ***node_info_array)
{
    int ret;
    int32_t cnt, cnt_nodes, n, m;
    opal_buffer_t *cmd;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_NODE_INFO_CMD;
    orte_node_t **node_info;

    /* set default response */
    *num_nodes = 0;
    *node_info_array = NULL;

    /* query the HNP for node info */
    cmd = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &node, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* define a max time to wait for send to complete */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

    /* do the send */
    if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
                                           send_cbfunc, NULL))) {
        /* send_cbfunc will never run, so cancel the timer here -
         * otherwise it stays armed and quicktime_cb would later disturb
         * an unrelated wait
         */
        if (NULL != quicktime) {
            opal_event.evtimer_del(quicktime);
            OBJ_RELEASE(quicktime);
        }
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* wait for send to complete (or for the timer to fire) */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    /* release the buffer */
    OBJ_RELEASE(cmd);

    /* did it succeed? */
    if (ORTE_SUCCESS != error_exit) {
        return error_exit;
    }

    /* define a max time to wait for an answer */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);

    /* get the answer */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);
    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                       ORTE_RML_TAG_TOOL,
                                                       ORTE_RML_NON_PERSISTENT,
                                                       recv_info,
                                                       NULL))) {
        /* cancel the timer */
        if (NULL != quicktime) {
            opal_event.evtimer_del(quicktime);
            OBJ_RELEASE(quicktime);
        }
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* spin until recv_info or quicktime_cb sets the flag */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    if (ORTE_SUCCESS != error_exit) {
        /* NOTE(review): the receive posted above is presumably still
         * pending here; a late reply would reach recv_info after
         * "answer" has been destructed - confirm whether it needs to be
         * cancelled.
         */
        OBJ_DESTRUCT(&answer);
        return error_exit;
    }

    /* the answer starts with the number of nodes that follow */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_nodes, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* allocate the required memory */
    if (0 < cnt_nodes) {
        node_info = (orte_node_t**)calloc(cnt_nodes, sizeof(orte_node_t*));
        if (NULL == node_info) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_DESTRUCT(&answer);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* unpack the node data */
        for (n=0; n < cnt_nodes; n++) {
            cnt = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &node_info[n], &cnt, ORTE_NODE))) {
                ORTE_ERROR_LOG(ret);
                /* give back the nodes that were already unpacked so they
                 * do not leak along with the array
                 */
                for (m=0; m < n; m++) {
                    if (NULL != node_info[m]) {
                        OBJ_RELEASE(node_info[m]);
                    }
                }
                free(node_info);
                OBJ_DESTRUCT(&answer);
                return ret;
            }
        }
        *node_info_array = node_info;
        *num_nodes = cnt_nodes;
    }
    OBJ_DESTRUCT(&answer);

    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
/**
 * Ask an HNP for information on the processes of a job.
 *
 * Packs an ORTE_DAEMON_REPORT_PROC_INFO_CMD command followed by the jobid
 * and vpid, sends it to the HNP on ORTE_RML_TAG_DAEMON, then posts a
 * non-blocking receive on ORTE_RML_TAG_TOOL and unpacks the reply from
 * `answer`: an int32 count followed by that many ORTE_PROC entries.
 *
 * Both the send and the wait for the reply are bounded by a timer armed with
 * ORTE_DETECT_TIMEOUT. The variables timer_fired, error_exit, quicktime and
 * answer are not declared in this function; they are shared state defined
 * elsewhere in this file (presumably updated by quicktime_cb, send_cbfunc and
 * recv_info -- confirm against those callbacks).
 *
 * @param hnp              name of the HNP to query
 * @param job              jobid packed into the request
 * @param vpid             vpid packed into the request
 * @param num_procs        [out] number of entries returned; set to 0 on entry
 * @param proc_info_array  [out] malloc'd array of *num_procs orte_proc_t
 *                         pointers, handed to the caller; set to NULL on
 *                         entry and left NULL when no procs are reported
 *
 * @return ORTE_SUCCESS on success; otherwise the error code of the failing
 *         pack/send/recv/unpack call, the value of error_exit when one of
 *         the waits ended with it set, or ORTE_ERR_OUT_OF_RESOURCE when the
 *         result array cannot be allocated.
 */
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job, orte_vpid_t vpid,
                                   int *num_procs, orte_proc_t ***proc_info_array)
{
    int ret;
    int32_t cnt, cnt_procs, n;
    opal_buffer_t *cmd;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
    orte_proc_t **proc_info;

    /* set default response */
    *num_procs = 0;
    *proc_info_array = NULL;

    /* query the HNP for info on the procs in this job */
    cmd = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* define a max time to wait for send to complete */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

    /* do the send */
    if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON, 0,
                                           send_cbfunc, NULL))) {
        /* cancel the timer so it cannot fire after we have returned; this
         * mirrors the handling of a failed receive further down
         */
        if (NULL != quicktime) {
            opal_event.evtimer_del(quicktime);
            OBJ_RELEASE(quicktime);
        }
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* wait for send to complete */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    /* release the buffer */
    OBJ_RELEASE(cmd);

    /* did it succeed? */
    if (ORTE_SUCCESS != error_exit) {
        return error_exit;
    }

    /* define a max time to wait for an answer */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);

    /* get the answer */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);
    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                       ORTE_RML_TAG_TOOL,
                                                       ORTE_RML_NON_PERSISTENT,
                                                       recv_info,
                                                       NULL))) {
        /* cancel the timer */
        if (NULL != quicktime) {
            opal_event.evtimer_del(quicktime);
            OBJ_RELEASE(quicktime);
        }
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    if (ORTE_SUCCESS != error_exit) {
        OBJ_DESTRUCT(&answer);
        return error_exit;
    }

    /* first item in the answer is the number of procs that follow */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* allocate the required memory */
    if (0 < cnt_procs) {
        proc_info = malloc((size_t)cnt_procs * sizeof *proc_info);
        if (NULL == proc_info) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            OBJ_DESTRUCT(&answer);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* unpack the procs */
        for (n = 0; n < cnt_procs; n++) {
            cnt = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) {
                ORTE_ERROR_LOG(ret);
                /* entries [0, n) came from successful unpacks; release them
                 * instead of leaking them along with the array. Entry n is
                 * left alone because its content is unspecified after a
                 * failed unpack.
                 */
                while (0 < n) {
                    --n;
                    if (NULL != proc_info[n]) {
                        OBJ_RELEASE(proc_info[n]);
                    }
                }
                free(proc_info);
                OBJ_DESTRUCT(&answer);
                return ret;
            }
        }
        *proc_info_array = proc_info;
        *num_procs = (int)cnt_procs;
    }
    OBJ_DESTRUCT(&answer);

    return ORTE_SUCCESS;
}
|
|
|
|
|
|
|
|
/* The spawn function cannot just call the plm.proxy since that won't
|
|
|
|
* necessarily be open. Likewise, we can't just send the launch request
|
|
|
|
* to the HNP's plm_receive as that function would return the response
|
|
|
|
* to the plm_proxy tag! So we have to go another route to get this
|
|
|
|
* request processed
|
|
|
|
*/
|
|
|
|
int orte_util_comm_spawn_job(const orte_process_name_t *hnp, orte_job_t *jdata)
|
|
|
|
{
|
|
|
|
opal_buffer_t buf;
|
|
|
|
orte_daemon_cmd_flag_t command;
|
|
|
|
orte_std_cntr_t count;
|
|
|
|
int rc;
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
|
2008-02-28 04:57:57 +03:00
|
|
|
"%s util_comm_spawn_job: requesting HNP %s spawn new job",
|
2009-03-06 00:50:47 +03:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(hnp)));
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
/* setup the buffer */
|
|
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
|
|
|
|
|
|
|
/* tell the HNP we are sending a launch request */
|
|
|
|
command = ORTE_DAEMON_SPAWN_JOB_CMD;
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pack the jdata object */
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata, 1, ORTE_JOB))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
|
2008-02-28 04:57:57 +03:00
|
|
|
"%s util_comm_spawn_job: sending spawn cmd to HNP %s",
|
2009-03-06 00:50:47 +03:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(hnp)));
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
/* tell the target HNP to launch the job */
|
|
|
|
if (0 > (rc = orte_rml.send_buffer((orte_process_name_t*)hnp, &buf, ORTE_RML_TAG_DAEMON, 0))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&buf);
|
|
|
|
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
|
2008-02-28 04:57:57 +03:00
|
|
|
"%s util_comm_spawn_job: waiting for response",
|
2009-03-06 00:50:47 +03:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
2008-02-28 04:57:57 +03:00
|
|
|
|
|
|
|
/* wait for the target's response */
|
|
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
|
|
|
if (0 > (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_TOOL, 0))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the new jobid back in case the caller wants it */
|
|
|
|
count = 1;
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(jdata->jobid), &count, ORTE_JOBID))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
if (ORTE_JOBID_INVALID == jdata->jobid) {
|
|
|
|
/* something went wrong on far end - go no further */
|
|
|
|
rc = ORTE_ERR_FAILED_TO_START;
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* good to go! */
|
|
|
|
|
|
|
|
CLEANUP:
|
|
|
|
OBJ_DESTRUCT(&buf);
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Ask an HNP to terminate a job and wait for its status reply.
 *
 * Sends ORTE_DAEMON_TERMINATE_JOB_CMD followed by the jobid to the HNP on
 * ORTE_RML_TAG_DAEMON, then blocks on ORTE_RML_TAG_TOOL for the reply and
 * unpacks a single OPAL_INT status code from it.
 *
 * @param hnp  name of the HNP that should terminate the job
 * @param job  jobid of the job to terminate
 *
 * @return the status code unpacked from the HNP's reply when the whole
 *         exchange succeeds; otherwise the error code of the failing
 *         pack/send/recv/unpack call.
 */
int orte_util_comm_terminate_job(const orte_process_name_t *hnp, orte_jobid_t job)
{
    opal_buffer_t buf;
    orte_daemon_cmd_flag_t command;
    orte_std_cntr_t count;
    int rc, ret = ORTE_ERROR;

    OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                         "%s util_comm_terminate_job: requesting HNP %s terminate job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(hnp),
                         ORTE_JOBID_PRINT(job)));

    /* setup the buffer */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* tell the HNP we are sending a terminate request */
    command = ORTE_DAEMON_TERMINATE_JOB_CMD;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        ret = rc;
        goto CLEANUP;
    }

    /* pack the jobid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        ret = rc;
        goto CLEANUP;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                         "%s util_comm_terminate_job: sending terminate cmd to HNP %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(hnp)));

    /* tell the target HNP to terminate the job */
    if (0 > (rc = orte_rml.send_buffer((orte_process_name_t*)hnp, &buf, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
        ret = rc;
        goto CLEANUP;
    }
    /* request is out; tear the buffer down so it can be rebuilt for the reply */
    OBJ_DESTRUCT(&buf);

    OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                         "%s util_comm_terminate_job: waiting for response",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* wait for the target's response */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (0 > (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_TOOL, 0))) {
        ORTE_ERROR_LOG(rc);
        ret = rc;
        goto CLEANUP;
    }

    /* get the status code; it becomes our return value */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &ret, &count, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        ret = rc;
        goto CLEANUP;
    }

CLEANUP:
    /* buf is in the constructed state on every path that reaches here */
    OBJ_DESTRUCT(&buf);

    return ret;
}
|
|
|
|
|
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 20:58:59 +03:00
|
|
|
int orte_util_comm_halt_vm(const orte_process_name_t *hnp)
|
|
|
|
{
|
|
|
|
opal_buffer_t buf;
|
|
|
|
orte_daemon_cmd_flag_t command;
|
|
|
|
int rc;
|
|
|
|
|
2008-06-09 18:53:58 +04:00
|
|
|
OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
|
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 20:58:59 +03:00
|
|
|
"%s util_comm_halt_vm: ordering HNP %s terminate",
|
2009-03-06 00:50:47 +03:00
|
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
|
|
ORTE_NAME_PRINT(hnp)));
|
Afraid this has a couple of things mixed into the commit. Couldn't be helped - had missed one commit prior to running out the door on vacation.
Fix race conditions in abnormal terminations. We had done a first-cut at this in a prior commit. However, the window remained partially open due to the fact that the HNP has multiple paths leading to orte_finalize. Most of our frameworks don't care if they are finalized more than once, but one of them does, which meant we segfaulted if orte_finalize got called more than once. Besides, we really shouldn't be doing that anyway.
So we now introduce a set of atomic locks that prevent us from multiply calling abort, attempting to call orte_finalize, etc. My initial tests indicate this is working cleanly, but since it is a race condition issue, more testing will have to be done before we know for sure that this problem has been licked.
Also, some updates relevant to the tool comm library snuck in here. Since those also touched the orted code (as did the prior changes), I didn't want to attempt to separate them out - besides, they are coming in soon anyway. More on them later as that functionality approaches completion.
This commit was SVN r17843.
2008-03-17 20:58:59 +03:00
|
|
|
|
|
|
|
/* setup the buffer */
|
|
|
|
OBJ_CONSTRUCT(&buf, opal_buffer_t);
|
|
|
|
|
|
|
|
/* tell the HNP to die */
|
|
|
|
command = ORTE_DAEMON_HALT_VM_CMD;
|
|
|
|
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* send the order */
|
|
|
|
if (0 > (rc = orte_rml.send_buffer((orte_process_name_t*)hnp, &buf, ORTE_RML_TAG_DAEMON, 0))) {
|
|
|
|
ORTE_ERROR_LOG(rc);
|
|
|
|
goto CLEANUP;
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT(&buf);
|
|
|
|
|
|
|
|
/* don't bother waiting around */
|
|
|
|
CLEANUP:
|
|
|
|
OBJ_DESTRUCT(&buf);
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|