1
1

Turn off tree spawn as it doesn't currently work - will fix shortly. Add topology collection.

This commit was SVN r25472.
Этот коммит содержится в:
Ralph Castain 2011-11-11 23:42:36 +00:00
родитель d008aeb531
Коммит 6b5e1b89cf
2 изменённых файла: 134 добавления и 5 удалений

Просмотреть файл

@ -150,11 +150,14 @@ int orte_plm_rsh_component_open(void)
"Delay (in seconds) between invocations of the remote agent, but only used when the \"debug\" MCA parameter is true, or the top-level MCA debugging is enabled (otherwise this value is ignored)",
false, false, 1,
&mca_plm_rsh_component.delay);
#if 0
/* NEEDS TO BE FIXED */
mca_base_param_reg_int(c, "tree_spawn",
"If set to 1, launch via a tree-based topology",
false, false, (int)false, &tmp);
mca_plm_rsh_component.tree_spawn = OPAL_INT_TO_BOOL(tmp);
#endif
mca_plm_rsh_component.tree_spawn = false;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -63,7 +63,6 @@
#include "opal/mca/installdirs/installdirs.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "opal/util/opal_sos.h"
#include "opal/mca/event/event.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
@ -975,6 +974,9 @@ cleanup:
/*
 * File-scope state used by the daemon launch callback
 * plm_rsh_report_orted_launch().
 */
/* NOTE(review): presumably counts daemon callbacks received; its uses are
 * not visible in this excerpt - confirm against the full file. */
static int orted_num_callback = 0;
/* Set to true by the callback on entry and again on every unpack failure;
 * reset to false by orte_plm_rsh_launch() before the receive is posted. */
static bool orted_failed_launch = false;
/* NOTE(review): job object consulted by the callback; where it is assigned
 * is not visible in this excerpt - confirm against the full file. */
static orte_job_t *jdatorted;
/*
 * Timing aggregates, updated by the callback only when orte_timing is set:
 *   daemonlaunchtime - latest start time stamp reported by any daemon
 *   daemonsetuptime  - longest setup time reported by any daemon
 *   daemoncbtime     - longest interval between a daemon's send time stamp
 *                      and the callback's receive time
 */
static struct timeval daemonlaunchtime = {0,0}, daemonsetuptime = {0,0}, daemoncbtime = {0,0};
static void
plm_rsh_report_orted_launch(int status, orte_process_name_t* sender,
opal_buffer_t *buffer,
@ -984,7 +986,11 @@ plm_rsh_report_orted_launch(int status, orte_process_name_t* sender,
char *rml_uri = NULL;
int rc, idx;
orte_proc_t *daemon=NULL;
orte_job_t *jdatorted;
struct timeval recvtime;
long secs, usecs;
int64_t setupsec, setupusec;
int64_t startsec, startusec;
char *nodename;
orted_failed_launch = true;
/* unpack its contact info */
@ -1032,9 +1038,129 @@ plm_rsh_report_orted_launch(int status, orte_process_name_t* sender,
/* if we are doing a timing test, unload the start and setup times of the daemon */
if (orte_timing) {
/* Deal with the timing if this information is considered useful */
/* get the time stamp when the daemon first started */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startsec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &startusec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* save the latest daemon to start */
if (startsec > daemonlaunchtime.tv_sec) {
daemonlaunchtime.tv_sec = startsec;
daemonlaunchtime.tv_usec = startusec;
} else if (startsec == daemonlaunchtime.tv_sec &&
startusec > daemonlaunchtime.tv_usec) {
daemonlaunchtime.tv_usec = startusec;
}
/* get the time required for the daemon to setup - locally computed by each daemon */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* save the longest */
if (setupsec > daemonsetuptime.tv_sec) {
daemonsetuptime.tv_sec = setupsec;
daemonsetuptime.tv_usec = setupusec;
} else if (setupsec == daemonsetuptime.tv_sec &&
setupusec > daemonsetuptime.tv_usec) {
daemonsetuptime.tv_usec = setupusec;
}
/* get the time stamp of when the daemon started to send this message to us */
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupsec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &setupusec, &idx, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
/* check the time for the callback to complete and save the longest */
ORTE_COMPUTE_TIME_DIFF(secs, usecs, setupsec, setupusec, recvtime.tv_sec, recvtime.tv_usec);
if (secs > daemoncbtime.tv_sec) {
daemoncbtime.tv_sec = secs;
daemoncbtime.tv_usec = usecs;
} else if (secs == daemoncbtime.tv_sec &&
usecs > daemoncbtime.tv_usec) {
daemoncbtime.tv_usec = usecs;
}
}
/* unpack the node name - we don't need it here, but it is included
* in the message for other uses
*/
idx = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
orted_failed_launch = true;
goto CLEANUP;
}
#if OPAL_HAVE_HWLOC
/* store the local resources for that node */
{
hwloc_topology_t topo, t;
orte_node_t *node;
int i;
bool found;
idx=1;
node = daemon->node;
if (OPAL_SUCCESS == opal_dss.unpack(buffer, &topo, &idx, OPAL_HWLOC_TOPO)) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s RECEIVED TOPOLOGY FROM NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodename));
if (10 < opal_output_get_verbosity(orte_plm_globals.output)) {
opal_dss.dump(0, topo, OPAL_HWLOC_TOPO);
}
/* do we already have this topology from some other node? */
found = false;
for (i=0; i < orte_node_topologies->size; i++) {
if (NULL == (t = (hwloc_topology_t)opal_pointer_array_get_item(orte_node_topologies, i))) {
continue;
}
if (OPAL_EQUAL == opal_dss.compare(topo, t, OPAL_HWLOC_TOPO)) {
/* yes - just point to it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s TOPOLOGY MATCHES - DISCARDING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
found = true;
node->topology = t;
hwloc_topology_destroy(topo);
break;
}
}
if (!found) {
/* nope - add it */
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s NEW TOPOLOGY - ADDING",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
opal_pointer_array_add(orte_node_topologies, topo);
node->topology = topo;
}
}
}
#endif
/* if a tree-launch is underway, send the cmd back */
if (NULL != orte_tree_launch_cmd) {
orte_rml.send_buffer(&peer, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0);
@ -1272,7 +1398,7 @@ int orte_plm_rsh_launch(orte_job_t *jdata)
orted_failed_launch = false;
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
ORTE_RML_PERSISTENT, plm_rsh_report_orted_launch, NULL);
if (rc != ORTE_SUCCESS && OPAL_SOS_GET_ERROR_CODE(rc) != ORTE_ERR_NOT_IMPLEMENTED) {
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
return rc;
}