1
1

Ensure daemons know contact info for all other daemons. Update binomial xcast to work in revised design. Add debug output to orted so the daemon lets us know it launched (if --debug-daemons set) early on in case it fails during orte_init

This commit was SVN r15555.
Этот коммит содержится в:
Ralph Castain 2007-07-23 15:00:39 +00:00
родитель 6c800d452d
Коммит ef141d1fbc
3 изменённых файлов: 60 добавлений и 53 удалений

Просмотреть файл

@ -339,63 +339,24 @@ static int xcast_binomial_tree(orte_jobid_t job,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)buf->bytes_used);
}
/* start setting up the target recipients */
/* all we need to do is send this to ourselves - our relay logic
* will ensure everyone else gets it!
*/
target.jobid = 0;
target.vpid = 0;
++orte_grpcomm_basic.num_active;
/* compute the bitmap */
bitmap = opal_cube_dim((int)num_daemons);
rank = 0;
size = (int)num_daemons;
hibit = opal_hibit(rank, bitmap);
--bitmap;
/* we have to account for all of the messages we are about to send
* because the non-blocking send can come back almost immediately - before
* we would get the chance to increment the num_active. This causes us
* to not correctly wakeup and reset the xcast_in_progress flag
*/
OPAL_THREAD_LOCK(&orte_grpcomm_basic.mutex);
/* compute the number of sends we are going to do - it would be nice
* to have a simple algo to do this, but for now just brute force
* is fine
*/
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
peer = rank | mask;
if (peer < size) {
++orte_grpcomm_basic.num_active;
}
}
if (orte_grpcomm_basic.num_active == 0) {
opal_output(orte_grpcomm_basic.output, "%s xcast to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&target));
if (0 > (rc = orte_rml.send_buffer_nb(&target, buf, ORTE_RML_TAG_ORTED_ROUTED,
0, xcast_send_cb, NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
rc = ORTE_ERR_COMM_FAILURE;
OPAL_THREAD_LOCK(&orte_grpcomm_basic.mutex);
--orte_grpcomm_basic.num_active;
OPAL_THREAD_UNLOCK(&orte_grpcomm_basic.mutex);
rc = ORTE_SUCCESS;
goto CLEANUP;
}
OPAL_THREAD_UNLOCK(&orte_grpcomm_basic.mutex);
target.jobid = 0;
for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
peer = rank | mask;
if (peer < size) {
target.vpid = (orte_vpid_t)peer;
opal_output(orte_grpcomm_basic.output, "%s xcast to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&target));
if (0 > (rc = orte_rml.send_buffer_nb(&target, buf, ORTE_RML_TAG_ORTED_ROUTED,
0, xcast_send_cb, NULL))) {
if (ORTE_ERR_ADDRESSEE_UNKNOWN != rc) {
ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
rc = ORTE_ERR_COMM_FAILURE;
OPAL_THREAD_LOCK(&orte_grpcomm_basic.mutex);
orte_grpcomm_basic.num_active -= (num_daemons-i);
OPAL_THREAD_UNLOCK(&orte_grpcomm_basic.mutex);
goto CLEANUP;
}
/* decrement the number we are waiting to see */
OPAL_THREAD_LOCK(&orte_grpcomm_basic.mutex);
orte_grpcomm_basic.num_active--;
OPAL_THREAD_UNLOCK(&orte_grpcomm_basic.mutex);
}
}
}
CLEANUP:
OBJ_RELEASE(buf);

Просмотреть файл

@ -27,6 +27,7 @@
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/odls/odls.h"
@ -78,6 +79,9 @@ int orte_pls_base_daemon_callback(orte_std_cntr_t num_daemons)
orte_process_name_t name;
int src[4];
int rc, idx;
orte_buffer_t *buf;
orte_gpr_notify_data_t *data=NULL;
orte_rml_cmd_flag_t command;
for(i = 0; i < num_daemons; i++) {
OBJ_CONSTRUCT(&ack, orte_buffer_t);
@ -145,5 +149,38 @@ int orte_pls_base_daemon_callback(orte_std_cntr_t num_daemons)
OBJ_DESTRUCT(&handoff); /* done with this */
}
return ORTE_SUCCESS;
/* all done launching - update everyone's contact info so all daemons
* can talk to each other
*/
name.jobid = 0;
name.vpid = ORTE_VPID_WILDCARD;
orte_rml_base_get_contact_info(&name, &data);
if (NULL != data) {
buf = OBJ_NEW(orte_buffer_t);
/* pack the update-RML command */
command = ORTE_RML_UPDATE_CMD;
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &command, 1, ORTE_RML_CMD))) {
ORTE_ERROR_LOG(rc);
}
/* pack the data for xmission */
if (ORTE_SUCCESS != (rc = orte_dss.pack(buf, &data, 1, ORTE_GPR_NOTIFY_DATA))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(data);
return rc;
}
/* now send it */
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(0, buf, ORTE_RML_TAG_RML_INFO_UPDATE))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buf);
OBJ_RELEASE(data);
return rc;
}
/* done with the buffer */
OBJ_RELEASE(buf);
/* cleanup the data */
OBJ_RELEASE(data);
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -222,6 +222,7 @@ int orte_daemon(int argc, char *argv[])
int i;
orte_buffer_t *buffer;
int zero = 0;
char hostname[100];
/* initialize the globals */
memset(&orted_globals, 0, sizeof(orted_globals));
@ -273,6 +274,14 @@ int orte_daemon(int argc, char *argv[])
return ret;
}
/* if orte_daemon_debug is set, let someone know we are alive right
* away just in case we have a problem along the way
*/
if (orte_debug_daemons_flag) {
gethostname(hostname, 100);
fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
}
/* check for help request */
if (orted_globals.help) {
char *args = NULL;