When we get a report from an orted about its state, don't use the sender of
the message to update the structures; instead, use the daemon name parsed from the URI contained in the report. The reason is that even launch report messages can be routed, so the sender may be a relay rather than the daemon that originated the report. Also deal with the orted_cmd_line in a single location. This commit was SVN r21513.
Этот коммит содержится в:
родитель
84a953a2a6
Коммит
8cb8f28d9d
@ -433,7 +433,8 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
|||||||
{
|
{
|
||||||
orte_message_event_t *mev = (orte_message_event_t*)data;
|
orte_message_event_t *mev = (orte_message_event_t*)data;
|
||||||
opal_buffer_t *buffer = mev->buffer;
|
opal_buffer_t *buffer = mev->buffer;
|
||||||
char *rml_uri;
|
orte_process_name_t peer;
|
||||||
|
char *rml_uri = NULL;
|
||||||
int rc, idx;
|
int rc, idx;
|
||||||
int32_t arch;
|
int32_t arch;
|
||||||
orte_node_t **nodes;
|
orte_node_t **nodes;
|
||||||
@ -442,19 +443,11 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
|||||||
int64_t setupsec, setupusec;
|
int64_t setupsec, setupusec;
|
||||||
int64_t startsec, startusec;
|
int64_t startsec, startusec;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
|
||||||
"%s plm:base:orted_report_launch from daemon %s",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&mev->sender)));
|
|
||||||
|
|
||||||
/* see if we need to timestamp this receipt */
|
/* see if we need to timestamp this receipt */
|
||||||
if (orte_timing) {
|
if (orte_timing) {
|
||||||
gettimeofday(&recvtime, NULL);
|
gettimeofday(&recvtime, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update state */
|
|
||||||
pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
|
||||||
|
|
||||||
/* unpack its contact info */
|
/* unpack its contact info */
|
||||||
idx = 1;
|
idx = 1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, OPAL_STRING))) {
|
||||||
@ -466,13 +459,26 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
|||||||
/* set the contact info into the hash table */
|
/* set the contact info into the hash table */
|
||||||
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
|
if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
free(rml_uri);
|
|
||||||
orted_failed_launch = true;
|
orted_failed_launch = true;
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
/* lookup and record this daemon's contact info */
|
|
||||||
pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri);
|
rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL );
|
||||||
free(rml_uri);
|
if( ORTE_SUCCESS != rc ) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
orted_failed_launch = true;
|
||||||
|
goto CLEANUP;
|
||||||
|
}
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
|
"%s plm:base:orted_report_launch from daemon %s via %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(&peer),
|
||||||
|
ORTE_NAME_PRINT(&mev->sender)));
|
||||||
|
|
||||||
|
/* update state and record for this daemon contact info */
|
||||||
|
pdatorted[peer.vpid]->state = ORTE_PROC_STATE_RUNNING;
|
||||||
|
pdatorted[peer.vpid]->rml_uri = rml_uri;
|
||||||
|
|
||||||
/* get the remote arch */
|
/* get the remote arch */
|
||||||
idx = 1;
|
idx = 1;
|
||||||
@ -555,31 +561,33 @@ static void process_orted_launch_report(int fd, short event, void *data)
|
|||||||
|
|
||||||
/* lookup the node */
|
/* lookup the node */
|
||||||
nodes = (orte_node_t**)orte_node_pool->addr;
|
nodes = (orte_node_t**)orte_node_pool->addr;
|
||||||
if (NULL == nodes[mev->sender.vpid]) {
|
if (NULL == nodes[peer.vpid]) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
orted_failed_launch = true;
|
orted_failed_launch = true;
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
/* store the arch */
|
/* store the arch */
|
||||||
nodes[mev->sender.vpid]->arch = arch;
|
nodes[peer.vpid]->arch = arch;
|
||||||
|
|
||||||
/* if a tree-launch is underway, send the cmd back */
|
/* if a tree-launch is underway, send the cmd back */
|
||||||
if (NULL != orte_tree_launch_cmd) {
|
if (NULL != orte_tree_launch_cmd) {
|
||||||
orte_rml.send_buffer(&mev->sender, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0);
|
orte_rml.send_buffer(&peer, orte_tree_launch_cmd, ORTE_RML_TAG_DAEMON, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
CLEANUP:
|
CLEANUP:
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:orted_report_launch %s for daemon %s at contact %s",
|
"%s plm:base:orted_report_launch %s for daemon %s (via %s) at contact %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
orted_failed_launch ? "failed" : "completed",
|
orted_failed_launch ? "failed" : "completed",
|
||||||
ORTE_NAME_PRINT(&mev->sender), pdatorted[mev->sender.vpid]->rml_uri));
|
ORTE_NAME_PRINT(&peer),
|
||||||
|
ORTE_NAME_PRINT(&mev->sender), pdatorted[peer.vpid]->rml_uri));
|
||||||
|
|
||||||
/* release the message */
|
/* release the message */
|
||||||
OBJ_RELEASE(mev);
|
OBJ_RELEASE(mev);
|
||||||
|
|
||||||
if (orted_failed_launch) {
|
if (orted_failed_launch) {
|
||||||
|
if( NULL != rml_uri ) free(rml_uri);
|
||||||
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
} else {
|
} else {
|
||||||
orted_num_callback++;
|
orted_num_callback++;
|
||||||
@ -1133,18 +1141,23 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
|
|||||||
* being sure to "purge" any that would cause problems
|
* being sure to "purge" any that would cause problems
|
||||||
* on backend nodes
|
* on backend nodes
|
||||||
*/
|
*/
|
||||||
if (ORTE_PROC_IS_HNP) {
|
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||||
cnt = opal_argv_count(orted_cmd_line);
|
cnt = opal_argv_count(orted_cmd_line);
|
||||||
for (i=0; i < cnt; i+=3) {
|
for (i=0; i < cnt; i+=3) {
|
||||||
/* if the specified option is more than one word, we don't
|
/* in the rsh environment, we can append multi-word arguments
|
||||||
* have a generic way of passing it as some environments ignore
|
* by enclosing them in quotes. Check for any multi-word
|
||||||
* any quotes we add, while others don't - so we ignore any
|
* mca params passed to mpirun and include them
|
||||||
* such options. In most cases, this won't be a problem as
|
*/
|
||||||
* they typically only apply to things of interest to the HNP.
|
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
|
||||||
* Individual environments can add these back into the cmd line
|
char* param;
|
||||||
* as they know if it can be supported
|
|
||||||
*/
|
/* must add quotes around it */
|
||||||
if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
|
asprintf(&param, "\"%s\"", orted_cmd_line[i+2]);
|
||||||
|
/* now pass it along */
|
||||||
|
opal_argv_append(argc, argv, orted_cmd_line[i]);
|
||||||
|
opal_argv_append(argc, argv, orted_cmd_line[i+1]);
|
||||||
|
opal_argv_append(argc, argv, param);
|
||||||
|
free(param);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* The daemon will attempt to open the PLM on the remote
|
/* The daemon will attempt to open the PLM on the remote
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user