1
1

Update reuse_daemons to find the daemons again. This requires that orteds now report their nodenames (probably a temporary patch, pending the upcoming minor revision of orted).

This commit was SVN r14533.
Этот коммит содержится в:
Ralph Castain 2007-04-26 15:09:54 +00:00
родитель c733a7916b
Коммит 7d6d0a1c00
2 изменённых файлов: 77 добавлений и 41 удалений

Просмотреть файл

@ -27,7 +27,7 @@
#include "orte/dss/dss.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/gpr/gpr_types.h"
#include "orte/mca/gpr/gpr.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ns/ns_types.h"
@ -35,49 +35,61 @@
int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
{
opal_list_t avail_daemons;
opal_list_item_t *item, *item2, *next;
orte_pls_daemon_info_t *dmn, *newdmn;
orte_gpr_value_t **values; /* the gpr initializes this to NULL */
orte_gpr_keyval_t *kv;
orte_std_cntr_t cnt, i;
char *keys[] = {
ORTE_NODE_NAME_KEY,
NULL
};
opal_list_item_t *item2, *next;
orte_mapped_node_t *node;
opal_list_t used_daemons;
orte_gpr_notify_data_t *ndat;
bool found;
char *nodename;
int rc;
OBJ_CONSTRUCT(&avail_daemons, opal_list_t);
OBJ_CONSTRUCT(&used_daemons, opal_list_t);
/* check for available daemons we could use */
if (ORTE_SUCCESS != (rc = orte_pls_base_check_avail_daemons(&avail_daemons, map->job))) {
/* query the daemon info */
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
"orte-job-0", /* the daemon job segment */
NULL, /* all containers */
keys,
&cnt, &values))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* go through the list, checking nodenames against what is in the
/* if no daemons are around (except HNP), then don't worry about this */
if (cnt <= 1) {
rc = ORTE_SUCCESS;
goto CLEANUP;
}
/* get here if some daemons, other than HNP, exist
* go through the list, checking nodenames against what is in the
* map. If nodes match, then construct and send an appropriate command
* to that daemon to launch the local procs - remove that node structure
* from the map so that the main launcher doesn't also try to start procs
* on that node!
*/
found = false;
while (NULL != (item = opal_list_remove_first(&avail_daemons))) {
dmn = (orte_pls_daemon_info_t*)item;
item2 = opal_list_get_first(&map->nodes);
while (item2 != opal_list_get_end(&map->nodes)) {
node = (orte_mapped_node_t*)item2;
item2 = opal_list_get_first(&map->nodes);
while (item2 != opal_list_get_end(&map->nodes)) {
node = (orte_mapped_node_t*)item2;
/* save the next position in case we remove this one */
next = opal_list_get_next(item2);
/* check the returned values and see if the nodenames match */
for (i=0; i < cnt; i++) {
kv = values[i]->keyvals[0];
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nodename, kv->value, ORTE_STRING))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
/* save the next position in case we remove this one */
next = opal_list_get_next(item2);
if (0 == strcmp(node->nodename, dmn->nodename)) {
newdmn = OBJ_NEW(orte_pls_daemon_info_t);
newdmn->cell = dmn->cell;
newdmn->nodename = strdup(dmn->nodename);
newdmn->active_job = map->job;
orte_dss.copy((void**)&(newdmn->name), dmn->name, ORTE_NAME);
opal_list_append(&used_daemons, &newdmn->super);
if (0 == strcmp(node->nodename, nodename)) {
/* get the launch message only once - do it the first time
* through so all the nodes are still on the map!
*/
@ -86,7 +98,8 @@ int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(ndat);
return rc;
}
}
/* indicate that at least one daemon was found */
found = true;
}
/* procs on this node will be taken care of, so remove it from
@ -95,31 +108,29 @@ int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
opal_list_remove_item(&map->nodes, item2);
OBJ_RELEASE(item2);
}
/* move to next position */
item2 = next;
}
/* move to next position */
item2 = next;
}
if (!found) {
/* if no daemons were reused, then just return */
OBJ_DESTRUCT(&used_daemons);
return ORTE_SUCCESS;
rc = ORTE_SUCCESS;
goto CLEANUP;
}
/* store the bootproxy records */
orte_pls_base_store_active_daemons(&used_daemons);
/* launch any procs that are using existing daemons */
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_add_local_procs(ndat))) {
ORTE_ERROR_LOG(rc);
return rc;
}
OBJ_RELEASE(ndat);
/* cleanup */
while (NULL != (item = opal_list_remove_first(&used_daemons))) OBJ_RELEASE(item);
OBJ_DESTRUCT(&used_daemons);
CLEANUP:
for (i=0; i < cnt; i++) {
if (NULL != values[i]) OBJ_RELEASE(values[i]);
}
if (NULL != values) free(values);
return ORTE_SUCCESS;
return rc;
}

Просмотреть файл

@ -508,6 +508,31 @@ int main(int argc, char *argv[])
return ret;
}
/* THIS IS A TEMPORARY PATCH - REPORT NODE AND PROC NAME FOR THIS DAEMON */
if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND,
"orte-job-0", 2, 0))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens), ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_NODE_NAME_KEY, ORTE_STRING, orte_system_info.nodename))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_PROC_NAME_KEY, ORTE_NAME, ORTE_PROC_MY_NAME))) {
ORTE_ERROR_LOG(ret);
return ret;
}
if (ORTE_SUCCESS != (ret = orte_gpr.put(1, &value))) {
ORTE_ERROR_LOG(ret);
return ret;
}
OBJ_RELEASE(value);
/* get the job segment name */
if (ORTE_SUCCESS != (ret = orte_schema.get_job_segment_name(&segment, orted_globals.bootproxy))) {
ORTE_ERROR_LOG(ret);