Update reuse_daemons to find the daemons again - requires that orteds now report their nodenames (probably temporary patch pending upcoming minor revision of orted)
This commit was SVN r14533.
Этот коммит содержится в:
родитель
c733a7916b
Коммит
7d6d0a1c00
@ -27,7 +27,7 @@
|
||||
#include "orte/dss/dss.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/gpr/gpr_types.h"
|
||||
#include "orte/mca/gpr/gpr.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ns/ns_types.h"
|
||||
|
||||
@ -35,49 +35,61 @@
|
||||
|
||||
int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
|
||||
{
|
||||
opal_list_t avail_daemons;
|
||||
opal_list_item_t *item, *item2, *next;
|
||||
orte_pls_daemon_info_t *dmn, *newdmn;
|
||||
orte_gpr_value_t **values; /* the gpr initializes this to NULL */
|
||||
orte_gpr_keyval_t *kv;
|
||||
orte_std_cntr_t cnt, i;
|
||||
char *keys[] = {
|
||||
ORTE_NODE_NAME_KEY,
|
||||
NULL
|
||||
};
|
||||
opal_list_item_t *item2, *next;
|
||||
orte_mapped_node_t *node;
|
||||
opal_list_t used_daemons;
|
||||
orte_gpr_notify_data_t *ndat;
|
||||
bool found;
|
||||
char *nodename;
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&avail_daemons, opal_list_t);
|
||||
OBJ_CONSTRUCT(&used_daemons, opal_list_t);
|
||||
|
||||
/* check for available daemons we could use */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_check_avail_daemons(&avail_daemons, map->job))) {
|
||||
|
||||
/* query the daemon info */
|
||||
if (ORTE_SUCCESS != (rc = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
||||
"orte-job-0", /* the daemon job segment */
|
||||
NULL, /* all containers */
|
||||
keys,
|
||||
&cnt, &values))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* go through the list, checking nodenames against what is in the
|
||||
/* if no daemons are around (except HNP), then don't worry about this */
|
||||
if (cnt <= 1) {
|
||||
rc = ORTE_SUCCESS;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* get here if some daemons, other than HNP, exist
|
||||
* go through the list, checking nodenames against what is in the
|
||||
* map. If nodes match, then construct and send an appropriate command
|
||||
* to that daemon to launch the local procs - remove that node structure
|
||||
* from the map so that the main launcher doesn't also try to start procs
|
||||
* on that node!
|
||||
*/
|
||||
|
||||
found = false;
|
||||
while (NULL != (item = opal_list_remove_first(&avail_daemons))) {
|
||||
dmn = (orte_pls_daemon_info_t*)item;
|
||||
item2 = opal_list_get_first(&map->nodes);
|
||||
while (item2 != opal_list_get_end(&map->nodes)) {
|
||||
node = (orte_mapped_node_t*)item2;
|
||||
|
||||
item2 = opal_list_get_first(&map->nodes);
|
||||
while (item2 != opal_list_get_end(&map->nodes)) {
|
||||
node = (orte_mapped_node_t*)item2;
|
||||
/* save the next position in case we remove this one */
|
||||
next = opal_list_get_next(item2);
|
||||
|
||||
/* check the returned values and see if the nodenames match */
|
||||
for (i=0; i < cnt; i++) {
|
||||
kv = values[i]->keyvals[0];
|
||||
if (ORTE_SUCCESS != (rc = orte_dss.get((void**)&nodename, kv->value, ORTE_STRING))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* save the next position in case we remove this one */
|
||||
next = opal_list_get_next(item2);
|
||||
|
||||
if (0 == strcmp(node->nodename, dmn->nodename)) {
|
||||
newdmn = OBJ_NEW(orte_pls_daemon_info_t);
|
||||
newdmn->cell = dmn->cell;
|
||||
newdmn->nodename = strdup(dmn->nodename);
|
||||
newdmn->active_job = map->job;
|
||||
orte_dss.copy((void**)&(newdmn->name), dmn->name, ORTE_NAME);
|
||||
opal_list_append(&used_daemons, &newdmn->super);
|
||||
if (0 == strcmp(node->nodename, nodename)) {
|
||||
/* get the launch message only once - do it the first time
|
||||
* through so all the nodes are still on the map!
|
||||
*/
|
||||
@ -86,7 +98,8 @@ int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(ndat);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
/* indicate that at least one daemon was found */
|
||||
found = true;
|
||||
}
|
||||
/* procs on this node will be taken care of, so remove it from
|
||||
@ -95,31 +108,29 @@ int orte_pls_base_launch_on_existing_daemons(orte_job_map_t *map)
|
||||
opal_list_remove_item(&map->nodes, item2);
|
||||
OBJ_RELEASE(item2);
|
||||
}
|
||||
|
||||
/* move to next position */
|
||||
item2 = next;
|
||||
}
|
||||
|
||||
/* move to next position */
|
||||
item2 = next;
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
/* if no daemons were reused, then just return */
|
||||
OBJ_DESTRUCT(&used_daemons);
|
||||
return ORTE_SUCCESS;
|
||||
rc = ORTE_SUCCESS;
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* store the bootproxy records */
|
||||
orte_pls_base_store_active_daemons(&used_daemons);
|
||||
|
||||
/* launch any procs that are using existing daemons */
|
||||
if (ORTE_SUCCESS != (rc = orte_pls_base_orted_add_local_procs(ndat))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
OBJ_RELEASE(ndat);
|
||||
|
||||
/* cleanup */
|
||||
while (NULL != (item = opal_list_remove_first(&used_daemons))) OBJ_RELEASE(item);
|
||||
OBJ_DESTRUCT(&used_daemons);
|
||||
CLEANUP:
|
||||
for (i=0; i < cnt; i++) {
|
||||
if (NULL != values[i]) OBJ_RELEASE(values[i]);
|
||||
}
|
||||
if (NULL != values) free(values);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
return rc;
|
||||
}
|
||||
|
@ -508,6 +508,31 @@ int main(int argc, char *argv[])
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* THIS IS A TEMPORARY PATCH - REPORT NODE AND PROC NAME FOR THIS DAEMON */
|
||||
if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND,
|
||||
"orte-job-0", 2, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_schema.get_proc_tokens(&(value->tokens), &(value->num_tokens), ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTE_NODE_NAME_KEY, ORTE_STRING, orte_system_info.nodename))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[1]), ORTE_PROC_NAME_KEY, ORTE_NAME, ORTE_PROC_MY_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_gpr.put(1, &value))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
OBJ_RELEASE(value);
|
||||
|
||||
|
||||
/* get the job segment name */
|
||||
if (ORTE_SUCCESS != (ret = orte_schema.get_job_segment_name(&segment, orted_globals.bootproxy))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user