1
1

- Fix some compile issues introduced in r11109

- indent / whitespace cleanup
- don't pass --debug / --debug-daemons to the orted when pls debug is given,
  as it seems to make the daemons abort.

This commit was SVN r11113.

The following SVN revision numbers were found above:
  r11109 --> open-mpi/ompi@da7df6d257
Этот коммит содержится в:
Brian Barrett 2006-08-03 18:51:42 +00:00
родитель 9f28258b3f
Коммит 16186978bb

Просмотреть файл

@ -546,10 +546,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
argc = 0;
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
/* check for debug flags */
#if 0
if (mca_pls_bproc_component.debug) {
opal_argv_append(&argc, &argv, "--debug");
opal_argv_append(&argc, &argv, "--debug-daemons");
}
#endif
opal_argv_append(&argc, &argv, "--bootproxy");
orte_ns.convert_jobid_to_string(&param, jobid);
@ -662,12 +664,16 @@ cleanup:
}
return rc;
}
static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
void *user_tag) {
orte_gpr_value_t **values;
static void
orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
void *user_tag)
{
orte_gpr_value_t **values;
bool dead_node = false;
char *dead_node_name;
int i,j;
size_t i, j;
printf("inside check node state... \n");
@ -677,92 +683,94 @@ static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
values = (orte_gpr_value_t**)(notify_data->values)->addr;
for( j = 0; j < notify_data->cnt; j++) {
dead_node = false;
for( i = 0; i < values[j]->cnt; i++) {
orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
orte_node_state_t *node_state;
int ret;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
return;
}
if( *node_state == ORTE_NODE_STATE_DOWN ||
*node_state == ORTE_NODE_STATE_REBOOT) {
dead_node = true;
printf("found a dead node state.. \n");
}
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
char* tmp_name;
int ret;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
return;
}
else {
dead_node_name = strdup(tmp_name);
printf("found a node named %s\n", dead_node_name);
}
}
}
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
if(dead_node) {
/* gotta see if this node belongs to us... arg.. */
/* also, we know by order of creation that the node state */
/* comes before the node name.. see soh_bproc.c */
size_t name_idx;
for (name_idx = 0;
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
name_idx++) {
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
if(strcmp(node_name, dead_node_name) == 0){
printf("this dead node %s belongs to us... \n", node_name);
/* one of our nodes up and died... */
/* not much to do other than die.... */
int ret, exit_status = ORTE_SUCCESS;
char *segment = NULL;
orte_gpr_value_t** seg_values = NULL;
size_t k, l, num_values = 0;
dead_node = false;
for( i = 0; i < values[j]->cnt; i++) {
orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
orte_node_state_t *node_state;
int ret;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
return;
}
if( *node_state == ORTE_NODE_STATE_DOWN ||
*node_state == ORTE_NODE_STATE_REBOOT) {
dead_node = true;
printf("found a dead node state.. \n");
}
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
char* tmp_name;
int ret;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
return;
}
else {
dead_node_name = strdup(tmp_name);
printf("found a node named %s\n", dead_node_name);
}
}
}
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
if(dead_node) {
/* gotta see if this node belongs to us... arg.. */
/* also, we know by order of creation that the node state */
/* comes before the node name.. see soh_bproc.c */
size_t name_idx;
for (name_idx = 0;
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
name_idx++) {
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
if(strcmp(node_name, dead_node_name) == 0){
/* one of our nodes up and died... */
/* not much to do other than die.... */
int ret = ORTE_SUCCESS;
char *segment = NULL;
orte_gpr_value_t** seg_values = NULL;
size_t num_values = 0;
/**********************
* Job Info segment
**********************/
segment = strdup(ORTE_JOBINFO_SEGMENT);
/**********************
* Job Info segment
**********************/
segment = strdup(ORTE_JOBINFO_SEGMENT);
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
NULL,
NULL,
&num_values,
&seg_values ) ) ) {
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment,
NULL,
NULL,
&num_values,
&seg_values ) ) ) {
}
}
/*
* kill all the jobids that are not zero
*/
for(i = 0; i < num_values; ++i) {
orte_gpr_value_t* value = values[i];
orte_jobid_t jobid;
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
printf("killing jobid %d\n", jobid);
if(jobid != 0)
orte_pls_bproc_terminate_job(jobid);
}
/*
* and kill everyone else
*/
printf("and go bye-bye...\n");
orte_pls_bproc_terminate_job(0);
/* shouldn't ever get here.. */
exit(1);
}
/*
* kill all the jobids that are not zero
*/
for(i = 0; i < num_values; ++i) {
orte_gpr_value_t* value = values[i];
orte_jobid_t jobid;
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
printf("killing jobid %d\n", jobid);
if(jobid != 0)
orte_pls_bproc_terminate_job(jobid);
}
/*
* and kill everyone else
*/
printf("and go bye-bye...\n");
orte_pls_bproc_terminate_job(0);
/* shouldn't ever get here.. */
exit(1);
}
}
}
}
}
}
}
static int orte_pls_bproc_monitor_nodes() {
static int
orte_pls_bproc_monitor_nodes(void)
{
orte_gpr_subscription_id_t id;
return orte_gpr.subscribe_1(&id,
NULL,
@ -775,9 +783,9 @@ static int orte_pls_bproc_monitor_nodes() {
strdup(ORTE_NODE_STATE_KEY),
orte_pls_bproc_check_node_state,
NULL);
}
/**
* Launches the application processes
* @param cellid the cellid of the job
@ -919,7 +927,7 @@ cleanup:
* @retval error
*/
int orte_pls_bproc_launch(orte_jobid_t jobid) {
opal_list_item_t* item;
opal_list_item_t* item, *item2;
opal_list_t mapping;
orte_cellid_t cellid;
orte_rmaps_base_map_t* map;
@ -994,6 +1002,10 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
}
}
if(0 < mca_pls_bproc_component.debug) {
opal_output(0, "pls_bproc: --- starting to launch procs ---");
}
/* create an array to hold the pointers to the node arrays for each app
* context. Also, create an array to hold the lengths of the node arrays */
node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *));