- Fix some compile issues in r11109
- indent / whitespace cleanup - don't set --daemon-debug when pls debug is given, as it seems to make the daemons abort. This commit was SVN r11113. The following SVN revision numbers were found above: r11109 --> open-mpi/ompi@da7df6d257
Этот коммит содержится в:
родитель
9f28258b3f
Коммит
16186978bb
@ -546,10 +546,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
|
||||
argc = 0;
|
||||
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
|
||||
/* check for debug flags */
|
||||
#if 0
|
||||
if (mca_pls_bproc_component.debug) {
|
||||
opal_argv_append(&argc, &argv, "--debug");
|
||||
opal_argv_append(&argc, &argv, "--debug-daemons");
|
||||
}
|
||||
#endif
|
||||
|
||||
opal_argv_append(&argc, &argv, "--bootproxy");
|
||||
orte_ns.convert_jobid_to_string(¶m, jobid);
|
||||
@ -662,12 +664,16 @@ cleanup:
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
||||
void *user_tag) {
|
||||
orte_gpr_value_t **values;
|
||||
|
||||
|
||||
static void
|
||||
orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
||||
void *user_tag)
|
||||
{
|
||||
orte_gpr_value_t **values;
|
||||
bool dead_node = false;
|
||||
char *dead_node_name;
|
||||
int i,j;
|
||||
size_t i, j;
|
||||
|
||||
printf("inside check node state... \n");
|
||||
|
||||
@ -677,92 +683,94 @@ static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
||||
|
||||
values = (orte_gpr_value_t**)(notify_data->values)->addr;
|
||||
for( j = 0; j < notify_data->cnt; j++) {
|
||||
dead_node = false;
|
||||
for( i = 0; i < values[j]->cnt; i++) {
|
||||
orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
|
||||
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
|
||||
orte_node_state_t *node_state;
|
||||
int ret;
|
||||
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
|
||||
return;
|
||||
}
|
||||
if( *node_state == ORTE_NODE_STATE_DOWN ||
|
||||
*node_state == ORTE_NODE_STATE_REBOOT) {
|
||||
dead_node = true;
|
||||
printf("found a dead node state.. \n");
|
||||
}
|
||||
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
|
||||
char* tmp_name;
|
||||
int ret;
|
||||
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
|
||||
return;
|
||||
}
|
||||
else {
|
||||
dead_node_name = strdup(tmp_name);
|
||||
printf("found a node named %s\n", dead_node_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
|
||||
if(dead_node) {
|
||||
/* gotta see if this node belongs to us... arg.. */
|
||||
/* also, we know by order of creation that the node state */
|
||||
/* comes before the node name.. see soh_bproc.c */
|
||||
size_t name_idx;
|
||||
for (name_idx = 0;
|
||||
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
|
||||
name_idx++) {
|
||||
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
|
||||
if(strcmp(node_name, dead_node_name) == 0){
|
||||
printf("this dead node %s belongs to us... \n", node_name);
|
||||
/* one of our nodes up and died... */
|
||||
/* not much to do other than die.... */
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
char *segment = NULL;
|
||||
orte_gpr_value_t** seg_values = NULL;
|
||||
size_t k, l, num_values = 0;
|
||||
dead_node = false;
|
||||
for( i = 0; i < values[j]->cnt; i++) {
|
||||
orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
|
||||
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
|
||||
orte_node_state_t *node_state;
|
||||
int ret;
|
||||
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
|
||||
return;
|
||||
}
|
||||
if( *node_state == ORTE_NODE_STATE_DOWN ||
|
||||
*node_state == ORTE_NODE_STATE_REBOOT) {
|
||||
dead_node = true;
|
||||
printf("found a dead node state.. \n");
|
||||
}
|
||||
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
|
||||
char* tmp_name;
|
||||
int ret;
|
||||
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
|
||||
return;
|
||||
}
|
||||
else {
|
||||
dead_node_name = strdup(tmp_name);
|
||||
printf("found a node named %s\n", dead_node_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
|
||||
if(dead_node) {
|
||||
/* gotta see if this node belongs to us... arg.. */
|
||||
/* also, we know by order of creation that the node state */
|
||||
/* comes before the node name.. see soh_bproc.c */
|
||||
size_t name_idx;
|
||||
for (name_idx = 0;
|
||||
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
|
||||
name_idx++) {
|
||||
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
|
||||
if(strcmp(node_name, dead_node_name) == 0){
|
||||
/* one of our nodes up and died... */
|
||||
/* not much to do other than die.... */
|
||||
int ret = ORTE_SUCCESS;
|
||||
char *segment = NULL;
|
||||
orte_gpr_value_t** seg_values = NULL;
|
||||
size_t num_values = 0;
|
||||
|
||||
/**********************
|
||||
* Job Info segment
|
||||
**********************/
|
||||
segment = strdup(ORTE_JOBINFO_SEGMENT);
|
||||
/**********************
|
||||
* Job Info segment
|
||||
**********************/
|
||||
segment = strdup(ORTE_JOBINFO_SEGMENT);
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
||||
segment,
|
||||
NULL,
|
||||
NULL,
|
||||
&num_values,
|
||||
&seg_values ) ) ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
||||
segment,
|
||||
NULL,
|
||||
NULL,
|
||||
&num_values,
|
||||
&seg_values ) ) ) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* kill all the jobids that are not zero
|
||||
*/
|
||||
for(i = 0; i < num_values; ++i) {
|
||||
orte_gpr_value_t* value = values[i];
|
||||
orte_jobid_t jobid;
|
||||
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
|
||||
printf("killing jobid %d\n", jobid);
|
||||
if(jobid != 0)
|
||||
orte_pls_bproc_terminate_job(jobid);
|
||||
}
|
||||
/*
|
||||
* and kill everyone else
|
||||
*/
|
||||
printf("and go bye-bye...\n");
|
||||
orte_pls_bproc_terminate_job(0);
|
||||
/* shouldn't ever get here.. */
|
||||
exit(1);
|
||||
}
|
||||
/*
|
||||
* kill all the jobids that are not zero
|
||||
*/
|
||||
for(i = 0; i < num_values; ++i) {
|
||||
orte_gpr_value_t* value = values[i];
|
||||
orte_jobid_t jobid;
|
||||
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
|
||||
printf("killing jobid %d\n", jobid);
|
||||
if(jobid != 0)
|
||||
orte_pls_bproc_terminate_job(jobid);
|
||||
}
|
||||
/*
|
||||
* and kill everyone else
|
||||
*/
|
||||
printf("and go bye-bye...\n");
|
||||
orte_pls_bproc_terminate_job(0);
|
||||
/* shouldn't ever get here.. */
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int orte_pls_bproc_monitor_nodes() {
|
||||
|
||||
static int
|
||||
orte_pls_bproc_monitor_nodes(void)
|
||||
{
|
||||
orte_gpr_subscription_id_t id;
|
||||
return orte_gpr.subscribe_1(&id,
|
||||
NULL,
|
||||
@ -775,9 +783,9 @@ static int orte_pls_bproc_monitor_nodes() {
|
||||
strdup(ORTE_NODE_STATE_KEY),
|
||||
orte_pls_bproc_check_node_state,
|
||||
NULL);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Launches the application processes
|
||||
* @param cellid the cellid of the job
|
||||
@ -919,7 +927,7 @@ cleanup:
|
||||
* @retval error
|
||||
*/
|
||||
int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
opal_list_item_t* item;
|
||||
opal_list_item_t* item, *item2;
|
||||
opal_list_t mapping;
|
||||
orte_cellid_t cellid;
|
||||
orte_rmaps_base_map_t* map;
|
||||
@ -994,6 +1002,10 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||
}
|
||||
}
|
||||
|
||||
if(0 < mca_pls_bproc_component.debug) {
|
||||
opal_output(0, "pls_bproc: --- starting to launch procs ---");
|
||||
}
|
||||
|
||||
/* create an array to hold the pointers to the node arrays for each app
|
||||
* context. Also, create an array to hold the lengths of the node arrays */
|
||||
node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *));
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user