- Fix some compile issues in r11109
- indent / whitespace cleanup
- don't set --debug-daemons when pls debug is given, as it seems to make the daemons abort.

This commit was SVN r11113.

The following SVN revision numbers were found above:
  r11109 --> open-mpi/ompi@da7df6d257
Этот коммит содержится в:
родитель
9f28258b3f
Коммит
16186978bb
@ -546,10 +546,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
|
|||||||
argc = 0;
|
argc = 0;
|
||||||
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
|
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
|
||||||
/* check for debug flags */
|
/* check for debug flags */
|
||||||
|
#if 0
|
||||||
if (mca_pls_bproc_component.debug) {
|
if (mca_pls_bproc_component.debug) {
|
||||||
opal_argv_append(&argc, &argv, "--debug");
|
opal_argv_append(&argc, &argv, "--debug");
|
||||||
opal_argv_append(&argc, &argv, "--debug-daemons");
|
opal_argv_append(&argc, &argv, "--debug-daemons");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
opal_argv_append(&argc, &argv, "--bootproxy");
|
opal_argv_append(&argc, &argv, "--bootproxy");
|
||||||
orte_ns.convert_jobid_to_string(&param, jobid);
|
orte_ns.convert_jobid_to_string(&param, jobid);
|
||||||
@ -662,12 +664,16 @@ cleanup:
|
|||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
|
||||||
void *user_tag) {
|
|
||||||
orte_gpr_value_t **values;
|
static void
|
||||||
|
orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
||||||
|
void *user_tag)
|
||||||
|
{
|
||||||
|
orte_gpr_value_t **values;
|
||||||
bool dead_node = false;
|
bool dead_node = false;
|
||||||
char *dead_node_name;
|
char *dead_node_name;
|
||||||
int i,j;
|
size_t i, j;
|
||||||
|
|
||||||
printf("inside check node state... \n");
|
printf("inside check node state... \n");
|
||||||
|
|
||||||
@ -677,92 +683,94 @@ static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
|
|||||||
|
|
||||||
values = (orte_gpr_value_t**)(notify_data->values)->addr;
|
values = (orte_gpr_value_t**)(notify_data->values)->addr;
|
||||||
for( j = 0; j < notify_data->cnt; j++) {
|
for( j = 0; j < notify_data->cnt; j++) {
|
||||||
dead_node = false;
|
dead_node = false;
|
||||||
for( i = 0; i < values[j]->cnt; i++) {
|
for( i = 0; i < values[j]->cnt; i++) {
|
||||||
orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
|
orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
|
||||||
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
|
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
|
||||||
orte_node_state_t *node_state;
|
orte_node_state_t *node_state;
|
||||||
int ret;
|
int ret;
|
||||||
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if( *node_state == ORTE_NODE_STATE_DOWN ||
|
if( *node_state == ORTE_NODE_STATE_DOWN ||
|
||||||
*node_state == ORTE_NODE_STATE_REBOOT) {
|
*node_state == ORTE_NODE_STATE_REBOOT) {
|
||||||
dead_node = true;
|
dead_node = true;
|
||||||
printf("found a dead node state.. \n");
|
printf("found a dead node state.. \n");
|
||||||
}
|
}
|
||||||
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
|
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
|
||||||
char* tmp_name;
|
char* tmp_name;
|
||||||
int ret;
|
int ret;
|
||||||
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
|
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
dead_node_name = strdup(tmp_name);
|
dead_node_name = strdup(tmp_name);
|
||||||
printf("found a node named %s\n", dead_node_name);
|
printf("found a node named %s\n", dead_node_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
|
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
|
||||||
if(dead_node) {
|
if(dead_node) {
|
||||||
/* gotta see if this node belongs to us... arg.. */
|
/* gotta see if this node belongs to us... arg.. */
|
||||||
/* also, we know by order of creation that the node state */
|
/* also, we know by order of creation that the node state */
|
||||||
/* comes before the node name.. see soh_bproc.c */
|
/* comes before the node name.. see soh_bproc.c */
|
||||||
size_t name_idx;
|
size_t name_idx;
|
||||||
for (name_idx = 0;
|
for (name_idx = 0;
|
||||||
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
|
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
|
||||||
name_idx++) {
|
name_idx++) {
|
||||||
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
|
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
|
||||||
if(strcmp(node_name, dead_node_name) == 0){
|
if(strcmp(node_name, dead_node_name) == 0){
|
||||||
printf("this dead node %s belongs to us... \n", node_name);
|
/* one of our nodes up and died... */
|
||||||
/* one of our nodes up and died... */
|
/* not much to do other than die.... */
|
||||||
/* not much to do other than die.... */
|
int ret = ORTE_SUCCESS;
|
||||||
int ret, exit_status = ORTE_SUCCESS;
|
char *segment = NULL;
|
||||||
char *segment = NULL;
|
orte_gpr_value_t** seg_values = NULL;
|
||||||
orte_gpr_value_t** seg_values = NULL;
|
size_t num_values = 0;
|
||||||
size_t k, l, num_values = 0;
|
|
||||||
|
|
||||||
/**********************
|
/**********************
|
||||||
* Job Info segment
|
* Job Info segment
|
||||||
**********************/
|
**********************/
|
||||||
segment = strdup(ORTE_JOBINFO_SEGMENT);
|
segment = strdup(ORTE_JOBINFO_SEGMENT);
|
||||||
|
|
||||||
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
|
||||||
segment,
|
segment,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
&num_values,
|
&num_values,
|
||||||
&seg_values ) ) ) {
|
&seg_values ) ) ) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* kill all the jobids that are not zero
|
* kill all the jobids that are not zero
|
||||||
*/
|
*/
|
||||||
for(i = 0; i < num_values; ++i) {
|
for(i = 0; i < num_values; ++i) {
|
||||||
orte_gpr_value_t* value = values[i];
|
orte_gpr_value_t* value = values[i];
|
||||||
orte_jobid_t jobid;
|
orte_jobid_t jobid;
|
||||||
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
|
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
|
||||||
printf("killing jobid %d\n", jobid);
|
printf("killing jobid %d\n", jobid);
|
||||||
if(jobid != 0)
|
if(jobid != 0)
|
||||||
orte_pls_bproc_terminate_job(jobid);
|
orte_pls_bproc_terminate_job(jobid);
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
* and kill everyone else
|
* and kill everyone else
|
||||||
*/
|
*/
|
||||||
printf("and go bye-bye...\n");
|
printf("and go bye-bye...\n");
|
||||||
orte_pls_bproc_terminate_job(0);
|
orte_pls_bproc_terminate_job(0);
|
||||||
/* shouldn't ever get here.. */
|
/* shouldn't ever get here.. */
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int orte_pls_bproc_monitor_nodes() {
|
|
||||||
|
static int
|
||||||
|
orte_pls_bproc_monitor_nodes(void)
|
||||||
|
{
|
||||||
orte_gpr_subscription_id_t id;
|
orte_gpr_subscription_id_t id;
|
||||||
return orte_gpr.subscribe_1(&id,
|
return orte_gpr.subscribe_1(&id,
|
||||||
NULL,
|
NULL,
|
||||||
@ -775,9 +783,9 @@ static int orte_pls_bproc_monitor_nodes() {
|
|||||||
strdup(ORTE_NODE_STATE_KEY),
|
strdup(ORTE_NODE_STATE_KEY),
|
||||||
orte_pls_bproc_check_node_state,
|
orte_pls_bproc_check_node_state,
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Launches the application processes
|
* Launches the application processes
|
||||||
* @param cellid the cellid of the job
|
* @param cellid the cellid of the job
|
||||||
@ -919,7 +927,7 @@ cleanup:
|
|||||||
* @retval error
|
* @retval error
|
||||||
*/
|
*/
|
||||||
int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
||||||
opal_list_item_t* item;
|
opal_list_item_t* item, *item2;
|
||||||
opal_list_t mapping;
|
opal_list_t mapping;
|
||||||
orte_cellid_t cellid;
|
orte_cellid_t cellid;
|
||||||
orte_rmaps_base_map_t* map;
|
orte_rmaps_base_map_t* map;
|
||||||
@ -994,6 +1002,10 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(0 < mca_pls_bproc_component.debug) {
|
||||||
|
opal_output(0, "pls_bproc: --- starting to launch procs ---");
|
||||||
|
}
|
||||||
|
|
||||||
/* create an array to hold the pointers to the node arrays for each app
|
/* create an array to hold the pointers to the node arrays for each app
|
||||||
* context. Also, create an array to hold the lengths of the node arrays */
|
* context. Also, create an array to hold the lengths of the node arrays */
|
||||||
node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *));
|
node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *));
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user