1
1

- Fix some compile issues in r11109

- indent / whitespace cleanup
- don't pass --debug / --debug-daemons to the daemons when pls debug is
  given, as it seems to make the daemons abort.

This commit was SVN r11113.

The following SVN revision numbers were found above:
  r11109 --> open-mpi/ompi@da7df6d257
Этот коммит содержится в:
Brian Barrett 2006-08-03 18:51:42 +00:00
родитель 9f28258b3f
Коммит 16186978bb

Просмотреть файл

@ -546,10 +546,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp,
argc = 0; argc = 0;
opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted); opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted);
/* check for debug flags */ /* check for debug flags */
#if 0
if (mca_pls_bproc_component.debug) { if (mca_pls_bproc_component.debug) {
opal_argv_append(&argc, &argv, "--debug"); opal_argv_append(&argc, &argv, "--debug");
opal_argv_append(&argc, &argv, "--debug-daemons"); opal_argv_append(&argc, &argv, "--debug-daemons");
} }
#endif
opal_argv_append(&argc, &argv, "--bootproxy"); opal_argv_append(&argc, &argv, "--bootproxy");
orte_ns.convert_jobid_to_string(&param, jobid); orte_ns.convert_jobid_to_string(&param, jobid);
@ -662,12 +664,16 @@ cleanup:
} }
return rc; return rc;
} }
static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
void *user_tag) {
orte_gpr_value_t **values; static void
orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
void *user_tag)
{
orte_gpr_value_t **values;
bool dead_node = false; bool dead_node = false;
char *dead_node_name; char *dead_node_name;
int i,j; size_t i, j;
printf("inside check node state... \n"); printf("inside check node state... \n");
@ -677,92 +683,94 @@ static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data,
values = (orte_gpr_value_t**)(notify_data->values)->addr; values = (orte_gpr_value_t**)(notify_data->values)->addr;
for( j = 0; j < notify_data->cnt; j++) { for( j = 0; j < notify_data->cnt; j++) {
dead_node = false; dead_node = false;
for( i = 0; i < values[j]->cnt; i++) { for( i = 0; i < values[j]->cnt; i++) {
orte_gpr_keyval_t* keyval = values[j]->keyvals[i]; orte_gpr_keyval_t* keyval = values[j]->keyvals[i];
if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) { if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) {
orte_node_state_t *node_state; orte_node_state_t *node_state;
int ret; int ret;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) { if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) {
return; return;
} }
if( *node_state == ORTE_NODE_STATE_DOWN || if( *node_state == ORTE_NODE_STATE_DOWN ||
*node_state == ORTE_NODE_STATE_REBOOT) { *node_state == ORTE_NODE_STATE_REBOOT) {
dead_node = true; dead_node = true;
printf("found a dead node state.. \n"); printf("found a dead node state.. \n");
} }
} else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) { } else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) {
char* tmp_name; char* tmp_name;
int ret; int ret;
if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) { if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) {
return; return;
} }
else { else {
dead_node_name = strdup(tmp_name); dead_node_name = strdup(tmp_name);
printf("found a node named %s\n", dead_node_name); printf("found a node named %s\n", dead_node_name);
} }
} }
} }
printf("found a node named %s is dead? %d\n", dead_node_name, dead_node); printf("found a node named %s is dead? %d\n", dead_node_name, dead_node);
if(dead_node) { if(dead_node) {
/* gotta see if this node belongs to us... arg.. */ /* gotta see if this node belongs to us... arg.. */
/* also, we know by order of creation that the node state */ /* also, we know by order of creation that the node state */
/* comes before the node name.. see soh_bproc.c */ /* comes before the node name.. see soh_bproc.c */
size_t name_idx; size_t name_idx;
for (name_idx = 0; for (name_idx = 0;
name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names); name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names);
name_idx++) { name_idx++) {
char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx); char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx);
if(strcmp(node_name, dead_node_name) == 0){ if(strcmp(node_name, dead_node_name) == 0){
printf("this dead node %s belongs to us... \n", node_name); /* one of our nodes up and died... */
/* one of our nodes up and died... */ /* not much to do other than die.... */
/* not much to do other than die.... */ int ret = ORTE_SUCCESS;
int ret, exit_status = ORTE_SUCCESS; char *segment = NULL;
char *segment = NULL; orte_gpr_value_t** seg_values = NULL;
orte_gpr_value_t** seg_values = NULL; size_t num_values = 0;
size_t k, l, num_values = 0;
/********************** /**********************
* Job Info segment * Job Info segment
**********************/ **********************/
segment = strdup(ORTE_JOBINFO_SEGMENT); segment = strdup(ORTE_JOBINFO_SEGMENT);
if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR, if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR,
segment, segment,
NULL, NULL,
NULL, NULL,
&num_values, &num_values,
&seg_values ) ) ) { &seg_values ) ) ) {
} }
/* /*
* kill all the jobids that are not zero * kill all the jobids that are not zero
*/ */
for(i = 0; i < num_values; ++i) { for(i = 0; i < num_values; ++i) {
orte_gpr_value_t* value = values[i]; orte_gpr_value_t* value = values[i];
orte_jobid_t jobid; orte_jobid_t jobid;
orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]); orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]);
printf("killing jobid %d\n", jobid); printf("killing jobid %d\n", jobid);
if(jobid != 0) if(jobid != 0)
orte_pls_bproc_terminate_job(jobid); orte_pls_bproc_terminate_job(jobid);
} }
/* /*
* and kill everyone else * and kill everyone else
*/ */
printf("and go bye-bye...\n"); printf("and go bye-bye...\n");
orte_pls_bproc_terminate_job(0); orte_pls_bproc_terminate_job(0);
/* shouldn't ever get here.. */ /* shouldn't ever get here.. */
exit(1); exit(1);
} }
} }
} }
} }
} }
static int orte_pls_bproc_monitor_nodes() {
static int
orte_pls_bproc_monitor_nodes(void)
{
orte_gpr_subscription_id_t id; orte_gpr_subscription_id_t id;
return orte_gpr.subscribe_1(&id, return orte_gpr.subscribe_1(&id,
NULL, NULL,
@ -775,9 +783,9 @@ static int orte_pls_bproc_monitor_nodes() {
strdup(ORTE_NODE_STATE_KEY), strdup(ORTE_NODE_STATE_KEY),
orte_pls_bproc_check_node_state, orte_pls_bproc_check_node_state,
NULL); NULL);
} }
/** /**
* Launches the application processes * Launches the application processes
* @param cellid the cellid of the job * @param cellid the cellid of the job
@ -919,7 +927,7 @@ cleanup:
* @retval error * @retval error
*/ */
int orte_pls_bproc_launch(orte_jobid_t jobid) { int orte_pls_bproc_launch(orte_jobid_t jobid) {
opal_list_item_t* item; opal_list_item_t* item, *item2;
opal_list_t mapping; opal_list_t mapping;
orte_cellid_t cellid; orte_cellid_t cellid;
orte_rmaps_base_map_t* map; orte_rmaps_base_map_t* map;
@ -994,6 +1002,10 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) {
} }
} }
if(0 < mca_pls_bproc_component.debug) {
opal_output(0, "pls_bproc: --- starting to launch procs ---");
}
/* create an array to hold the pointers to the node arrays for each app /* create an array to hold the pointers to the node arrays for each app
* context. Also, create an array to hold the lengths of the node arrays */ * context. Also, create an array to hold the lengths of the node arrays */
node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *)); node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *));