From 16186978bbefca2da1a7c2abbfb3fa565698f5ca Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Thu, 3 Aug 2006 18:51:42 +0000 Subject: [PATCH] - Fix some compile issues in r11109 - indent / whitespace cleanup - don't set --daemon-debug when pls debug is given, as it seems to make the daemons abort. This commit was SVN r11113. The following SVN revision numbers were found above: r11109 --> open-mpi/ompi@da7df6d2571e1f7a809e779a73f2036949bfa0b2 --- orte/mca/pls/bproc/pls_bproc.c | 178 ++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/orte/mca/pls/bproc/pls_bproc.c b/orte/mca/pls/bproc/pls_bproc.c index 93a54c4a1f..3b1d0e64ae 100644 --- a/orte/mca/pls/bproc/pls_bproc.c +++ b/orte/mca/pls/bproc/pls_bproc.c @@ -546,10 +546,12 @@ static int orte_pls_bproc_launch_daemons(orte_cellid_t cellid, char *** envp, argc = 0; opal_argv_append(&argc, &argv, mca_pls_bproc_component.orted); /* check for debug flags */ +#if 0 if (mca_pls_bproc_component.debug) { opal_argv_append(&argc, &argv, "--debug"); opal_argv_append(&argc, &argv, "--debug-daemons"); } +#endif opal_argv_append(&argc, &argv, "--bootproxy"); orte_ns.convert_jobid_to_string(¶m, jobid); @@ -662,12 +664,16 @@ cleanup: } return rc; } -static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data, - void *user_tag) { - orte_gpr_value_t **values; + + +static void +orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data, + void *user_tag) +{ + orte_gpr_value_t **values; bool dead_node = false; char *dead_node_name; - int i,j; + size_t i, j; printf("inside check node state... \n"); @@ -677,92 +683,94 @@ static void orte_pls_bproc_check_node_state(orte_gpr_notify_data_t *notify_data, values = (orte_gpr_value_t**)(notify_data->values)->addr; for( j = 0; j < notify_data->cnt; j++) { - dead_node = false; - for( i = 0; i < values[j]->cnt; i++) { - orte_gpr_keyval_t* keyval = values[j]->keyvals[i]; - if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) { - orte_node_state_t *node_state; - int ret; - if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) { - return; - } - if( *node_state == ORTE_NODE_STATE_DOWN || - *node_state == ORTE_NODE_STATE_REBOOT) { - dead_node = true; - printf("found a dead node state.. \n"); - } - } else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) { - char* tmp_name; - int ret; - if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) { - return; - } - else { - dead_node_name = strdup(tmp_name); - printf("found a node named %s\n", dead_node_name); - } - } - } - printf("found a node named %s is dead? %d\n", dead_node_name, dead_node); - if(dead_node) { - /* gotta see if this node belongs to us... arg.. */ - /* also, we know by order of creation that the node state */ - /* comes before the node name.. see soh_bproc.c */ - size_t name_idx; - for (name_idx = 0; - name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names); - name_idx++) { - char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx); - if(strcmp(node_name, dead_node_name) == 0){ - printf("this dead node %s belongs to us... \n", node_name); - /* one of our nodes up and died... */ - /* not much to do other than die.... */ - int ret, exit_status = ORTE_SUCCESS; - char *segment = NULL; - orte_gpr_value_t** seg_values = NULL; - size_t k, l, num_values = 0; + dead_node = false; + for( i = 0; i < values[j]->cnt; i++) { + orte_gpr_keyval_t* keyval = values[j]->keyvals[i]; + if(strcmp(keyval->key, ORTE_NODE_STATE_KEY) == 0) { + orte_node_state_t *node_state; + int ret; + if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &node_state, keyval->value, ORTE_NODE_STATE))) { + return; + } + if( *node_state == ORTE_NODE_STATE_DOWN || + *node_state == ORTE_NODE_STATE_REBOOT) { + dead_node = true; + printf("found a dead node state.. \n"); + } + } else if(strcmp(keyval->key, ORTE_NODE_NAME_KEY) == 0) { + char* tmp_name; + int ret; + if( ORTE_SUCCESS != (ret = orte_dss.get( (void **) &tmp_name, keyval->value, ORTE_STRING))) { + return; + } + else { + dead_node_name = strdup(tmp_name); + printf("found a node named %s\n", dead_node_name); + } + } + } + printf("found a node named %s is dead? %d\n", dead_node_name, dead_node); + if(dead_node) { + /* gotta see if this node belongs to us... arg.. */ + /* also, we know by order of creation that the node state */ + /* comes before the node name.. see soh_bproc.c */ + size_t name_idx; + for (name_idx = 0; + name_idx < orte_pointer_array_get_size(mca_pls_bproc_component.active_node_names); + name_idx++) { + char* node_name = (char*) orte_pointer_array_get_item(mca_pls_bproc_component.active_node_names, name_idx); + if(strcmp(node_name, dead_node_name) == 0){ + /* one of our nodes up and died... */ + /* not much to do other than die.... */ + int ret = ORTE_SUCCESS; + char *segment = NULL; + orte_gpr_value_t** seg_values = NULL; + size_t num_values = 0; - /********************** - * Job Info segment - **********************/ - segment = strdup(ORTE_JOBINFO_SEGMENT); + /********************** + * Job Info segment + **********************/ + segment = strdup(ORTE_JOBINFO_SEGMENT); - if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR, - segment, - NULL, - NULL, - &num_values, - &seg_values ) ) ) { + if( ORTE_SUCCESS != (ret = orte_gpr.get(ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_OR, + segment, + NULL, + NULL, + &num_values, + &seg_values ) ) ) { - } + } - /* - * kill all the jobids that are not zero - */ - for(i = 0; i < num_values; ++i) { - orte_gpr_value_t* value = values[i]; - orte_jobid_t jobid; - orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]); - printf("killing jobid %d\n", jobid); - if(jobid != 0) - orte_pls_bproc_terminate_job(jobid); - } - /* - * and kill everyone else - */ - printf("and go bye-bye...\n"); - orte_pls_bproc_terminate_job(0); - /* shouldn't ever get here.. */ - exit(1); - } + /* + * kill all the jobids that are not zero + */ + for(i = 0; i < num_values; ++i) { + orte_gpr_value_t* value = values[i]; + orte_jobid_t jobid; + orte_schema.extract_jobid_from_segment_name(&jobid, value->tokens[0]); + printf("killing jobid %d\n", jobid); + if(jobid != 0) + orte_pls_bproc_terminate_job(jobid); + } + /* + * and kill everyone else + */ + printf("and go bye-bye...\n"); + orte_pls_bproc_terminate_job(0); + /* shouldn't ever get here.. */ + exit(1); + } - } - } + } + } } } -static int orte_pls_bproc_monitor_nodes() { + +static int +orte_pls_bproc_monitor_nodes(void) +{ orte_gpr_subscription_id_t id; return orte_gpr.subscribe_1(&id, NULL, @@ -775,9 +783,9 @@ static int orte_pls_bproc_monitor_nodes() { strdup(ORTE_NODE_STATE_KEY), orte_pls_bproc_check_node_state, NULL); - } + /** * Launches the application processes * @param cellid the cellid of the job @@ -919,7 +927,7 @@ cleanup: * @retval error */ int orte_pls_bproc_launch(orte_jobid_t jobid) { - opal_list_item_t* item; + opal_list_item_t* item, *item2; opal_list_t mapping; orte_cellid_t cellid; orte_rmaps_base_map_t* map; @@ -994,6 +1002,10 @@ int orte_pls_bproc_launch(orte_jobid_t jobid) { } } + if(0 < mca_pls_bproc_component.debug) { + opal_output(0, "pls_bproc: --- starting to launch procs ---"); + } + /* create an array to hold the pointers to the node arrays for each app * context. Also, create an array to hold the lengths of the node arrays */ node_array = malloc(opal_list_get_size(&mapping) * sizeof(int *));