diff --git a/orte/mca/ess/cm/ess_cm.h b/orte/mca/ess/cm/ess_cm.h index 176b7ea546..7a7e70f34f 100644 --- a/orte/mca/ess/cm/ess_cm.h +++ b/orte/mca/ess/cm/ess_cm.h @@ -20,8 +20,12 @@ int orte_ess_cm_component_open(void); int orte_ess_cm_component_close(void); int orte_ess_cm_component_query(mca_base_module_t **module, int *priority); +typedef struct { + orte_ess_base_component_t super; + int max_slots; +} orte_ess_cm_component_t; -ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_cm_component; +ORTE_MODULE_DECLSPEC extern orte_ess_cm_component_t mca_ess_cm_component; END_C_DECLS diff --git a/orte/mca/ess/cm/ess_cm_component.c b/orte/mca/ess/cm/ess_cm_component.c index 37c507aed8..8d4729afe7 100644 --- a/orte/mca/ess/cm/ess_cm_component.c +++ b/orte/mca/ess/cm/ess_cm_component.c @@ -20,6 +20,7 @@ #include "opal/mca/base/mca_base_param.h" #include "orte/util/proc_info.h" +#include "orte/runtime/orte_globals.h" #include "orte/mca/ess/ess.h" #include "orte/mca/ess/cm/ess_cm.h" @@ -30,24 +31,26 @@ extern orte_ess_base_module_t orte_ess_cm_module; * Instantiate the public struct with all of our public information * and pointers to our public functions in it */ -orte_ess_base_component_t mca_ess_cm_component = { +orte_ess_cm_component_t mca_ess_cm_component = { { - ORTE_ESS_BASE_VERSION_2_0_0, - - /* Component name and version */ - "cm", - ORTE_MAJOR_VERSION, - ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION, - - /* Component open and close functions */ - orte_ess_cm_component_open, - orte_ess_cm_component_close, - orte_ess_cm_component_query - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT + { + ORTE_ESS_BASE_VERSION_2_0_0, + + /* Component name and version */ + "cm", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + orte_ess_cm_component_open, + orte_ess_cm_component_close, + orte_ess_cm_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } } }; @@ -55,23 +58,22 @@ orte_ess_base_component_t mca_ess_cm_component = { int orte_ess_cm_component_open(void) { + mca_base_component_t *c = &mca_ess_cm_component.super.base_version; + + mca_base_param_reg_int(c, "max_slots", + "Max #slots/rack (must be > 0)", + false, false, 38, &mca_ess_cm_component.max_slots); + return ORTE_SUCCESS; } int orte_ess_cm_component_query(mca_base_module_t **module, int *priority) { - /* if we are a cm, we want to be selected */ - if (ORTE_PROC_IS_CM || ORTE_PROC_IS_CM_APP) { - *priority = 100; - *module = (mca_base_module_t *)&orte_ess_cm_module; - return ORTE_SUCCESS; - } - - /* else, don't */ - *priority = -1; - *module = NULL; - return ORTE_ERROR; + /* only select us if specified */ + *priority = 0; + *module = (mca_base_module_t *)&orte_ess_cm_module; + return ORTE_SUCCESS; } diff --git a/orte/mca/ess/cm/ess_cm_module.c b/orte/mca/ess/cm/ess_cm_module.c index 26487b81dd..755b050e57 100644 --- a/orte/mca/ess/cm/ess_cm_module.c +++ b/orte/mca/ess/cm/ess_cm_module.c @@ -24,22 +24,16 @@ #include #endif +#include "opal/util/argv.h" +#include "opal/util/if.h" +#include "opal/mca/paffinity/paffinity.h" +#include "orte/mca/rmcast/base/base.h" +#include "orte/mca/errmgr/errmgr.h" #include "orte/util/show_help.h" -#include "orte/mca/iof/base/base.h" -#include "orte/mca/plm/base/base.h" -#include "orte/mca/plm/plm.h" -#include "orte/mca/ras/base/base.h" -#include "orte/mca/rml/base/base.h" -#include "orte/mca/rml/rml_types.h" -#include "orte/mca/routed/base/base.h" -#include "orte/mca/routed/routed.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/grpcomm/base/base.h" -#include "orte/mca/odls/base/base.h" -#include "orte/mca/notifier/base/base.h" -#include "orte/mca/rmaps/base/base.h" #include "orte/util/proc_info.h" +#include "orte/util/name_fns.h" +#include "orte/util/nidmap.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_globals.h" @@ -50,7 +44,9 @@ static int rte_init(void); static int rte_finalize(void); static void rte_abort(int status, bool report) __opal_attribute_noreturn__; +static uint8_t proc_get_locality(orte_process_name_t *proc); static orte_vpid_t proc_get_daemon(orte_process_name_t *proc); +static char* proc_get_hostname(orte_process_name_t *proc); static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc); static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc); static int update_pidmap(opal_byte_object_t *bo); @@ -61,9 +57,9 @@ orte_ess_base_module_t orte_ess_cm_module = { rte_init, rte_finalize, rte_abort, - NULL, /* don't need a local procs fn */ + proc_get_locality, proc_get_daemon, - NULL, /* don't need a proc_get_hostname fn */ + proc_get_hostname, proc_get_local_rank, proc_get_node_rank, update_pidmap, @@ -75,102 +71,15 @@ static int cm_set_name(void); static int rte_init(void) { - orte_job_t *jdata; - orte_node_t *node; - orte_proc_t *proc; int ret; char *error = NULL; - - if (ORTE_PROC_IS_CM_APP) { - /* get our name out of the environment */ - if (ORTE_SUCCESS != (ret = cm_set_name())) { - error = "orte_ess_cm_set_name"; - goto error; - } - - /* run the prolog */ - if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { - error = "orte_ess_base_std_prolog"; - goto error; - } - - /* Setup the communication infrastructure */ - /* - * Runtime Messaging Layer - */ - if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml_base_select"; - goto error; - } - /* - * Routed system - */ - if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_select"; - goto error; - } - /* - * Group communications - */ - if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_grpcomm_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_grpcomm_base_select"; - goto error; - } - - /* enable communication with the rml */ - if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml.enable_comm"; - goto error; - } - - /* setup the routed info - the selected routed component - * will know what to do. - */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } - - /* setup the notifier system */ - if (ORTE_SUCCESS != (ret = orte_notifier_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_notifer_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_notifer_select"; - goto error; - } - - return ORTE_SUCCESS; - } - - - /* if we are not an APP, then we are the CM itself, so setup - * for that role + char **hosts = NULL; + char *nodelist; + + /* only daemons that are bootstrapping should + * be calling this module */ - + /* initialize the global list of local children and job data */ OBJ_CONSTRUCT(&orte_local_children, opal_list_t); OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); @@ -181,213 +90,42 @@ static int rte_init(void) goto error; } - /* Open the PLM so we can start processes */ - if (ORTE_SUCCESS != (ret = orte_plm_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_plm_base_open"; - goto error; - } - - if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_plm_base_select"; - goto error; - } - - /* give ourselves a unique name */ - if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) { - ORTE_ERROR_LOG(ret); - error = "orte_plm_set_hnp_name"; - goto error; - } - - /* Setup the communication infrastructure */ - /* - * Runtime Messaging Layer + /* open the reliable multicast framework, just in + * case we need it to query the HNP for a name */ - if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_open())) { ORTE_ERROR_LOG(ret); - error = "orte_rml_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml_base_select"; - goto error; - } - /* - * Routed system - */ - if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_routed_base_select"; - goto error; - } - /* - * Group communications - */ - if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_grpcomm_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_grpcomm_base_select"; + error = "orte_rmcast_base_open"; goto error; } - /* Now provide a chance for the PLM - * to perform any module-specific init functions. This - * needs to occur AFTER the communications are setup - * as it may involve starting a non-blocking recv - */ - if (ORTE_SUCCESS != (ret = orte_plm.init())) { + if (ORTE_SUCCESS != (ret = orte_rmcast_base_select())) { ORTE_ERROR_LOG(ret); - error = "orte_plm_init"; + error = "orte_rmcast_base_select"; goto error; } - if (ORTE_SUCCESS != (ret = orte_ras_base_open())) { + /* get a name for ourselves */ + if (ORTE_SUCCESS != (ret = cm_set_name())) { + error = "set_name"; + goto error; + } + + /* get the list of nodes used for this job */ + nodelist = getenv("OMPI_MCA_orte_nodelist"); + + if (NULL != nodelist) { + /* split the node list into an argv array */ + hosts = opal_argv_split(nodelist, ','); + } + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); - error = "orte_ras_base_open"; + error = "orte_ess_base_orted_setup"; goto error; } + opal_argv_free(hosts); + return ORTE_SUCCESS; - if (ORTE_SUCCESS != (ret = orte_ras_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_ras_base_find_available"; - goto error; - } - - if (ORTE_SUCCESS != (ret = orte_rmaps_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_rmaps_base_open"; - goto error; - } - - if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_rmaps_base_find_available"; - goto error; - } - - if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { - error = "orte_errmgr_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_errmgr_base_select"; - goto error; - } - - /* Open/select the odls */ - if (ORTE_SUCCESS != (ret = orte_odls_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_odls_base_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_odls_base_select"; - goto error; - } - - /* enable communication with the rml */ - if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { - ORTE_ERROR_LOG(ret); - error = "orte_rml.enable_comm"; - goto error; - } - - /* setup the global job and node arrays */ - orte_job_data = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, - 1, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - 1))) { - ORTE_ERROR_LOG(ret); - error = "setup job array"; - goto error; - } - - orte_node_pool = OBJ_NEW(opal_pointer_array_t); - if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { - ORTE_ERROR_LOG(ret); - error = "setup node array"; - goto error; - } - - /* Setup the job data object for the daemons */ - /* create and store the job data object */ - jdata = OBJ_NEW(orte_job_t); - jdata->jobid = ORTE_PROC_MY_NAME->jobid; - opal_pointer_array_set_item(orte_job_data, 0, jdata); - - /* create and store a node object where we are */ - node = OBJ_NEW(orte_node_t); - node->name = strdup(orte_process_info.nodename); - node->index = opal_pointer_array_add(orte_node_pool, node); - - /* create and store a proc object for us */ - proc = OBJ_NEW(orte_proc_t); - proc->name.jobid = ORTE_PROC_MY_NAME->jobid; - proc->name.vpid = ORTE_PROC_MY_NAME->vpid; - proc->pid = orte_process_info.pid; - proc->rml_uri = orte_rml.get_contact_info(); - proc->state = ORTE_PROC_STATE_RUNNING; - OBJ_RETAIN(node); /* keep accounting straight */ - proc->node = node; - proc->nodename = node->name; - opal_pointer_array_add(jdata->procs, proc); - - /* record that the daemon (i.e., us) is on this node - * NOTE: we do not add the proc object to the node's - * proc array because we are not an application proc. - * Instead, we record it in the daemon field of the - * node object - */ - OBJ_RETAIN(proc); /* keep accounting straight */ - node->daemon = proc; - node->daemon_launched = true; - node->state = ORTE_NODE_STATE_UP; - - /* record that the daemon job is running */ - jdata->num_procs = 1; - jdata->state = ORTE_JOB_STATE_RUNNING; - - /* setup the routed info - the selected routed component - * will know what to do. - */ - if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { - ORTE_ERROR_LOG(ret); - error = "orte_routed.init_routes"; - goto error; - } - - /* setup the notifier system */ - if (ORTE_SUCCESS != (ret = orte_notifier_base_open())) { - ORTE_ERROR_LOG(ret); - error = "orte_notifer_open"; - goto error; - } - if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) { - ORTE_ERROR_LOG(ret); - error = "orte_notifer_select"; - goto error; - } - - return ORTE_SUCCESS; - error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", @@ -398,62 +136,16 @@ error: static int rte_finalize(void) { - opal_list_item_t *item; - orte_node_t *node; - orte_job_t *job; - int i; + int ret; - orte_notifier_base_close(); - - orte_odls_base_close(); - - orte_wait_finalize(); - - /* finalize selected modules so they can de-register - * any receives - */ - orte_ras_base_close(); - orte_rmaps_base_close(); - orte_plm_base_close(); - orte_errmgr_base_close(); - - /* now can close the rml and its friendly group comm */ - orte_grpcomm_base_close(); - orte_routed_base_close(); - orte_rml_base_close(); - - /* cleanup the global list of local children and job data */ - while (NULL != (item = opal_list_remove_first(&orte_local_children))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&orte_local_children); - while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) { - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&orte_local_jobdata); - - /* cleanup the job and node info arrays */ - if (NULL != orte_node_pool) { - for (i=0; i < orte_node_pool->size; i++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,i))) { - OBJ_RELEASE(node); - } - } - OBJ_RELEASE(orte_node_pool); - } - if (NULL != orte_job_data) { - for (i=0; i < orte_job_data->size; i++) { - if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data,i))) { - OBJ_RELEASE(job); - } - } - OBJ_RELEASE(orte_job_data); + if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { + ORTE_ERROR_LOG(ret); } - /* clean out the global structures */ - orte_proc_info_finalize(); + /* deconstruct the nidmap and jobmap arrays */ + orte_util_nidmap_finalize(); - return ORTE_SUCCESS; + return ret; } /* @@ -485,143 +177,248 @@ static void rte_abort(int status, bool report) exit(status); } -static int cm_set_name(void) +static uint8_t proc_get_locality(orte_process_name_t *proc) { - char *jobid_str, *procid_str; - int id, rc; - orte_jobid_t jobid; - orte_vpid_t vpid; + orte_nid_t *nid; - id = mca_base_param_register_string("orte", "ess", "jobid", NULL, NULL); - mca_base_param_lookup_string(id, &jobid_str); - if (NULL == jobid_str) { + if (NULL == (nid = orte_util_lookup_nid(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, jobid_str))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - free(jobid_str); - - id = mca_base_param_register_string("orte", "ess", "vpid", NULL, NULL); - mca_base_param_lookup_string(id, &procid_str); - if (NULL == procid_str) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, procid_str))) { - ORTE_ERROR_LOG(rc); - return(rc); - } - free(procid_str); - - ORTE_PROC_MY_NAME->jobid = jobid; - ORTE_PROC_MY_NAME->vpid = vpid; - - OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, - "ess:cm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* get the non-name common environmental variables */ - if (ORTE_SUCCESS != (rc = orte_ess_env_get())) { - ORTE_ERROR_LOG(rc); - return rc; + return OPAL_PROC_NON_LOCAL; } - return ORTE_SUCCESS; -} - -static orte_proc_t* find_proc(orte_process_name_t *proc) -{ - orte_job_t *jdata; - - if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { - return NULL; + if (nid->daemon == ORTE_PROC_MY_DAEMON->vpid) { + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:cm: proc %s on LOCAL NODE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + return (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER); } - return (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, (int)proc->vpid); + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:cm: proc %s is REMOTE", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + return OPAL_PROC_NON_LOCAL; + } static orte_vpid_t proc_get_daemon(orte_process_name_t *proc) { - orte_proc_t *pdata; + orte_nid_t *nid; if( ORTE_JOBID_IS_DAEMON(proc->jobid) ) { return proc->vpid; } - - /* get the job data */ - if (NULL == (pdata = find_proc(proc))) { + + if (NULL == (nid = orte_util_lookup_nid(proc))) { return ORTE_VPID_INVALID; } OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:hnp: proc %s is hosted by daemon %s", + "%s ess:cm: proc %s is hosted by daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), - ORTE_VPID_PRINT(pdata->node->daemon->name.vpid))); + ORTE_VPID_PRINT(nid->daemon))); - return pdata->node->daemon->name.vpid; + return nid->daemon; +} + +static char* proc_get_hostname(orte_process_name_t *proc) +{ + orte_nid_t *nid; + + if (NULL == (nid = orte_util_lookup_nid(proc))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return NULL; + } + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:cm: proc %s is on host %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + nid->name)); + + return nid->name; } static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc) { - orte_proc_t *pdata; + orte_pmap_t *pmap; - if (NULL == (pdata = find_proc(proc))) { + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_LOCAL_RANK_INVALID; - } + } OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:hnp: proc %s has local rank %d", + "%s ess:cm: proc %s has local rank %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), - (int)pdata->local_rank)); + (int)pmap->local_rank)); - return pdata->local_rank; + return pmap->local_rank; } static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc) { - orte_proc_t *pdata; + orte_pmap_t *pmap; - if (NULL == (pdata = find_proc(proc))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_NODE_RANK_INVALID; + /* is this me? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid && + proc->vpid == ORTE_PROC_MY_NAME->vpid) { + /* yes it is - since I am a daemon, it can only + * be zero + */ + return 0; } + if (NULL == (pmap = orte_util_lookup_pmap(proc))) { + return ORTE_NODE_RANK_INVALID; + } + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:hnp: proc %s has node rank %d", + "%s ess:cm: proc %s has node rank %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), - (int)pdata->node_rank)); + (int)pmap->node_rank)); - return pdata->node_rank; + return pmap->node_rank; } static int update_pidmap(opal_byte_object_t *bo) { - /* there is nothing to do here - the HNP can resolve - * all requests directly from its internal data. However, - * we do need to free the data in the byte object to - * be consistent with other modules - */ - if (NULL != bo && NULL != bo->bytes) { - free(bo->bytes); + int ret; + + OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, + "%s ess:cm: updating pidmap", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* build the pmap */ + if (ORTE_SUCCESS != (ret = orte_util_decode_pidmap(bo))) { + ORTE_ERROR_LOG(ret); } - return ORTE_SUCCESS; + + return ret; } static int update_nidmap(opal_byte_object_t *bo) { - /* there is nothing to do here - the HNP can resolve - * all requests directly from its internal data. However, - * we do need to free the data in the byte object to - * be consistent with other modules - */ - if (NULL != bo && NULL != bo->bytes) { - free(bo->bytes); - } - return ORTE_SUCCESS; + int rc; + /* decode the nidmap - the util will know what to do */ + if (ORTE_SUCCESS != (rc = orte_util_decode_nodemap(bo))) { + ORTE_ERROR_LOG(rc); + } + return rc; } + +/* support for setting name */ +static bool arrived = false; +static bool name_success = false; + +static void cbfunc(int channel, opal_buffer_t *buf, void *cbdata) +{ + int32_t n; + orte_process_name_t name, *nmptr; + int rc; + + /* ensure we default to failure */ + name_success = false; + + /* unpack the response */ + nmptr = &name; + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &nmptr, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + goto depart; + } + /* setup name */ + ORTE_PROC_MY_NAME->jobid = name.jobid; + ORTE_PROC_MY_NAME->vpid = name.vpid; + name_success = true; + +depart: + arrived = true; +} + +static int cm_set_name(void) +{ + int i, rc; + struct sockaddr_in if_addr; + char *ifnames[] = { + "ce", + "eth0", + "eth1", + NULL + }; + int32_t net, rack, slot, function; + int32_t addr; + opal_buffer_t buf; + orte_daemon_cmd_flag_t cmd; + + /* try constructing the name from the IP address - first, + * find an appropriate interface + */ + for (i=0; NULL != ifnames[i]; i++) { + if (ORTE_SUCCESS != (rc = opal_ifnametoaddr(ifnames[i], + (struct sockaddr*)&if_addr, + sizeof(struct sockaddr_in)))) { + continue; + } + addr = htonl(if_addr.sin_addr.s_addr); + opal_output(0, "IP address: %d.%d.%d.%d", OPAL_IF_FORMAT_ADDR(addr)); + + /* break address into sections */ + net = 0x000000FF & ((0xFF000000 & addr) >> 24); + rack = 0x000000FF & ((0x00FF0000 & addr) >> 16); + slot = 0x000000FF & ((0x0000FF00 & addr) >> 8); + function = 0x000000FF & addr; + + /* is this an appropriate interface to use */ + if (10 == net) { + /* set our vpid - add 1 to ensure it cannot be zero */ + ORTE_PROC_MY_NAME->vpid = (rack * mca_ess_cm_component.max_slots) + slot + function + 1; + /* set our jobid to 0 */ + ORTE_PROC_MY_NAME->jobid = 0; + return ORTE_SUCCESS; + } else if (192 == net && 168 == rack) { + /* just use function */ + ORTE_PROC_MY_NAME->vpid = function + 1; + /* set our jobid to 0 */ + ORTE_PROC_MY_NAME->jobid = 0; + return ORTE_SUCCESS; + } + } + + /* if we get here, then we didn't find a usable interface. + * use the reliable multicast system to contact the HNP and + * get a name + */ + OBJ_CONSTRUCT(&buf, opal_buffer_t); + cmd = ORTE_DAEMON_NAME_REQ_CMD; + opal_dss.pack(&buf, &cmd, 1, ORTE_DAEMON_CMD_T); + + /* set the recv to get the answer */ + if (ORTE_SUCCESS != (rc = orte_rmcast.recv_nb(ORTE_RMCAST_SYS_ADDR, cbfunc, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buf); + return rc; + } + /* send the request */ + if (ORTE_SUCCESS != (rc = orte_rmcast.send(ORTE_RMCAST_SYS_ADDR, &buf))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&buf); + return rc; + } + OBJ_DESTRUCT(&buf); + + /* wait for response */ + ORTE_PROGRESSED_WAIT(arrived, 0, 1); + + /* if we got a valid name, return success */ + if (name_success) { + return ORTE_SUCCESS; + } + return ORTE_ERR_NOT_FOUND; +} + diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index ed59881ced..34f7e774a5 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -72,6 +72,9 @@ typedef uint8_t orte_daemon_cmd_flag_t; /* request proc resource usage */ #define ORTE_DAEMON_TOP_CMD (orte_daemon_cmd_flag_t) 22 +/* bootstrap */ +#define ORTE_DAEMON_NAME_REQ_CMD (orte_daemon_cmd_flag_t) 23 +#define ORTE_DAEMON_CHECKIN_CMD (orte_daemon_cmd_flag_t) 24 /* * List object to locally store the process names and pids of diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index d6be0c6caa..18a0975c8f 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -204,6 +204,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = { &orted_launch_cmd, OPAL_CMD_LINE_TYPE_STRING, "A regular expression describing the job to be launched at startup" }, + { "orte", "daemon", "bootstrap", '\0', "bootstrap", "bootstrap", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bootstrap the connection to the HNP" }, + /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 9b51f19485..a689e2143e 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -56,6 +56,7 @@ bool orte_debug_daemons_file_flag = false; bool orte_leave_session_attached; bool orte_do_not_launch = false; bool orted_spin_flag = false; +bool orte_daemon_bootstrap = false; /* ORTE OOB port flags */ bool orte_static_ports = false; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 42f69194bc..23c049834d 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -513,6 +513,7 @@ ORTE_DECLSPEC extern bool orte_debug_daemons_file_flag; ORTE_DECLSPEC extern bool orte_leave_session_attached; ORTE_DECLSPEC extern bool orte_do_not_launch; ORTE_DECLSPEC extern bool orted_spin_flag; +ORTE_DECLSPEC extern bool orte_daemon_bootstrap; /* ORTE OOB port flags */ ORTE_DECLSPEC extern bool orte_static_ports; diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index f2c21596d9..5eb4a5fc70 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -37,12 +37,22 @@ #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" +static bool passed_thru = false; + int orte_register_params(void) { int value, tmp; char *strval, **params; uint16_t binding; + /* only go thru this once - mpirun calls it twice, which causes + * any error messages to show up twice + */ + if (passed_thru) { + return ORTE_SUCCESS; + } + passed_thru = true; + mca_base_param_reg_int_name("orte", "base_help_aggregate", "If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.", false, false, @@ -89,6 +99,11 @@ int orte_register_params(void) orte_debug_daemons_flag = true; } + mca_base_param_reg_int_name("orte", "daemon_bootstrap", + "Bootstrap the connection to the HNP", + false, false, (int)false, &value); + orte_daemon_bootstrap = OPAL_INT_TO_BOOL(value); + /* do we want session output left open? */ mca_base_param_reg_int_name("orte", "leave_session_attached", "Whether applications and/or daemons should leave their sessions "