rmgr: add connect/disconnect entry points and move the framework to v2.0.0

 * spawn_job gains two parameters (num_connect, connect) placed just before
   the callback; the three call sites touched here pass 0 / NULL for them.
 * The module struct gains connect and disconnect function pointers.  The
   base and cnos implementations added here are stubs that return
   ORTE_ERR_NOT_SUPPORTED; the proxy and urm modules reuse the base stubs.
 * Module/component struct names and the version macro are renamed from
   1_3_0 to 2_0_0.

diff --git a/ompi/communicator/comm_dyn.c b/ompi/communicator/comm_dyn.c
index 30beec0cac..72d63f5842 100644
--- a/ompi/communicator/comm_dyn.c
+++ b/ompi/communicator/comm_dyn.c
@@ -533,7 +533,7 @@ ompi_comm_start_processes(int count, char **array_of_commands,
     if (NULL != base_prefix) free(base_prefix);
 
     /* spawn procs */
-    if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(apps, count, &new_jobid, NULL, ORTE_PROC_STATE_NONE))) {
+    if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(apps, count, &new_jobid, 0, NULL, NULL, ORTE_PROC_STATE_NONE))) {
         ORTE_ERROR_LOG(rc);
         opal_progress_event_decrement();
         return MPI_ERR_SPAWN;
diff --git a/orte/mca/rmgr/base/Makefile.am b/orte/mca/rmgr/base/Makefile.am
index 640171d733..e09f5b2e22 100644
--- a/orte/mca/rmgr/base/Makefile.am
+++ b/orte/mca/rmgr/base/Makefile.am
@@ -23,6 +23,7 @@ headers += \
 libmca_rmgr_la_SOURCES += \
         base/rmgr_base_check_context.c \
         base/rmgr_base_context.c \
+        base/rmgr_base_con_discon.c \
         base/rmgr_base_close.c \
         base/rmgr_base_open.c \
         base/rmgr_base_receive.c \
diff --git a/orte/mca/rmgr/base/rmgr_base_con_discon.c b/orte/mca/rmgr/base/rmgr_base_con_discon.c
new file mode 100644
index 0000000000..55bce7bf25
--- /dev/null
+++ b/orte/mca/rmgr/base/rmgr_base_con_discon.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/** @file:
+ *
+ * Support functions for the RMGR subsystem
+ */
+
+#include "orte_config.h"
+#include "orte/orte_constants.h"
+
+#include <string.h>
+
+#include "opal/util/output.h"
+#include "opal/class/opal_list.h"
+
+#include "orte/dss/dss.h"
+#include "orte/mca/schema/schema.h"
+#include "orte/mca/gpr/gpr.h"
+#include "orte/mca/errmgr/errmgr.h"
+
+#include "orte/mca/rmgr/base/rmgr_private.h"
+
+/*
+ * Base connect entry point: performs no work and always returns
+ * ORTE_ERR_NOT_SUPPORTED.  Both arguments are ignored.
+ */
+int orte_rmgr_base_connect(orte_std_cntr_t num_connect,
+                           orte_process_name_t *connect)
+{
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
+/*
+ * Base disconnect entry point: performs no work and always returns
+ * ORTE_ERR_NOT_SUPPORTED.  Both arguments are ignored.
+ */
+int orte_rmgr_base_disconnect(orte_std_cntr_t num_disconnect,
+                              orte_process_name_t *disconnect)
+{
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
diff --git a/orte/mca/rmgr/base/rmgr_base_open.c b/orte/mca/rmgr/base/rmgr_base_open.c
index 623d55618b..03ae7ef2d5 100644
--- a/orte/mca/rmgr/base/rmgr_base_open.c
+++ b/orte/mca/rmgr/base/rmgr_base_open.c
@@ -53,6 +53,8 @@ orte_rmgr_base_module_t orte_rmgr = {
     NULL,
     orte_rmgr_base_create_not_available,
     orte_rmgr_base_spawn_not_available,
+    orte_rmgr_base_connect,
+    orte_rmgr_base_disconnect,
     orte_rmgr_base_finalize_not_available,
     /** SUPPORT FUNCTIONS ***/
     orte_rmgr_base_get_app_context,
diff --git a/orte/mca/rmgr/base/rmgr_base_receive.c b/orte/mca/rmgr/base/rmgr_base_receive.c
index b12e033980..5a79372e18 100644
--- a/orte/mca/rmgr/base/rmgr_base_receive.c
+++ b/orte/mca/rmgr/base/rmgr_base_receive.c
@@ -178,7 +178,8 @@ void orte_rmgr_base_recv(int status, orte_process_name_t* sender,
             }
 
             /* process the request */
-            if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(context, num_context, &job, NULL, ORTE_PROC_STATE_NONE))) {
+            if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(context, num_context, &job,
+                                                          0, NULL, NULL, ORTE_PROC_STATE_NONE))) {
                 ORTE_ERROR_LOG(rc);
                 goto SEND_ANSWER;
             }
diff --git a/orte/mca/rmgr/base/rmgr_base_stubs.c b/orte/mca/rmgr/base/rmgr_base_stubs.c
index 62412ebf02..2f3971f351 100644
--- a/orte/mca/rmgr/base/rmgr_base_stubs.c
+++ b/orte/mca/rmgr/base/rmgr_base_stubs.c
@@ -43,6 +43,8 @@ orte_rmgr_base_spawn_not_available(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions)
 {
diff --git a/orte/mca/rmgr/base/rmgr_private.h b/orte/mca/rmgr/base/rmgr_private.h
index 35eb7ac274..52a15cea78 100644
--- a/orte/mca/rmgr/base/rmgr_private.h
+++ b/orte/mca/rmgr/base/rmgr_private.h
@@ -83,12 +83,22 @@ int orte_rmgr_base_create_not_available(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid);
+
 int orte_rmgr_base_spawn_not_available(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions);
+
+int orte_rmgr_base_connect(orte_std_cntr_t num_connect,
+                           orte_process_name_t *connect);
+
+int orte_rmgr_base_disconnect(orte_std_cntr_t num_disconnect,
+                              orte_process_name_t *disconnect);
+
 int orte_rmgr_base_finalize_not_available(void);
 
 /*
diff --git a/orte/mca/rmgr/cnos/rmgr_cnos.c b/orte/mca/rmgr/cnos/rmgr_cnos.c
index 4ac93989f7..cef9b3f2c4 100644
--- a/orte/mca/rmgr/cnos/rmgr_cnos.c
+++ b/orte/mca/rmgr/cnos/rmgr_cnos.c
@@ -46,9 +46,17 @@ static int orte_rmgr_cnos_spawn_job(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions);
 
+static int orte_rmgr_cnos_connect(orte_std_cntr_t num_connect,
+                                  orte_process_name_t *connect);
+
+static int orte_rmgr_cnos_disconnect(orte_std_cntr_t num_disconnect,
+                                     orte_process_name_t *disconnect);
+
 static int orte_rmgr_cnos_finalize(void);
 
 static int orte_rmgr_cnos_get_app_context(orte_jobid_t jobid,
@@ -76,6 +84,8 @@ orte_rmgr_base_module_t orte_rmgr_cnos_module = {
     NULL, /* don't need special init */
     orte_rmgr_cnos_setup_job,
     orte_rmgr_cnos_spawn_job,
+    orte_rmgr_cnos_connect,
+    orte_rmgr_cnos_disconnect,
     orte_rmgr_cnos_finalize,
     /** SUPPORT FUNCTIONS ***/
     orte_rmgr_cnos_get_app_context,
@@ -109,6 +119,18 @@ static int orte_rmgr_cnos_spawn_job(
     return ORTE_ERR_NOT_SUPPORTED;
 }
 
+static int orte_rmgr_cnos_connect(orte_std_cntr_t num_connect,
+                                  orte_process_name_t *connect)
+{
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
+static int orte_rmgr_cnos_disconnect(orte_std_cntr_t num_disconnect,
+                                     orte_process_name_t *disconnect)
+{
+    return ORTE_ERR_NOT_SUPPORTED;
+}
+
 
 static int orte_rmgr_cnos_finalize(void)
 {
diff --git a/orte/mca/rmgr/cnos/rmgr_cnos_component.c b/orte/mca/rmgr/cnos/rmgr_cnos_component.c
index 89d2b3092a..502d6c1448 100644
--- a/orte/mca/rmgr/cnos/rmgr_cnos_component.c
+++ b/orte/mca/rmgr/cnos/rmgr_cnos_component.c
@@ -45,10 +45,10 @@ orte_rmgr_base_component_t mca_rmgr_cnos_component = {
        information about the component itself */
 
     {
-        /* Indicate that we are a rmgr v1.3.0 component (which also
+        /* Indicate that we are a rmgr v2.0.0 component (which also
            implies a specific MCA version) */
 
-        ORTE_RMGR_BASE_VERSION_1_3_0,
+        ORTE_RMGR_BASE_VERSION_2_0_0,
 
         "cnos", /* MCA component name */
         ORTE_MAJOR_VERSION, /* MCA component major version */
diff --git a/orte/mca/rmgr/proxy/rmgr_proxy.c b/orte/mca/rmgr/proxy/rmgr_proxy.c
index 722de66c29..cb3a92e43a 100644
--- a/orte/mca/rmgr/proxy/rmgr_proxy.c
+++ b/orte/mca/rmgr/proxy/rmgr_proxy.c
@@ -48,6 +48,8 @@ static int orte_rmgr_proxy_spawn_job(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions);
 
@@ -55,6 +57,8 @@ orte_rmgr_base_module_t orte_rmgr_proxy_module = {
     NULL, /* don't need special init */
     orte_rmgr_proxy_setup_job,
     orte_rmgr_proxy_spawn_job,
+    orte_rmgr_base_connect,
+    orte_rmgr_base_disconnect,
     NULL, /* finalize */
     /** SUPPORT FUNCTIONS ***/
     orte_rmgr_base_get_app_context,
@@ -269,6 +273,8 @@ static int orte_rmgr_proxy_spawn_job(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfunc,
     orte_proc_state_t cb_conditions)
 {
diff --git a/orte/mca/rmgr/proxy/rmgr_proxy_component.c b/orte/mca/rmgr/proxy/rmgr_proxy_component.c
index 23717e384f..176cc668a0 100644
--- a/orte/mca/rmgr/proxy/rmgr_proxy_component.c
+++ b/orte/mca/rmgr/proxy/rmgr_proxy_component.c
@@ -41,10 +41,10 @@ orte_rmgr_proxy_component_t mca_rmgr_proxy_component = {
        information about the component itself */
 
     {
-        /* Indicate that we are a iof v1.0.0 component (which also
+        /* Indicate that we are a rmgr v2.0.0 component (which also
            implies a specific MCA version) */
 
-        ORTE_RMGR_BASE_VERSION_1_3_0,
+        ORTE_RMGR_BASE_VERSION_2_0_0,
 
         "proxy", /* MCA component name */
         ORTE_MAJOR_VERSION, /* MCA component major version */
diff --git a/orte/mca/rmgr/rmgr.h b/orte/mca/rmgr/rmgr.h
index 4b944a7284..cd22552584 100644
--- a/orte/mca/rmgr/rmgr.h
+++ b/orte/mca/rmgr/rmgr.h
@@ -93,10 +93,33 @@ typedef int (*orte_rmgr_base_module_spawn_job_fn_t)(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t *jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions);
 
 
+/**
+ * Connect a process to other processes, possibly in other jobs. Note that this
+ * function supports WILDCARD process name fields. Thus, a process can request
+ * connection to all other processes in another job by providing a single
+ * entry in the connect array that has a cellid of ORTE_CELLID_WILDCARD, the
+ * desired jobid, and a vpid of ORTE_VPID_WILDCARD.
+ */
+typedef int (*orte_rmgr_base_module_connect_fn_t)(orte_std_cntr_t num_connect,
+                                                  orte_process_name_t *connect);
+
+/**
+ * Disconnect a process from one or more other processes. Note that this
+ * function supports WILDCARD process name fields. Thus, a process can request
+ * to disconnect from all other processes in another job by providing a single
+ * entry in the disconnect array that has a cellid of ORTE_CELLID_WILDCARD, the
+ * desired jobid, and a vpid of ORTE_VPID_WILDCARD.
+ */
+typedef int (*orte_rmgr_base_module_disconnect_fn_t)(orte_std_cntr_t num_disconnect,
+                                                     orte_process_name_t *disconnect);
+
+
 /**
  * Allow module-specific init.
  */
@@ -156,12 +179,14 @@ typedef int (*orte_rmgr_base_module_get_vpid_range_fn_t)(orte_jobid_t jobid,
 
 
 /*
- * Ver 1.3.0
+ * Ver 2.0
  */
-struct orte_rmgr_base_module_1_3_0_t {
+struct orte_rmgr_base_module_2_0_0_t {
     orte_rmgr_base_module_init_fn_t module_init;
     orte_rmgr_base_module_setup_job_fn_t setup_job;
     orte_rmgr_base_module_spawn_job_fn_t spawn_job;
+    orte_rmgr_base_module_connect_fn_t connect;
+    orte_rmgr_base_module_disconnect_fn_t disconnect;
     orte_rmgr_base_module_finalize_fn_t finalize;
     /** SUPPORT FUNCTIONS ***/
     orte_rmgr_base_module_get_app_context_fn_t get_app_context;
@@ -172,8 +197,8 @@ struct orte_rmgr_base_module_1_3_0_t {
     orte_rmgr_base_module_get_vpid_range_fn_t get_vpid_range;
 };
 
-typedef struct orte_rmgr_base_module_1_3_0_t orte_rmgr_base_module_1_3_0_t;
-typedef orte_rmgr_base_module_1_3_0_t orte_rmgr_base_module_t;
+typedef struct orte_rmgr_base_module_2_0_0_t orte_rmgr_base_module_2_0_0_t;
+typedef orte_rmgr_base_module_2_0_0_t orte_rmgr_base_module_t;
 
 /*
  * RMGR Component
@@ -187,24 +212,24 @@ typedef orte_rmgr_base_module_t* (*orte_rmgr_base_component_init_fn_t)(
  * the standard component data structure
  */
 
-struct orte_rmgr_base_component_1_3_0_t {
+struct orte_rmgr_base_component_2_0_0_t {
     mca_base_component_t rmgr_version;
     mca_base_component_data_1_0_0_t rmgr_data;
     orte_rmgr_base_component_init_fn_t rmgr_init;
 };
-typedef struct orte_rmgr_base_component_1_3_0_t orte_rmgr_base_component_1_3_0_t;
-typedef orte_rmgr_base_component_1_3_0_t orte_rmgr_base_component_t;
+typedef struct orte_rmgr_base_component_2_0_0_t orte_rmgr_base_component_2_0_0_t;
+typedef orte_rmgr_base_component_2_0_0_t orte_rmgr_base_component_t;
 
 
 
 /**
- * Macro for use in components that are of type rmgr v1.0.0
+ * Macro for use in components that are of type rmgr v2.0.0
  */
-#define ORTE_RMGR_BASE_VERSION_1_3_0 \
-  /* rmgr v1.0 is chained to MCA v1.0 */ \
+#define ORTE_RMGR_BASE_VERSION_2_0_0 \
+  /* rmgr v2.0 is chained to MCA v1.0 */ \
   MCA_BASE_VERSION_1_0_0, \
-  /* rmgr v1.3 */ \
-  "rmgr", 1, 3, 0
+  /* rmgr v2.0 */ \
+  "rmgr", 2, 0, 0
 
 /**
  * Global structure for accessing RAS functions
diff --git a/orte/mca/rmgr/urm/rmgr_urm.c b/orte/mca/rmgr/urm/rmgr_urm.c
index 76537ac472..96a0253810 100644
--- a/orte/mca/rmgr/urm/rmgr_urm.c
+++ b/orte/mca/rmgr/urm/rmgr_urm.c
@@ -54,6 +54,8 @@ static int orte_rmgr_urm_spawn_job(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfn,
     orte_proc_state_t cb_conditions);
 
@@ -66,6 +68,8 @@ orte_rmgr_base_module_t orte_rmgr_urm_module = {
     orte_rmgr_urm_module_init,
     orte_rmgr_urm_setup_job,
     orte_rmgr_urm_spawn_job,
+    orte_rmgr_base_connect,
+    orte_rmgr_base_disconnect,
     orte_rmgr_urm_module_finalize,
     /** SUPPORT FUNCTIONS ***/
     orte_rmgr_base_get_app_context,
@@ -264,6 +268,8 @@ static int orte_rmgr_urm_spawn_job(
     orte_app_context_t** app_context,
     orte_std_cntr_t num_context,
     orte_jobid_t* jobid,
+    orte_std_cntr_t num_connect,
+    orte_process_name_t *connect,
     orte_rmgr_cb_fn_t cbfunc,
     orte_proc_state_t cb_conditions)
 {
diff --git a/orte/mca/rmgr/urm/rmgr_urm_component.c b/orte/mca/rmgr/urm/rmgr_urm_component.c
index adbb83e8ec..8be72732a1 100644
--- a/orte/mca/rmgr/urm/rmgr_urm_component.c
+++ b/orte/mca/rmgr/urm/rmgr_urm_component.c
@@ -49,10 +49,10 @@ orte_rmgr_urm_component_t mca_rmgr_urm_component = {
        information about the component itself */
 
     {
-        /* Indicate that we are a rmgr v1.3.0 component (which also
+        /* Indicate that we are a rmgr v2.0.0 component (which also
            implies a specific MCA version) */
 
-        ORTE_RMGR_BASE_VERSION_1_3_0,
+        ORTE_RMGR_BASE_VERSION_2_0_0,
 
         "urm", /* MCA component name */
         ORTE_MAJOR_VERSION, /* MCA component major version */
diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c
index 1323bb007c..931b20a9d1 100644
--- a/orte/tools/orterun/orterun.c
+++ b/orte/tools/orterun/orterun.c
@@ -450,7 +450,7 @@ int orterun(int argc, char *argv[])
 
     /* Spawn the job */
     cb_states = ORTE_PROC_STATE_TERMINATED | ORTE_PROC_STATE_AT_STG1;
-    rc = orte_rmgr.spawn_job(apps, num_apps, &jobid, job_state_callback, cb_states);
+    rc = orte_rmgr.spawn_job(apps, num_apps, &jobid, 0, NULL, job_state_callback, cb_states);
     if (ORTE_SUCCESS != rc) {
         /* JMS show_help */
         opal_output(0, "%s: spawn failed with errno=%d\n", orterun_basename, rc);