diff --git a/src/mca/gpr/proxy/gpr_proxy_compound_cmd.c b/src/mca/gpr/proxy/gpr_proxy_compound_cmd.c index 874755a65b..c568bc2935 100644 --- a/src/mca/gpr/proxy/gpr_proxy_compound_cmd.c +++ b/src/mca/gpr/proxy/gpr_proxy_compound_cmd.c @@ -30,6 +30,7 @@ #include "util/proc_info.h" #include "mca/ns/ns_types.h" +#include "mca/errmgr/errmgr.h" #include "mca/oob/oob_types.h" #include "mca/rml/rml.h" @@ -58,12 +59,14 @@ int orte_gpr_proxy_begin_compound_cmd(void) orte_gpr_proxy_globals.compound_cmd = OBJ_NEW(orte_buffer_t); if (NULL == orte_gpr_proxy_globals.compound_cmd) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); orte_gpr_proxy_globals.compound_cmd_mode = false; return ORTE_ERR_OUT_OF_RESOURCE; } if (ORTE_SUCCESS != (rc = orte_dps.pack(orte_gpr_proxy_globals.compound_cmd, &command, 1, ORTE_GPR_CMD))) { + ORTE_ERROR_LOG(rc); orte_gpr_proxy_globals.compound_cmd_mode = false; OBJ_RELEASE(orte_gpr_proxy_globals.compound_cmd); return rc; @@ -97,8 +100,7 @@ int orte_gpr_proxy_exec_compound_cmd(void) orte_buffer_t *answer; orte_gpr_cmd_flag_t command; size_t n; - int rc; - int32_t response; + int rc, response; if (orte_gpr_proxy_globals.debug) { ompi_output(0, "[%d,%d,%d] transmitting compound command", @@ -109,17 +111,20 @@ int orte_gpr_proxy_exec_compound_cmd(void) rc = ORTE_SUCCESS; if (0 > orte_rml.send_buffer(orte_process_info.gpr_replica, orte_gpr_proxy_globals.compound_cmd, ORTE_RML_TAG_GPR, 0)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); rc = ORTE_ERR_COMM_FAILURE; goto CLEANUP; } answer = OBJ_NEW(orte_buffer_t); if (NULL == answer) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; goto CLEANUP; } if (0 > orte_rml.recv_buffer(orte_process_info.gpr_replica, answer, ORTE_RML_TAG_GPR)) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); rc = ORTE_ERR_COMM_FAILURE; goto CLEANUP; @@ -127,18 +132,22 @@ int orte_gpr_proxy_exec_compound_cmd(void) n = 1; if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &command, &n, ORTE_GPR_CMD))) { + 
ORTE_ERROR_LOG(rc); OBJ_RELEASE(answer); goto CLEANUP; } if (ORTE_GPR_COMPOUND_CMD != command) { + ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_RELEASE(answer); rc = ORTE_ERR_COMM_FAILURE; goto CLEANUP; } n = 1; - rc = orte_dps.unpack(answer, &response, &n, ORTE_INT32); + if (ORTE_SUCCESS != (rc = orte_dps.unpack(answer, &response, &n, ORTE_INT))) { + ORTE_ERROR_LOG(rc); + } if (ORTE_SUCCESS == rc) { rc = (int)response; diff --git a/src/mca/gpr/replica/api_layer/gpr_replica_compound_cmd_api.c b/src/mca/gpr/replica/api_layer/gpr_replica_compound_cmd_api.c index bd96277206..358bff1ff0 100644 --- a/src/mca/gpr/replica/api_layer/gpr_replica_compound_cmd_api.c +++ b/src/mca/gpr/replica/api_layer/gpr_replica_compound_cmd_api.c @@ -28,6 +28,7 @@ #include "util/proc_info.h" #include "mca/ns/ns_types.h" +#include "mca/errmgr/errmgr.h" #include "mca/gpr/replica/gpr_replica.h" #include "mca/gpr/replica/communications/gpr_replica_comm.h" @@ -58,12 +59,14 @@ int orte_gpr_replica_begin_compound_cmd(void) orte_gpr_replica_globals.compound_cmd = OBJ_NEW(orte_buffer_t); if (NULL == orte_gpr_replica_globals.compound_cmd) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); orte_gpr_replica_globals.compound_cmd_mode = false; return ORTE_ERR_OUT_OF_RESOURCE; } if (ORTE_SUCCESS != (rc = orte_dps.pack(orte_gpr_replica_globals.compound_cmd, &command, 1, ORTE_GPR_CMD))) { + ORTE_ERROR_LOG(rc); orte_gpr_replica_globals.compound_cmd_mode = false; OBJ_RELEASE(orte_gpr_replica_globals.compound_cmd); return rc; @@ -104,8 +107,10 @@ int orte_gpr_replica_exec_compound_cmd(void) OMPI_THREAD_LOCK(&orte_gpr_replica_globals.wait_for_compound_mutex); - rc = orte_gpr_replica_process_command_buffer(orte_gpr_replica_globals.compound_cmd, - NULL, &answer); + if (ORTE_SUCCESS != (rc = orte_gpr_replica_process_command_buffer(orte_gpr_replica_globals.compound_cmd, + NULL, &answer))) { + ORTE_ERROR_LOG(rc); + } orte_gpr_replica_globals.compound_cmd_mode = false; if (NULL != orte_gpr_replica_globals.compound_cmd) { /* 
shouldn't be any way this could be true, but just to be safe... */ diff --git a/src/mca/gpr/replica/communications/Makefile.am b/src/mca/gpr/replica/communications/Makefile.am index 07d3450ec4..d10fd76714 100644 --- a/src/mca/gpr/replica/communications/Makefile.am +++ b/src/mca/gpr/replica/communications/Makefile.am @@ -20,6 +20,7 @@ libmca_gpr_replica_comm_la_SOURCES = \ gpr_replica_recv_proxy_msgs.c \ gpr_replica_remote_msg.c \ gpr_replica_cmd_processor.c \ + gpr_replica_compound_cmd_cm.c \ gpr_replica_cleanup_cm.c \ gpr_replica_del_index_cm.c \ gpr_replica_dump_cm.c \ diff --git a/src/mca/gpr/replica/communications/gpr_replica_cmd_processor.c b/src/mca/gpr/replica/communications/gpr_replica_cmd_processor.c index bc9fde72a3..2ab113129b 100644 --- a/src/mca/gpr/replica/communications/gpr_replica_cmd_processor.c +++ b/src/mca/gpr/replica/communications/gpr_replica_cmd_processor.c @@ -40,6 +40,7 @@ int orte_gpr_replica_process_command_buffer(orte_buffer_t *input_buffer, orte_gpr_cmd_flag_t command; int rc, ret, rc2; size_t n; + bool compound_cmd=false; *output_buffer = OBJ_NEW(orte_buffer_t); @@ -61,6 +62,8 @@ int orte_gpr_replica_process_command_buffer(orte_buffer_t *input_buffer, if (orte_gpr_replica_globals.debug) { ompi_output(0, "\tcompound cmd"); } + + compound_cmd = true; break; @@ -274,6 +277,28 @@ int orte_gpr_replica_process_command_buffer(orte_buffer_t *input_buffer, n = 1; /* unpack a single command */ } /* end while */ + /* deal with compound cmds to ensure proper return values */ + if (compound_cmd) { + OBJ_RELEASE(answer); + *output_buffer = OBJ_NEW(orte_buffer_t); + if (NULL == *output_buffer) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + command = ORTE_GPR_COMPOUND_CMD; + if (ORTE_SUCCESS != (rc = orte_dps.pack(*output_buffer, (void*)&command, 1, ORTE_GPR_CMD))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + + ret = ORTE_SUCCESS; + if (ORTE_SUCCESS != (rc = orte_dps.pack(*output_buffer, &ret, 1, 
ORTE_INT))) { + ORTE_ERROR_LOG(rc); + goto RETURN_ERROR; + } + } + return ORTE_SUCCESS; RETURN_ERROR: diff --git a/src/mca/gpr/replica/communications/gpr_replica_comm.h b/src/mca/gpr/replica/communications/gpr_replica_comm.h index 5ac69dae4f..8af59f19d9 100644 --- a/src/mca/gpr/replica/communications/gpr_replica_comm.h +++ b/src/mca/gpr/replica/communications/gpr_replica_comm.h @@ -72,6 +72,8 @@ int orte_gpr_replica_remote_notify(orte_process_name_t *recipient, int recipient /* * define the local functions for processing commands */ +int orte_gpr_replica_recv_compound_cmd(orte_buffer_t *output_buffer); + int orte_gpr_replica_recv_delete_segment_cmd(orte_buffer_t *input_buffer, orte_buffer_t *output_buffer); diff --git a/src/mca/gpr/replica/communications/gpr_replica_compound_cmd_cm.c b/src/mca/gpr/replica/communications/gpr_replica_compound_cmd_cm.c new file mode 100644 index 0000000000..cfa44aa7cd --- /dev/null +++ b/src/mca/gpr/replica/communications/gpr_replica_compound_cmd_cm.c @@ -0,0 +1,40 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** @file: + * + * The Open MPI General Purpose Registry - Replica component + * Packs the compound-command flag into an output buffer. + */ + +/* + * includes + */ +#include "orte_config.h" + +#include "dps/dps.h" +#include "mca/errmgr/errmgr.h" + +#include "gpr_replica_comm.h" +/* Pack the ORTE_GPR_COMPOUND_CMD flag into output_buffer; returns the orte_dps.pack() result, logging it via ORTE_ERROR_LOG when it is not ORTE_SUCCESS. */ +int orte_gpr_replica_recv_compound_cmd(orte_buffer_t *output_buffer) +{ + orte_gpr_cmd_flag_t command=ORTE_GPR_COMPOUND_CMD; + int rc; + + if (ORTE_SUCCESS != (rc = orte_dps.pack(output_buffer, &command, 1, ORTE_GPR_CMD))) { + ORTE_ERROR_LOG(rc); + } + return rc; +} diff --git a/src/mpi/runtime/ompi_mpi_init.c b/src/mpi/runtime/ompi_mpi_init.c index 46b02f43ac..bbfbb41a14 100644 --- a/src/mpi/runtime/ompi_mpi_init.c +++ b/src/mpi/runtime/ompi_mpi_init.c @@ -78,19 +78,41 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) int ret, param; bool allow_multi_user_threads; bool have_hidden_threads; + bool compound_cmd=false; ompi_proc_t** procs; size_t nprocs; char *error = NULL; - /* Join the run-time environment */ - if (ORTE_SUCCESS != (ret = orte_init())) { - goto error; + /* Join the run-time environment - do the things that don't hit the registry */ + if (ORTE_SUCCESS != (ret = orte_init_stage1())) { + error = "ompi_mpi_init: orte_init_stage1 failed"; + goto error; } - /* start recording the compound command that starts us up */ - /* orte_gpr.begin_compound_cmd(); - */ + /* if we are not the seed nor a singleton, AND we have not set the + * orte_debug flag, then + * start recording the compound command that starts us up. 
+ * if we are the seed or a singleton, then don't do this - the registry is + * local, so we'll just drive it directly */ + if (orte_process_info.seed || + NULL == orte_process_info.ns_replica || + orte_debug_flag) { + compound_cmd = false; + } else { + if (ORTE_SUCCESS != (ret = orte_gpr.begin_compound_cmd())) { + ORTE_ERROR_LOG(ret); + error = "ompi_mpi_init: orte_gpr.begin_compound_cmd failed"; + goto error; + } + compound_cmd = true; + } + /* Now do the things that hit the registry */ + if (ORTE_SUCCESS != (ret = orte_init_stage2())) { + ORTE_ERROR_LOG(ret); + error = "ompi_mpi_init: orte_init_stage2 failed"; + goto error; + } /* Once we've joined the RTE, see if any MCA parameters were passed to the MPI level */ @@ -242,13 +264,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) goto error; } - /* execute the compound command + /* if the compound command is operative, execute it */ -/* if (OMPI_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) { - error = "ompi_rte_init: orte_gpr.exec_compound_cmd failed"; - goto error; + if (compound_cmd) { + if (ORTE_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) { + ORTE_ERROR_LOG(ret); + error = "ompi_mpi_init: orte_gpr.exec_compound_cmd failed"; + goto error; + } } -*/ + /* FIRST BARRIER - WAIT FOR MSG FROM RMGR_PROC_STAGE_GATE_MGR TO ARRIVE */ if (ORTE_SUCCESS != (ret = orte_rml.xcast(NULL, NULL, 0, NULL, NULL))) { diff --git a/src/runtime/Makefile.am b/src/runtime/Makefile.am index 2b691e2a7d..2004edb6f6 100644 --- a/src/runtime/Makefile.am +++ b/src/runtime/Makefile.am @@ -34,6 +34,8 @@ libruntime_la_SOURCES = \ ompi_progress.c \ orte_finalize.c \ orte_init.c \ + orte_init_stage1.c \ + orte_init_stage2.c \ orte_monitor.c \ orte_universe_exists.c \ orte_restart.c \ diff --git a/src/runtime/orte_init.c b/src/runtime/orte_init.c index d45dde76c8..e441d6d38b 100644 --- a/src/runtime/orte_init.c +++ b/src/runtime/orte_init.c @@ -16,406 +16,35 @@ #include "orte_config.h" -#include -#include - 
-#include "include/constants.h" -#include "event/event.h" -#include "util/output.h" -#include "threads/mutex.h" -#include "dps/dps.h" -#include "mca/mca.h" -#include "mca/base/base.h" -#include "mca/base/mca_base_param.h" -#include "mca/rml/base/base.h" -#include "mca/errmgr/base/base.h" -#include "mca/iof/base/base.h" -#include "mca/ns/base/base.h" -#include "mca/gpr/base/base.h" -#include "mca/rmgr/base/base.h" -#include "mca/rmaps/base/base.h" -#include "mca/soh/base/base.h" -#include "util/univ_info.h" -#include "util/proc_info.h" -#include "util/session_dir.h" -#include "util/sys_info.h" -#include "util/cmd_line.h" +#include "include/orte_constants.h" +#include "mca/errmgr/errmgr.h" #include "runtime/runtime.h" -#include "runtime/runtime_internal.h" -#include "runtime/orte_wait.h" /** * Initialze and setup a process in the ORTE. * * @retval ORTE_SUCCESS Upon success. * @retval ORTE_ERROR Upon failure. - * - * This function performs - * - * Just a note for developer: - - * So there are 3 ways in which an application can be started - * 1) rte_boot, followed by mpirun - * 2) mpirun (alone) - * 3) singleton (./a.out) - * - * Case 1) If the rte has already been booted, then mpirun will accept - * an optional command line parameter --universe=[rte universe name] - * which says which universe this application wants to be a part - * of. mpirun will then package this universe name and send it to the - * processes it will be starting off (fork/exec) on local or remote - * node.The packaging mechanism can be either command line parameter - * to the a.out it forks or make it part of environment - * (implementation dependent). - * - * Case 2) When mpirun is done alone and no universe is present, then - * the mpirun starts off the universe (using rte_boot), then - * fork/execs the processes, passin g along the [universe_name]. 
- * - * Case 3) For a singleton, if there is alrady an existing rte - * universe which it wants to join, it can specify that using the - * --universe command line. So it will do - * - * $ ./a.out --universe=[universe_name] - * - * In this case, MPI_Init will have to be called as MPI_Init(&argc, &argv) - - * If it does not want to join any existing rte, then it just starts - * off as ./a.out with no command line option. In that case, MPI_Init - * does not necesaarily needs to passed argc and argv. Infact if argc - * and argv are not passed or just have one entry (the command name), - * then MPI_Init would assume that new rte universe needs to be - * started up. - * - * - * MPI_Init() will look at its argc, argv. If it find the universe - * name there, fine. Else it looks at the environment variables for - * universe_name. If it finds there, fine again. Under such - * conditions, it joins the existing rte universe. If no universe - * name is found, it calls rte_boot to start off a new rte universe. - * - * For singleton, MPI_Init() do: - * - * if (I am a singleton) and (there is no universe) - * do rte_boot - * - * But if I am not a singleton, then I have been started by mpirun and - * already provided a universe_name to join. So I wont ever start a - * universe under such conditons. mpirun will pass me the - * universe_name (either mpirun would have started the universe, or it - * would have been already started by rte_boot) */ /* globals used by RTE */ -int orte_debug_flag=0; +int orte_debug_flag=(int)false; int orte_init(void) { - int ret; - char *universe; - char *jobid_str = NULL; - char *procid_str = NULL; - pid_t pid; + int rc; - /* Open up the output streams */ - if (!ompi_output_init()) { - return OMPI_ERROR; - } - - /* - * If threads are supported - assume that we are using threads - and reset otherwise. 
- */ - ompi_set_using_threads(OMPI_HAVE_THREAD_SUPPORT); - - /* For malloc debugging */ - ompi_malloc_init(); - - /* Ensure the universe_info structure is instantiated and initialized */ - if (ORTE_SUCCESS != (ret = orte_univ_info())) { - return ret; - } - - /* Ensure the system_info structure is instantiated and initialized */ - if (ORTE_SUCCESS != (ret = orte_sys_info())) { - return ret; - } - - /* Ensure the process info structure is instantiated and initialized */ - if (ORTE_SUCCESS != (ret = orte_proc_info())) { - return ret; + if (ORTE_SUCCESS != (rc = orte_init_stage1())) { + ORTE_ERROR_LOG(rc); + return rc; } - /* - * Initialize the MCA framework - */ - if (OMPI_SUCCESS != (ret = mca_base_open())) { - return ret; - } - - /* - * Open the name services to ensure access to local functions - */ - if (OMPI_SUCCESS != (ret = orte_ns_base_open())) { - return ret; - } - - /* Open the error manager to activate error logging - needs local name services */ - if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { - return ret; + if (ORTE_SUCCESS != (rc = orte_init_stage2())) { + ORTE_ERROR_LOG(rc); + return rc; } - /***** ERROR LOGGING NOW AVAILABLE *****/ - - /* check for debug flag */ - if (0 > (ret = mca_base_param_register_int("orte", "debug", NULL, NULL, 0))) { - ORTE_ERROR_LOG(ret); - return ret; - } - if (ORTE_SUCCESS != (ret = mca_base_param_lookup_int(ret, &orte_debug_flag))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Initialize the event library - */ - if (OMPI_SUCCESS != (ret = ompi_event_init())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Internal startup - */ - if (OMPI_SUCCESS != (ret = orte_wait_init())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Initialize the data packing service. 
- */ - if (ORTE_SUCCESS != (ret = orte_dps_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Runtime Messaging Layer - */ - if (OMPI_SUCCESS != (ret = orte_rml_base_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Runtime Messaging Layer - */ - if (OMPI_SUCCESS != (ret = orte_rml_base_select())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Registry - */ - if (ORTE_SUCCESS != (ret = orte_gpr_base_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Initialize schema utilities - */ - - if (ORTE_SUCCESS != (ret = orte_schema_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* check for existing universe to join */ - if (ORTE_SUCCESS != (ret = orte_universe_exists())) { - if (orte_debug_flag) { - ompi_output(0, "orte_init: could not join existing universe"); - } - if (ORTE_ERR_NOT_FOUND != ret) { - /* if it exists but no contact could be established, - * define unique name based on current one. - * and start new universe with me as seed - */ - universe = strdup(orte_universe_info.name); - free(orte_universe_info.name); - orte_universe_info.name = NULL; - pid = getpid(); - if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) { - ompi_output(0, "orte_init: failed to create unique universe name"); - return ret; - } - } - - orte_process_info.seed = true; - if (NULL != orte_process_info.ns_replica) { - free(orte_process_info.ns_replica); - orte_process_info.ns_replica = NULL; - } - if (NULL != orte_process_info.gpr_replica) { - free(orte_process_info.gpr_replica); - orte_process_info.gpr_replica = NULL; - } - } - - /* - * Name Server - */ - if (OMPI_SUCCESS != (ret = orte_ns_base_select())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Registry - */ - if (ORTE_SUCCESS != (ret = orte_gpr_base_select())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /***** SET MY NAME *****/ - if (ORTE_SUCCESS != (ret = orte_ns.set_my_name())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* setup my session 
directory */ - if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) { - ORTE_ERROR_LOG(ret); - return ret; - } - if (ORTE_SUCCESS != (ret = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if (orte_debug_flag) { - ompi_output(0, "[%d,%d,%d] setting up session dir with", - ORTE_NAME_ARGS(orte_process_info.my_name)); - if (NULL != orte_process_info.tmpdir_base) { - ompi_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base); - } - ompi_output(0, "\tuniverse %s", orte_universe_info.name); - ompi_output(0, "\tuser %s", orte_system_info.user); - ompi_output(0, "\thost %s", orte_system_info.nodename); - ompi_output(0, "\tjobid %s", jobid_str); - ompi_output(0, "\tprocid %s", procid_str); - } - if (ORTE_SUCCESS != (ret = orte_session_dir(true, - orte_process_info.tmpdir_base, - orte_system_info.user, - orte_system_info.nodename, NULL, - orte_universe_info.name, - jobid_str, procid_str))) { - if (jobid_str != NULL) free(jobid_str); - if (procid_str != NULL) free(procid_str); - ORTE_ERROR_LOG(ret); - return ret; - } - if (NULL != jobid_str) { - free(jobid_str); - } - if (NULL != procid_str) { - free(procid_str); - } - - /* set contact info for ns/gpr */ - if(NULL != orte_process_info.ns_replica_uri) { - orte_rml.set_uri(orte_process_info.ns_replica_uri); - } - if(NULL != orte_process_info.gpr_replica_uri) { - orte_rml.set_uri(orte_process_info.gpr_replica_uri); - } - - /* open/load rmgr/soh */ - - if (ORTE_SUCCESS != (ret = orte_rmgr_base_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if (ORTE_SUCCESS != (ret = orte_soh_base_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* setup jobid-0 */ - - if(orte_process_info.seed) { - if (ORTE_SUCCESS != (ret = orte_rmgr_base_set_job_slots(0,1))) { - ORTE_ERROR_LOG(ret); - return ret; - } - if (ORTE_SUCCESS != (ret = orte_rmaps_base_set_vpid_range(0,0,1))) { - ORTE_ERROR_LOG(ret); - return ret; - } - if 
(ORTE_SUCCESS != (ret = orte_rmgr_base_proc_stage_gate_init(0))) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - /* - * Initialize the selected modules now that all components/name are available. - */ - - if (ORTE_SUCCESS != (ret = orte_rml.init())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if (ORTE_SUCCESS != (ret = orte_ns.init())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if (ORTE_SUCCESS != (ret = orte_gpr.init())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * setup the resource manager - */ - - if (ORTE_SUCCESS != (ret = orte_rmgr_base_select())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * setup the state-of-health monitor - */ - if (ORTE_SUCCESS != (ret = orte_soh_base_select())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * setup I/O forwarding system - */ - if (ORTE_SUCCESS != (ret = orte_iof_base_open())) { - ORTE_ERROR_LOG(ret); - return ret; - } - if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * All done - */ - return ORTE_SUCCESS; } diff --git a/src/runtime/orte_init_stage1.c b/src/runtime/orte_init_stage1.c new file mode 100644 index 0000000000..a2e7fa14fa --- /dev/null +++ b/src/runtime/orte_init_stage1.c @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file **/ + +#include "orte_config.h" + +#include +#include + +#include "include/constants.h" +#include "event/event.h" +#include "util/output.h" +#include "threads/mutex.h" +#include "dps/dps.h" +#include "mca/mca.h" +#include "mca/base/base.h" +#include "mca/base/mca_base_param.h" +#include "mca/rml/base/base.h" +#include "mca/errmgr/base/base.h" +#include "mca/iof/base/base.h" +#include "mca/ns/base/base.h" +#include "mca/gpr/base/base.h" +#include "mca/rmgr/base/base.h" +#include "mca/rmaps/base/base.h" +#include "mca/soh/base/base.h" +#include "util/univ_info.h" +#include "util/proc_info.h" +#include "util/session_dir.h" +#include "util/sys_info.h" +#include "util/cmd_line.h" + +#include "runtime/runtime.h" +#include "runtime/runtime_internal.h" +#include "runtime/orte_wait.h" + +int orte_init_stage1(void) +{ + int ret; + char *universe; + char *jobid_str = NULL; + char *procid_str = NULL; + pid_t pid; + + /* Open up the output streams */ + if (!ompi_output_init()) { + return OMPI_ERROR; + } + + /* + * If threads are supported - assume that we are using threads - and reset otherwise. 
+ */ + ompi_set_using_threads(OMPI_HAVE_THREAD_SUPPORT); + + /* For malloc debugging */ + ompi_malloc_init(); + + /* Ensure the universe_info structure is instantiated and initialized */ + if (ORTE_SUCCESS != (ret = orte_univ_info())) { + return ret; + } + + /* Ensure the system_info structure is instantiated and initialized */ + if (ORTE_SUCCESS != (ret = orte_sys_info())) { + return ret; + } + + /* Ensure the process info structure is instantiated and initialized */ + if (ORTE_SUCCESS != (ret = orte_proc_info())) { + return ret; + } + + /* + * Initialize the MCA framework + */ + if (OMPI_SUCCESS != (ret = mca_base_open())) { + return ret; + } + + /* + * Open the name services to ensure access to local functions + */ + if (OMPI_SUCCESS != (ret = orte_ns_base_open())) { + return ret; + } + + /* Open the error manager to activate error logging - needs local name services */ + if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { + return ret; + } + + /***** ERROR LOGGING NOW AVAILABLE *****/ + + /* check for debug flag */ + if (0 > (ret = mca_base_param_register_int("orte", "debug", NULL, NULL, 0))) { + ORTE_ERROR_LOG(ret); + return ret; + } + if (ORTE_SUCCESS != (ret = mca_base_param_lookup_int(ret, &orte_debug_flag))) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Initialize the event library + */ + if (OMPI_SUCCESS != (ret = ompi_event_init())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Internal startup + */ + if (OMPI_SUCCESS != (ret = orte_wait_init())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Initialize the data packing service. 
+ */ + if (ORTE_SUCCESS != (ret = orte_dps_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Runtime Messaging Layer - open + */ + if (OMPI_SUCCESS != (ret = orte_rml_base_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Runtime Messaging Layer - select + */ + if (OMPI_SUCCESS != (ret = orte_rml_base_select())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Registry - open + */ + if (ORTE_SUCCESS != (ret = orte_gpr_base_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Initialize schema utilities + */ + + if (ORTE_SUCCESS != (ret = orte_schema_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* check for existing universe to join */ + if (ORTE_SUCCESS != (ret = orte_universe_exists())) { + if (orte_debug_flag) { + ompi_output(0, "orte_init: could not join existing universe"); + } + if (ORTE_ERR_NOT_FOUND != ret) { + /* it exists but no contact could be established: derive a unique + * name from the current one and start a new universe with me as seed + */ + universe = orte_universe_info.name; /* take over the old string instead of copying it */ + orte_universe_info.name = NULL; + pid = getpid(); + if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, (int)pid)) { + free(universe); + ompi_output(0, "orte_init: failed to create unique universe name"); + return ret; + } + free(universe); /* old name is no longer needed once the new one is built */ + } + + orte_process_info.seed = true; + if (NULL != orte_process_info.ns_replica) { + free(orte_process_info.ns_replica); + orte_process_info.ns_replica = NULL; + } + if (NULL != orte_process_info.gpr_replica) { + free(orte_process_info.gpr_replica); + orte_process_info.gpr_replica = NULL; + } + } + + /* + * Name Server - select + */ + if (OMPI_SUCCESS != (ret = orte_ns_base_select())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * Registry - select + */ + if (ORTE_SUCCESS != (ret = orte_gpr_base_select())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /***** SET MY NAME *****/ + if (ORTE_SUCCESS != (ret = orte_ns.set_my_name())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* setup my session 
directory */ + if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) { + ORTE_ERROR_LOG(ret); + return ret; + } + if (ORTE_SUCCESS != (ret = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) { + ORTE_ERROR_LOG(ret); + free(jobid_str); /* don't leak the jobid string obtained above */ + return ret; + } + + if (orte_debug_flag) { + ompi_output(0, "[%d,%d,%d] setting up session dir with", + ORTE_NAME_ARGS(orte_process_info.my_name)); + if (NULL != orte_process_info.tmpdir_base) { + ompi_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base); + } + ompi_output(0, "\tuniverse %s", orte_universe_info.name); + ompi_output(0, "\tuser %s", orte_system_info.user); + ompi_output(0, "\thost %s", orte_system_info.nodename); + ompi_output(0, "\tjobid %s", jobid_str); + ompi_output(0, "\tprocid %s", procid_str); + } + if (ORTE_SUCCESS != (ret = orte_session_dir(true, + orte_process_info.tmpdir_base, + orte_system_info.user, + orte_system_info.nodename, NULL, + orte_universe_info.name, + jobid_str, procid_str))) { + free(jobid_str); + free(procid_str); + ORTE_ERROR_LOG(ret); + return ret; + } + /* the session directory call succeeded, so the name strings are no + * longer needed (free(NULL) is a no-op, hence no NULL guards) + */ + free(jobid_str); + free(procid_str); + + /* set contact info for ns/gpr */ + if(NULL != orte_process_info.ns_replica_uri) { + orte_rml.set_uri(orte_process_info.ns_replica_uri); + } + if(NULL != orte_process_info.gpr_replica_uri) { + orte_rml.set_uri(orte_process_info.gpr_replica_uri); + } + + /* open/load rmgr/soh */ + + if (ORTE_SUCCESS != (ret = orte_rmgr_base_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + if (ORTE_SUCCESS != (ret = orte_soh_base_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* setup jobid-0 */ + + if(orte_process_info.seed) { + if (ORTE_SUCCESS != (ret = orte_rmgr_base_set_job_slots(0,1))) { + ORTE_ERROR_LOG(ret); + return ret; + } + if (ORTE_SUCCESS != (ret = orte_rmaps_base_set_vpid_range(0,0,1))) { + ORTE_ERROR_LOG(ret); + return ret; + } + if 
(ORTE_SUCCESS != (ret = orte_rmgr_base_proc_stage_gate_init(0))) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + return ORTE_SUCCESS; +} diff --git a/src/runtime/orte_init_stage2.c b/src/runtime/orte_init_stage2.c new file mode 100644 index 0000000000..d9cacff33b --- /dev/null +++ b/src/runtime/orte_init_stage2.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file **/ + +#include "orte_config.h" + +#include "include/orte_constants.h" + +#include "mca/errmgr/errmgr.h" + +#include "mca/rml/rml.h" +#include "mca/ns/ns.h" +#include "mca/gpr/gpr.h" + +#include "mca/rmgr/base/base.h" +#include "mca/soh/base/base.h" +#include "mca/iof/base/base.h" + +#include "runtime/runtime.h" + +int orte_init_stage2(void) +{ + int ret; + + /* + * Initialize the selected modules now that all components/name are available. 
+ */ + + if (ORTE_SUCCESS != (ret = orte_rml.init())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + if (ORTE_SUCCESS != (ret = orte_ns.init())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + if (ORTE_SUCCESS != (ret = orte_gpr.init())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * setup the resource manager + */ + + if (ORTE_SUCCESS != (ret = orte_rmgr_base_select())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * setup the state-of-health monitor + */ + if (ORTE_SUCCESS != (ret = orte_soh_base_select())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * setup I/O forwarding system + */ + if (ORTE_SUCCESS != (ret = orte_iof_base_open())) { + ORTE_ERROR_LOG(ret); + return ret; + } + if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * All done + */ + + return ORTE_SUCCESS; +} diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 208d89b6b3..d8164020ee 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -94,6 +94,9 @@ OMPI_DECLSPEC int orte_abort(int status, char *fmt, ...); */ OMPI_DECLSPEC int orte_init(void); +OMPI_DECLSPEC int orte_init_stage1(void); +OMPI_DECLSPEC int orte_init_stage2(void); + /** * Re-init the Open run time environment. * diff --git a/test/Unit-Test-Status.pdf b/test/Unit-Test-Status.pdf new file mode 100644 index 0000000000..cb6675cc96 Binary files /dev/null and b/test/Unit-Test-Status.pdf differ diff --git a/test/Unit-Test-Status.xls b/test/Unit-Test-Status.xls new file mode 100644 index 0000000000..c1d577d65f Binary files /dev/null and b/test/Unit-Test-Status.xls differ