openmpi/src/mpi/runtime/ompi_mpi_init.c

/*
 * $HEADER$
 */
#include "ompi_config.h"
#include "include/constants.h"
#include "mpi/runtime/mpiruntime.h"
#include "mpi/runtime/params.h"
#include "runtime/runtime.h"
#include "util/sys_info.h"
#include "util/proc_info.h"
#include "util/session_dir.h"
#include "mpi.h"
#include "communicator/communicator.h"
#include "group/group.h"
#include "info/info.h"
#include "util/show_help.h"
#include "errhandler/errhandler.h"
#include "errhandler/errcode.h"
#include "errhandler/errclass.h"
#include "errhandler/errcode-internal.h"
#include "op/op.h"
#include "file/file.h"
#include "mca/base/base.h"
#include "mca/base/base.h"
#include "mca/allocator/base/base.h"
#include "mca/allocator/allocator.h"
#include "mca/mpool/base/base.h"
#include "mca/mpool/mpool.h"
#include "mca/ptl/ptl.h"
#include "mca/ptl/base/base.h"
#include "mca/pml/pml.h"
#include "mca/pml/base/base.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "mca/topo/topo.h"
#include "mca/topo/base/base.h"
#include "mca/io/io.h"
#include "mca/io/base/base.h"
#include "mca/oob/base/base.h"
#include "mca/ns/base/base.h"
#include "mca/gpr/base/base.h"
#include "runtime/runtime.h"

/*
 * Global variables and symbols for the MPI layer
 */
bool ompi_mpi_initialized = false;
bool ompi_mpi_finalized = false;
bool ompi_mpi_thread_multiple = false;
int ompi_mpi_thread_requested = MPI_THREAD_SINGLE;
int ompi_mpi_thread_provided = MPI_THREAD_SINGLE;
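
/*
 * Illustration (hedged): the MPI_Init()/MPI_Init_thread() bindings are the
 * expected callers of ompi_mpi_init(); the sketch below is a plausible
 * minimal caller, not the verbatim binding source:
 *
 *     int MPI_Init(int *argc, char ***argv)
 *     {
 *         int provided;
 *         return ompi_mpi_init(argc ? *argc : 0, argv ? *argv : NULL,
 *                              MPI_THREAD_SINGLE, &provided);
 *     }
 */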

int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret, param;
    mca_ns_base_jobid_t jobid;
    mca_ns_base_vpid_t vpid;
    bool allow_multi_user_threads;
    bool have_hidden_threads;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error, *jobid_str, *procid_str;
    char *universe, *contact_info;
    pid_t pid;

    /* Become an OMPI process */
    if (OMPI_SUCCESS != (ret = ompi_init(argc, argv))) {
        error = "ompi_init() failed";
        goto error;
    }

    /* Open up the MCA */
    if (OMPI_SUCCESS != (ret = mca_base_open())) {
        error = "mca_base_open() failed";
        goto error;
    }

    /* Join the run-time environment */
    allow_multi_user_threads = true;
    have_hidden_threads = false;
    if (OMPI_SUCCESS != (ret = ompi_rte_init_stage1(&allow_multi_user_threads,
                                                    &have_hidden_threads))) {
        error = "ompi_rte_init_stage1() failed";
        goto error;
    }

    /* Parse environment variables and fill the corresponding info
       structures */
    ompi_rte_parse_environ();

    /* Check for an existing universe to join */
    if (OMPI_SUCCESS != (ret = ompi_rte_universe_exists())) {
        if (ompi_rte_debug_flag) {
            ompi_output(0, "ompi_mpi_init: could not join existing universe");
        }
        if (OMPI_ERR_NOT_FOUND != ret) {
            /* The universe exists, but no contact could be established:
             * define a unique name based on the current one and start a
             * new universe with me as the seed.
             */
            universe = strdup(ompi_universe_info.name);
            free(ompi_universe_info.name);
            ompi_universe_info.name = NULL;
            pid = getpid();
            if (0 > asprintf(&ompi_universe_info.name, "%s-%d", universe, pid) &&
                ompi_rte_debug_flag) {
                ompi_output(0, "mpi_init: error creating unique universe name");
            }
        }

        ompi_process_info.my_universe = strdup(ompi_universe_info.name);
        ompi_process_info.seed = true;
        if (NULL != ompi_universe_info.ns_replica) {
            free(ompi_universe_info.ns_replica);
            ompi_universe_info.ns_replica = NULL;
        }
        if (NULL != ompi_process_info.ns_replica) {
            free(ompi_process_info.ns_replica);
            ompi_process_info.ns_replica = NULL;
        }
        if (NULL != ompi_universe_info.gpr_replica) {
            free(ompi_universe_info.gpr_replica);
            ompi_universe_info.gpr_replica = NULL;
        }
        if (NULL != ompi_process_info.gpr_replica) {
            free(ompi_process_info.gpr_replica);
            ompi_process_info.gpr_replica = NULL;
        }
    }
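
    /*
     * Example (hedged): if the named universe was "default-universe" but
     * no contact could be established, a process with pid 4242 reseeds
     * itself under the derived name "default-universe-4242" via the
     * asprintf() above.  The name "default-universe" is illustrative only.
     */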

    /* Start the rest of the RTE */
    if (OMPI_SUCCESS != (ret = ompi_rte_init_stage2(&allow_multi_user_threads,
                                                    &have_hidden_threads))) {
        error = "ompi_rte_init_stage2() failed";
        goto error;
    }

    /***** SET MY NAME *****/
    if (NULL != ompi_process_info.name) {  /* should NOT have been previously set */
        free(ompi_process_info.name);
        ompi_process_info.name = NULL;
    }
    if (NULL != ompi_rte_get_self()) {
        /* name set in environment - non-singleton - record name */
        ompi_process_info.name = ompi_rte_get_self();
    } else if (NULL == ompi_process_info.ns_replica) {
        /* singleton - couldn't join an existing universe */
        ompi_process_info.name = ompi_name_server.create_process_name(0, 0, 0);
    } else {
        /* singleton - name server exists elsewhere - get a name for me */
        jobid = ompi_name_server.create_jobid();
        vpid = ompi_name_server.reserve_range(jobid, 1);
        ompi_process_info.name = ompi_name_server.create_process_name(0, jobid, vpid);
    }
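
    /*
     * Illustration: the process name is the [cellid,jobid,vpid] triple
     * used in the debug output below.  A singleton that could not reach a
     * name server ends up as [0,0,0]; a singleton that did reach one gets
     * a fresh jobid and a one-slot vpid range, i.e. a name like [0,jobid,0].
     */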

    /* Set up my session directory */
    jobid_str = ompi_name_server.get_jobid_string(ompi_process_info.name);
    procid_str = ompi_name_server.get_vpid_string(ompi_process_info.name);
    if (ompi_rte_debug_flag) {
        ompi_output(0, "[%d,%d,%d] setting up session dir with",
                    ompi_process_info.name->cellid,
                    ompi_process_info.name->jobid,
                    ompi_process_info.name->vpid);
        if (NULL != ompi_process_info.tmpdir_base) {
            ompi_output(0, "\ttmpdir %s", ompi_process_info.tmpdir_base);
        }
        ompi_output(0, "\tuniverse %s", ompi_process_info.my_universe);
        ompi_output(0, "\tuser %s", ompi_system_info.user);
        ompi_output(0, "\thost %s", ompi_system_info.nodename);
        ompi_output(0, "\tjobid %s", jobid_str);
        ompi_output(0, "\tprocid %s", procid_str);
    }
    if (OMPI_ERROR == ompi_session_dir(true,
                                       ompi_process_info.tmpdir_base,
                                       ompi_system_info.user,
                                       ompi_system_info.nodename, NULL,
                                       ompi_process_info.my_universe,
                                       jobid_str, procid_str)) {
        if (NULL != jobid_str) {
            free(jobid_str);
        }
        if (NULL != procid_str) {
            free(procid_str);
        }
        error = "session dir not found or created";
        goto error;
    }

    /* Finalize the RTE startup */
    if (OMPI_SUCCESS != (ret = ompi_rte_init_finalstage(&allow_multi_user_threads,
                                                        &have_hidden_threads))) {
        error = "ompi_rte_init_finalstage() failed";
        goto error;
    }

    /*
     * Register my process info with my replica.  Note that this must be
     * done after the RTE init is completed.
     */
    contact_info = mca_oob_get_contact_info();
    ompi_rte_get_peers(NULL, &nprocs);
    if (OMPI_SUCCESS != (ret = ompi_registry.rte_register(contact_info, nprocs,
                                                          ompi_rte_all_procs_registered, NULL,
                                                          ompi_rte_all_procs_unregistered, NULL))) {
        error = "ompi_registry.rte_register() failed";
        goto error;
    }

    /* Wait for all procs to register so we are sure to get everyone's
       contact info */
    if (OMPI_SUCCESS != (ret = ompi_rte_monitor_procs_registered())) {
        error = "ompi_rte_monitor_procs_registered() failed";
        goto error;
    }
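
    /*
     * Note: the rte_register()/monitor_procs_registered() pair above acts
     * as a pre-MPI barrier: every process publishes its OOB contact
     * string, and nobody proceeds until all nprocs registrations have been
     * seen, so later steps can rely on complete contact information.
     */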

    /* Once we've joined the RTE, see if any MCA parameters were passed
       to the MPI level */
    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "ompi_mpi_register_params() failed";
        goto error;
    }

    /* Initialize ompi procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "ompi_proc_init() failed";
        goto error;
    }

    /* Open up relevant MCA modules */
    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_ptl_base_open())) {
        error = "mca_ptl_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_topo_base_open())) {
        error = "mca_topo_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_io_base_open())) {
        error = "mca_io_base_open() failed";
        goto error;
    }

    /* Select which pml, ptl, and coll modules to use, and determine the
       final thread level */
    if (OMPI_SUCCESS !=
        (ret = mca_base_init_select_components(requested,
                                               allow_multi_user_threads,
                                               have_hidden_threads,
                                               provided))) {
        error = "mca_base_init_select_components() failed";
        goto error;
    }

    /* Initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* Initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* Initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }

    /* Initialize error classes */
    if (OMPI_SUCCESS != (ret = ompi_errclass_init())) {
        error = "ompi_errclass_init() failed";
        goto error;
    }

    /* Initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }

    /* Initialize groups */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* Initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* Initialize datatypes */
    if (OMPI_SUCCESS != (ret = ompi_ddt_init())) {
        error = "ompi_ddt_init() failed";
        goto error;
    }

    /* Initialize ops */
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* Initialize the attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* Do the module exchange */
    if (OMPI_SUCCESS != (ret = mca_base_modex_exchange())) {
        error = "mca_base_modex_exchange() failed";
        goto error;
    }
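
    /*
     * Note: the "modex" (module exchange) publishes each process's
     * component contact/addressing data and gathers everyone else's,
     * which is what lets pml_add_procs() below wire up the peers.
     */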

    /* Add all ompi_proc_t's to the PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml.pml_add_procs(procs, nprocs))) {
        free(procs);
        error = "PML add procs failed";
        goto error;
    }
    free(procs);

    /* Start the PTLs */
    param = 1;
    if (OMPI_SUCCESS !=
        (ret = mca_pml.pml_control(MCA_PTL_ENABLE, &param, sizeof(param)))) {
        error = "PML control failed";
        goto error;
    }

    /* Save the resulting thread levels */
    ompi_mpi_thread_requested = requested;
    *provided = ompi_mpi_thread_provided;
    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided ==
                                MPI_THREAD_MULTIPLE);
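
    /*
     * Illustration: from the application side, this requested/provided
     * negotiation surfaces through the standard interface, e.g.:
     *
     *     int provided;
     *     MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
     *     if (provided < MPI_THREAD_MULTIPLE) {
     *         // fall back to single-threaded use of MPI
     *     }
     */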

    /* Init coll for the comms */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF, NULL))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD, NULL))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    /* Wait for everyone to initialize */
    if (MPI_SUCCESS != (ret =
                        MPI_COMM_WORLD->c_coll.coll_barrier(MPI_COMM_WORLD))) {
        error = "Barrier over MPI_COMM_WORLD failed";
        goto error;
    }

error:
    if (ret != OMPI_SUCCESS) {
        ompi_show_help("help-mpi-runtime",
                       "mpi_init:startup:internal-failure", true,
                       "MPI_INIT", "MPI_INIT", error, ret);
        return ret;
    }

    /* All done */
    ompi_mpi_initialized = true;
    ompi_mpi_finalized = false;
    return MPI_SUCCESS;
}
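
/*
 * Usage sketch: a minimal MPI program that drives the whole path above.
 * Every process reaches the MPI_COMM_WORLD barrier near the end of
 * ompi_mpi_init() before its MPI_Init() returns.
 *
 *     #include <stdio.h>
 *     #include "mpi.h"
 *
 *     int main(int argc, char **argv)
 *     {
 *         int rank;
 *         MPI_Init(&argc, &argv);
 *         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *         printf("rank %d initialized\n", rank);
 *         MPI_Finalize();
 *         return 0;
 *     }
 */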