2004-01-10 01:09:51 +03:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2006-05-11 23:46:21 +04:00
|
|
|
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
2007-07-26 01:01:10 +04:00
|
|
|
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-01-10 01:09:51 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
|
|
|
|
/** @file
|
|
|
|
* Process identification structure interface
|
|
|
|
*
|
|
|
|
* Process identification structure interface. The ompi_proc_t
|
|
|
|
* structure contains basic information about the remote (and local)
|
|
|
|
* processes.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef OMPI_PROC_PROC_H
|
|
|
|
#define OMPI_PROC_PROC_H
|
2004-01-10 01:09:51 +03:00
|
|
|
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "ompi/types.h"
|
2005-07-03 20:22:16 +04:00
|
|
|
#include "opal/class/opal_list.h"
|
2005-07-04 02:45:48 +04:00
|
|
|
#include "opal/threads/mutex.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
#include "orte/types.h"
|
|
|
|
#include "opal/dss/dss_types.h"
|
2005-03-14 23:57:21 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
BEGIN_C_DECLS
|
2004-01-10 01:09:51 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
/* ******************************************************************** */
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Remote Open MPI process structure
|
|
|
|
*
|
|
|
|
* Remote Open MPI process structure. Each process contains exactly
|
|
|
|
* one ompi_proc_t structure for each remote process it knows about.
|
|
|
|
*/
|
2004-06-07 19:33:53 +04:00
|
|
|
struct ompi_proc_t {
|
2005-07-15 02:43:01 +04:00
|
|
|
/** allow proc to be placed on a list */
|
2007-07-26 01:01:10 +04:00
|
|
|
opal_list_item_t super;
|
2005-07-15 02:43:01 +04:00
|
|
|
/** this process' name */
|
2007-07-26 01:01:10 +04:00
|
|
|
orte_process_name_t proc_name;
|
2008-02-28 04:57:57 +03:00
|
|
|
/** "nodeid" on which the proc resides - equiv to vpid of local daemon */
|
|
|
|
orte_vpid_t proc_nodeid;
|
2005-07-15 02:43:01 +04:00
|
|
|
/** PML specific proc data */
|
2006-07-04 05:20:20 +04:00
|
|
|
struct mca_pml_base_endpoint_t* proc_pml;
|
2007-07-26 01:01:10 +04:00
|
|
|
/** BML specific proc data */
|
|
|
|
|
struct mca_bml_base_endpoint_t* proc_bml;
|
2005-07-15 02:43:01 +04:00
|
|
|
/** architecture of this process */
|
2007-07-26 01:01:10 +04:00
|
|
|
uint32_t proc_arch;
|
|
|
|
|
/** Base convertor for the process described by this structure */
|
|
|
|
|
struct ompi_convertor_t* proc_convertor;
|
|
|
|
|
/** Lock protecting data inside the given ompi_proc_t */
|
|
|
|
|
opal_mutex_t proc_lock;
|
2006-05-11 23:46:21 +04:00
|
|
|
/** Keep the hostname around for debugging purposes */
|
2007-07-26 01:01:10 +04:00
|
|
|
char* proc_hostname;
|
2005-07-15 02:43:01 +04:00
|
|
|
/** flags for this proc; presumably a bitmask of OMPI_PROC_FLAG_* values (e.g. OMPI_PROC_FLAG_LOCAL) - TODO confirm */
|
2007-07-26 01:01:10 +04:00
|
|
|
uint8_t proc_flags;
|
2004-01-10 01:09:51 +03:00
|
|
|
};
|
2004-06-07 19:33:53 +04:00
|
|
|
typedef struct ompi_proc_t ompi_proc_t;
|
2007-07-26 01:01:10 +04:00
|
|
|
OBJ_CLASS_DECLARATION(ompi_proc_t);
|
2004-01-10 01:09:51 +03:00
|
|
|
|
2005-07-15 02:43:01 +04:00
|
|
|
|
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* @private
|
|
|
|
*
|
|
|
|
* Pointer to the ompi_proc_t structure for the local process
|
|
|
|
*
|
|
|
|
* Pointer to the ompi_proc_t structure for the local process.
|
|
|
|
*
|
|
|
|
* @note This pointer is declared here to allow inline functions
|
|
|
|
* within this header file to access the local process quickly.
|
|
|
|
* Please use ompi_proc_local() instead.
|
2005-07-15 02:43:01 +04:00
|
|
|
*/
|
2007-07-26 01:01:10 +04:00
|
|
|
OMPI_DECLSPEC extern ompi_proc_t* ompi_proc_local_proc;
|
|
|
|
|
|
|
|
|
|
|
|
/* ******************************************************************** */
|
|
|
|
|
|
|
|
|
|
|
|
/** Process is on the same node as the local process */
|
2005-07-15 02:43:01 +04:00
|
|
|
#define OMPI_PROC_FLAG_LOCAL 0x01
|
|
|
|
|
2004-01-10 01:09:51 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
/* ******************************************************************** */
|
|
|
|
|
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* Initialize the OMPI process subsystem
|
|
|
|
*
|
|
|
|
* Initialize the Open MPI process subsystem. This function will
|
|
|
|
* query the run-time environment and build a list of the proc
|
|
|
|
* instances in the current MPI_COMM_WORLD. The local information not
|
|
|
|
* easily determined by the run-time ahead of time (architecture and
|
|
|
|
* hostname) will be published during this call.
|
|
|
|
*
|
|
|
|
* @note While an ompi_proc_t will exist with mostly valid information
|
|
|
|
* for each process in the MPI_COMM_WORLD at the conclusion of this
|
|
|
|
* call, some information will not be immediately available. This
|
|
|
|
* includes the architecture and hostname, which will be available by
|
|
|
|
* the conclusion of the stage gate.
|
|
|
|
*
|
|
|
|
* @retval OMPI_SUCCESS System successfully initialized
|
2008-02-28 04:57:57 +03:00
|
|
|
* @retval OMPI_ERROR Initialization failed due to unspecified error
|
2004-02-13 16:56:55 +03:00
|
|
|
*/
|
2008-03-05 16:59:25 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_init(void);
|
2004-02-13 16:56:55 +03:00
|
|
|
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is known to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
/**
|
|
|
|
* Publish local process information
|
|
|
|
*
|
|
|
|
* Used by ompi_proc_init() and elsewhere in the code to refresh any
|
|
|
|
* local information not easily determined by the run-time ahead of time
|
|
|
|
* (architecture and hostname).
|
|
|
|
*
|
|
|
|
* @note While an ompi_proc_t will exist with mostly valid information
|
|
|
|
* for each process in the MPI_COMM_WORLD at the conclusion of this
|
|
|
|
* call, some information will not be immediately available. This
|
|
|
|
* includes the architecture and hostname, which will be available by
|
|
|
|
* the conclusion of the stage gate.
|
|
|
|
*
|
|
|
|
* @retval OMPI_SUCCESS Information available in the modex
|
2008-02-28 04:57:57 +03:00
|
|
|
* @retval OMPI_ERROR Failure due to unspecified error
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is known to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
*/
|
2008-03-05 16:59:25 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_publish_info(void);
|
2007-07-26 01:01:10 +04:00
|
|
|
|
2007-08-09 22:53:28 +04:00
|
|
|
/**
|
|
|
|
* Get data exchange information from remote processes
|
|
|
|
*
|
|
|
|
* Get data exchanged from remote processes and populate the ompi proc
|
|
|
|
* structures for the associated processes.
|
|
|
|
*
|
|
|
|
* @retval OMPI_SUCCESS Information successfully received
|
|
|
|
* @retval OMPI_ERROR Information update failure
|
|
|
|
*/
|
2008-03-05 16:59:25 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_get_info(void);
|
2007-08-09 22:53:28 +04:00
|
|
|
|
|
|
|
|
2004-12-02 16:28:10 +03:00
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* Finalize the OMPI Process subsystem
|
|
|
|
*
|
|
|
|
* Finalize the Open MPI process subsystem. This function will
|
|
|
|
* release all memory created during the life of the application,
|
|
|
|
* including all ompi_proc_t structures.
|
|
|
|
*
|
|
|
|
* @retval OMPI_SUCCESS System successfully finalized
|
2004-12-02 16:28:10 +03:00
|
|
|
*/
|
2008-03-05 16:59:25 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_finalize(void);
|
2004-12-02 16:28:10 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
/**
|
|
|
|
* Returns the list of proc instances associated with this job.
|
2007-07-26 01:01:10 +04:00
|
|
|
*
|
|
|
|
* Returns the list of proc instances associated with this job. Given
|
|
|
|
* the current association between a job and an MPI_COMM_WORLD, this
|
|
|
|
* function provides the process instances for the current
|
|
|
|
* MPI_COMM_WORLD.
|
|
|
|
*
|
|
|
|
* @note The reference count of each process in the array is
|
|
|
|
* incremented and the caller is responsible for releasing each
|
|
|
|
* process in the array, as well as freeing the array.
|
|
|
|
*
|
|
|
|
* @param[out] size Number of processes in the ompi_proc_t array
|
|
|
|
*
|
|
|
|
* @return Array of pointers to proc instances in the current
|
|
|
|
* MPI_COMM_WORLD, or NULL if there is an internal failure.
|
2004-02-13 16:56:55 +03:00
|
|
|
*/
|
2007-02-27 18:17:17 +03:00
|
|
|
OMPI_DECLSPEC ompi_proc_t** ompi_proc_world(size_t* size);
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
/**
|
|
|
|
* Returns the list of all known proc instances.
|
2007-07-26 01:01:10 +04:00
|
|
|
*
|
|
|
|
* Returns the list of all known proc instances, including those in
|
|
|
|
* other MPI_COMM_WORLDs. It is possible that we may no longer be
|
|
|
|
* connected to some of the procs returned (in the MPI sense of the
|
|
|
|
* word connected). In a strictly MPI-1 application, this function
|
|
|
|
* will return the same information as ompi_proc_world().
|
|
|
|
*
|
|
|
|
* @note The reference count of each process in the array is
|
|
|
|
* incremented and the caller is responsible for releasing each
|
|
|
|
* process in the array, as well as freeing the array.
|
|
|
|
*
|
|
|
|
* @param[out] size Number of processes in the ompi_proc_t array
|
|
|
|
*
|
|
|
|
* @return Array of pointers to proc instances in the current
|
|
|
|
* known universe, or NULL if there is an internal failure.
|
2004-02-13 16:56:55 +03:00
|
|
|
*/
|
2007-02-27 18:17:17 +03:00
|
|
|
OMPI_DECLSPEC ompi_proc_t** ompi_proc_all(size_t* size);
|
2004-02-13 16:56:55 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* Returns a list of the local process
|
|
|
|
*
|
|
|
|
* Returns a list containing the local process (and only the local
|
|
|
|
* process). Has calling semantics similar to ompi_proc_world() and
|
|
|
|
* ompi_proc_all().
|
|
|
|
*
|
|
|
|
* @note The reference count of each process in the array is
|
|
|
|
* incremented and the caller is responsible for releasing each
|
|
|
|
* process in the array, as well as freeing the array.
|
|
|
|
*
|
|
|
|
* @param[out] size Number of processes in the ompi_proc_t array
|
|
|
|
*
|
|
|
|
* @return Array holding a pointer to the local proc instance only,
|
|
|
|
* or NULL if there is an internal failure.
|
2004-02-13 16:56:55 +03:00
|
|
|
*/
|
2007-07-26 01:01:10 +04:00
|
|
|
OMPI_DECLSPEC ompi_proc_t** ompi_proc_self(size_t* size);
|
|
|
|
|
2004-01-29 18:34:47 +03:00
|
|
|
|
2004-02-13 16:56:55 +03:00
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* Returns a pointer to the local process
|
|
|
|
*
|
|
|
|
* Returns a pointer to the local process. Unlike ompi_proc_self(),
|
|
|
|
* the reference count on the local proc instance is not modified by
|
|
|
|
* this function.
|
|
|
|
*
|
|
|
|
* @return Pointer to the local process structure
|
2004-02-13 16:56:55 +03:00
|
|
|
*/
|
2004-06-07 19:33:53 +04:00
|
|
|
static inline ompi_proc_t* ompi_proc_local(void)
|
2004-10-28 22:13:43 +04:00
|
|
|
{
|
2004-06-07 19:33:53 +04:00
|
|
|
/* Hand back the global local-proc pointer as-is; nothing is retained here. */
return ompi_proc_local_proc;
|
2004-01-29 18:34:47 +03:00
|
|
|
}
|
2004-01-10 01:09:51 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
|
2004-05-18 01:28:32 +04:00
|
|
|
/**
|
2004-07-01 18:49:54 +04:00
|
|
|
* Returns the proc instance for a given name
|
2007-07-26 01:01:10 +04:00
|
|
|
*
|
|
|
|
* Returns the proc instance for the specified process name. The
|
|
|
|
* reference count for the proc instance is not incremented by this
|
|
|
|
* function.
|
|
|
|
*
|
|
|
|
* @param[in] name The process name to look for
|
|
|
|
*
|
|
|
|
* @return Pointer to the process instance for \c name
|
2004-05-18 01:28:32 +04:00
|
|
|
*/
|
2007-07-26 01:01:10 +04:00
|
|
|
OMPI_DECLSPEC ompi_proc_t * ompi_proc_find ( const orte_process_name_t* name );
|
2004-08-04 21:05:22 +04:00
|
|
|
|
2004-09-17 14:10:24 +04:00
|
|
|
|
2004-08-04 21:05:22 +04:00
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* Pack proc list into portable buffer
|
|
|
|
*
|
Clean up the way procs are added to the global process list after MPI_INIT:
* Do not add new procs to the global list during modex callback or
when sharing orte names during accept/connect. For modex, we
cache the modex info for later, in case that proc ever does get
added to the global proc list. For accept/connect orte name
exchange between the roots, we only need the orte name, so no
need to add a proc structure anyway. The procs will be added
to the global process list during the proc exchange later in
the wireup process
* Rename proc_get_namebuf and proc_get_proclist to proc_pack
and proc_unpack and extend them to include all information
needed to build that proc struct on a remote node (which
includes ORTE name, architecture, and hostname). Change
unpack to call pml_add_procs for the entire list of new
procs at once, rather than one at a time.
* Remove ompi_proc_find_and_add from the public proc
interface and make it a private function. This function
would add a half-created proc to the global proc list, so
making it harder to call is a good thing.
This means that there's only two ways to add new procs into the global proc list at this time: During MPI_INIT via the call to ompi_proc_init, where my job is added to the list and via ompi_proc_unpack using a buffer from a packed proc list sent to us by someone else. Currently, this is enough to implement MPI semantics. We can extend the interface more if we like, but that may require HNP communication to get the remote proc information and I wanted to avoid that if at all possible.
Refs trac:564
This commit was SVN r12798.
The following Trac tickets were found above:
Ticket 564 --> https://svn.open-mpi.org/trac/ompi/ticket/564
2006-12-07 22:56:54 +03:00
|
|
|
* This function takes a list of ompi_proc_t pointers (e.g. as given
|
|
|
|
* in groups) and returns an orte buffer containing all information
|
|
|
|
* needed to add the proc to a remote list. This includes the ORTE
|
|
|
|
* process name, the architecture, and the hostname. Ordering is
|
|
|
|
* maintained. The buffer is packed to be sent to a remote node with
|
|
|
|
* different architecture (endian or word size). The buffer can be
|
2007-07-26 01:01:10 +04:00
|
|
|
* dss unloaded to be sent using MPI or sent using rml_send_packed().
|
|
|
|
*
|
|
|
|
* @param[in] proclist List of process pointers
|
|
|
|
* @param[in] proclistsize Length of the proclist array
|
|
|
|
* @param[in,out] buf An opal_buffer_t containing the packed names.
|
|
|
|
* The buffer must be constructed but empty when
|
|
|
|
* passed to this function
|
|
|
|
* @retval OMPI_SUCCESS Success
|
|
|
|
* @retval OMPI_ERROR Unspecified error
|
2004-08-04 21:05:22 +04:00
|
|
|
*/
|
2008-02-28 04:57:57 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
|
|
|
|
opal_buffer_t *buf);
|
2004-08-04 21:05:22 +04:00
|
|
|
|
|
|
|
|
|
|
|
/**
|
2007-07-26 01:01:10 +04:00
|
|
|
* Unpack a portable buffer of procs
|
2004-08-04 21:05:22 +04:00
|
|
|
*
|
Clean up the way procs are added to the global process list after MPI_INIT:
* Do not add new procs to the global list during modex callback or
when sharing orte names during accept/connect. For modex, we
cache the modex info for later, in case that proc ever does get
added to the global proc list. For accept/connect orte name
exchange between the roots, we only need the orte name, so no
need to add a proc structure anyway. The procs will be added
to the global process list during the proc exchange later in
the wireup process
* Rename proc_get_namebuf and proc_get_proclist to proc_pack
and proc_unpack and extend them to include all information
needed to build that proc struct on a remote node (which
includes ORTE name, architecture, and hostname). Change
unpack to call pml_add_procs for the entire list of new
procs at once, rather than one at a time.
* Remove ompi_proc_find_and_add from the public proc
interface and make it a private function. This function
would add a half-created proc to the global proc list, so
making it harder to call is a good thing.
This means that there's only two ways to add new procs into the global proc list at this time: During MPI_INIT via the call to ompi_proc_init, where my job is added to the list and via ompi_proc_unpack using a buffer from a packed proc list sent to us by someone else. Currently, this is enough to implement MPI semantics. We can extend the interface more if we like, but that may require HNP communication to get the remote proc information and I wanted to avoid that if at all possible.
Refs trac:564
This commit was SVN r12798.
The following Trac tickets were found above:
Ticket 564 --> https://svn.open-mpi.org/trac/ompi/ticket/564
2006-12-07 22:56:54 +03:00
|
|
|
* This function unpacks a packed list of ompi_proc_t structures and
|
|
|
|
* returns the ordered list of proc structures. If the given proc is
|
|
|
|
* already "known", the architecture and hostname information in the
|
|
|
|
* buffer is ignored. If the proc is "new" to this process, it will
|
|
|
|
* be added to the global list of known procs, with information
|
|
|
|
* provided in the buffer. The lookup actions are always entirely
|
|
|
|
* local. The proclist returned is a list of pointers to all procs in
|
|
|
|
* the buffer, whether they were previously known or are new to this
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
* process.
|
|
|
|
*
|
|
|
|
* @note In previous versions of this function, the PML's add_procs()
|
|
|
|
* function was called for any new processes discovered as a result of
|
|
|
|
* this operation. That is no longer the case -- the caller must use
|
|
|
|
* the newproclist information to call add_procs() if necessary.
|
|
|
|
*
|
|
|
|
* @note The reference count for procs created as a result of this
|
|
|
|
* operation will be set to 1. Existing procs will not have their
|
|
|
|
* reference count changed. The reference count of a proc at the
|
|
|
|
* return of this function is the same regardless of whether NULL is
|
|
|
|
* provided for newproclist. The user is responsible for freeing the
|
|
|
|
* newproclist array.
|
2004-08-04 21:05:22 +04:00
|
|
|
*
|
2007-07-26 01:01:10 +04:00
|
|
|
* @param[in] buf opal_buffer_t containing the packed names
|
|
|
|
* @param[in] proclistsize number of expected proc pointers
|
|
|
|
* @param[out] proclist list of process pointers
|
These changes were mostly captured in a prior RFC (except for #2 below) and are aimed specifically at improving startup performance and setting up the remaining modifications described in that RFC.
The commit has been tested for C/R and Cray operations, and on Odin (SLURM, rsh) and RoadRunner (TM). I tried to update all environments, but obviously could not test them. I know that Windows needs some work, and have highlighted what is know to be needed in the odls process component.
This represents a lot of work by Brian, Tim P, Josh, and myself, with much advice from Jeff and others. For posterity, I have appended a copy of the email describing the work that was done:
As we have repeatedly noted, the modex operation in MPI_Init is the single greatest consumer of time during startup. To-date, we have executed that operation as an ORTE stage gate that held the process until a startup message containing all required modex (and OOB contact info - see #3 below) info could be sent to it. Each process would send its data to the HNP's registry, which assembled and sent the message when all processes had reported in.
In addition, ORTE had taken responsibility for monitoring process status as it progressed through a series of "stage gates". The process reported its status at each gate, and ORTE would then send a "release" message once all procs had reported in.
The incoming changes revamp these procedures in three ways:
1. eliminating the ORTE stage gate system and cleanly delineating responsibility between the OMPI and ORTE layers for MPI init/finalize. The modex stage gate (STG1) has been replaced by a collective operation in the modex itself that performs an allgather on the required modex info. The allgather is implemented using the orte_grpcomm framework since the BTL's are not active at that point. At the moment, the grpcomm framework only has a "basic" component analogous to OMPI's "basic" coll framework - I would recommend that the MPI team create additional, more advanced components to improve performance of this step.
The other stage gates have been replaced by orte_grpcomm barrier functions. We tried to use MPI barriers instead (since the BTL's are active at that point), but - as we discussed on the telecon - these are not currently true barriers so the job would hang when we fell through while messages were still in process. Note that the grpcomm barrier doesn't actually resolve that problem, but Brian has pointed out that we are unlikely to ever see it violated. Again, you might want to spend a little time on an advanced barrier algorithm as the one in "basic" is very simplistic.
Summarizing this change: ORTE no longer tracks process state nor has direct responsibility for synchronizing jobs. This is now done via collective operations within the MPI layer, albeit using ORTE collective communication services. I -strongly- urge the MPI team to implement advanced collective algorithms to improve the performance of this critical procedure.
2. reducing the volume of data exchanged during modex. Data in the modex consisted of the process name, the name of the node where that process is located (expressed as a string), plus a string representation of all contact info. The nodename was required in order for the modex to determine if the process was local or not - in addition, some people like to have it to print pretty error messages when a connection failed.
The size of this data has been reduced in three ways:
(a) reducing the size of the process name itself. The process name consisted of two 32-bit fields for the jobid and vpid. This is far larger than any current system, or system likely to exist in the near future, can support. Accordingly, the default size of these fields has been reduced to 16-bits, which means you can have 32k procs in each of 32k jobs. Since the daemons must have a vpid, and we require one daemon/node, this also restricts the default configuration to 32k nodes.
To support any future "mega-clusters", a configuration option --enable-jumbo-apps has been added. This option increases the jobid and vpid field sizes to 32-bits. Someday, if necessary, someone can add yet another option to increase them to 64-bits, I suppose.
(b) replacing the string nodename with an integer nodeid. Since we have one daemon/node, the nodeid corresponds to the local daemon's vpid. This replaces an often lengthy string with only 2 (or at most 4) bytes, a substantial reduction.
(c) when the mca param requesting that nodenames be sent to support pretty error messages, a second mca param is now used to request FQDN - otherwise, the domain name is stripped (by default) from the message to save space. If someone wants to combine those into a single param somehow (perhaps with an argument?), they are welcome to do so - I didn't want to alter what people are already using.
While these may seem like small savings, they actually amount to a significant impact when aggregated across the entire modex operation. Since every proc must receive the modex data regardless of the collective used to send it, just reducing the size of the process name removes nearly 400MBytes of communication from a 32k proc job (admittedly, much of this comm may occur in parallel). So it does add up pretty quickly.
3. routing RML messages to reduce connections. The default messaging system remains point-to-point - i.e., each proc opens a socket to every proc it communicates with and sends its messages directly. A new option uses the orteds as routers - i.e., each proc only opens a single socket to its local orted. All messages are sent from the proc to the orted, which forwards the message to the orted on the node where the intended recipient proc is located - that orted then forwards the message to its local proc (the recipient). This greatly reduces the connection storm we have encountered during startup.
It also has the benefit of removing the sharing of every proc's OOB contact with every other proc. The orted routing tables are populated during launch since every orted gets a map of where every proc is being placed. Each proc, therefore, only needs to know the contact info for its local daemon, which is passed in via the environment when the proc is fork/exec'd by the daemon. This alone removes ~50 bytes/process of communication that was in the current STG1 startup message - so for our 32k proc job, this saves us roughly 32k*50 = 1.6MBytes sent to 32k procs = 51GBytes of messaging.
Note that you can use the new routing method by specifying -mca routed tree - if you so desire. This mode will become the default at some point in the future.
There are a few minor additional changes in the commit that I'll just note in passing:
* propagation of command line mca params to the orteds - fixes ticket #1073. See note there for details.
* requiring of "finalize" prior to "exit" for MPI procs - fixes ticket #1144. See note there for details.
* cleanup of some stale header files
This commit was SVN r16364.
2007-10-05 23:48:23 +04:00
|
|
|
* @param[out] newproclistsize Number of new procs added as a result
|
|
|
|
* of the unpack operation. NULL may be
|
|
|
|
* provided if information is not needed.
|
|
|
|
* @param[out] newproclist List of new procs added as a result of
|
|
|
|
* the unpack operation. NULL may be
|
|
|
|
* provided if information is not needed.
|
2007-07-26 01:01:10 +04:00
|
|
|
*
|
2004-08-04 21:05:22 +04:00
|
|
|
* Return value:
|
|
|
|
* OMPI_SUCCESS on success
|
|
|
|
* OMPI_ERROR else
|
|
|
|
*/
|
2008-02-28 04:57:57 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_unpack(opal_buffer_t *buf,
|
|
|
|
int proclistsize, ompi_proc_t ***proclist,
|
|
|
|
int *newproclistsize, ompi_proc_t ***newproclist);
|
2004-08-04 21:05:22 +04:00
|
|
|
|
|
|
|
|
2008-02-28 04:57:57 +03:00
|
|
|
/**
|
|
|
|
* Refresh the OMPI process subsystem
|
|
|
|
*
|
|
|
|
* Refresh the Open MPI process subsystem. This function will update
|
|
|
|
* the list of proc instances in the current MPI_COMM_WORLD with
|
|
|
|
* data from the run-time environment.
|
|
|
|
*
|
|
|
|
* @note This is primarily used when restarting a process and thus
|
|
|
|
* needing to update the jobid and node name.
|
|
|
|
*
|
|
|
|
* @retval OMPI_SUCCESS System successfully refreshed
|
|
|
|
* @retval OMPI_ERROR Refresh failed due to unspecified error
|
|
|
|
*/
|
2008-03-05 16:59:25 +03:00
|
|
|
OMPI_DECLSPEC int ompi_proc_refresh(void);
|
2008-02-28 04:57:57 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
END_C_DECLS
|
2004-01-10 01:09:51 +03:00
|
|
|
|
2007-07-26 01:01:10 +04:00
|
|
|
#endif /* OMPI_PROC_PROC_H */
|