1
1
openmpi/ompi/mca/rte/orte/rte_orte.h

118 строки
3.9 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
2015-06-23 20:59:57 -07:00
*
* Additional copyrights may follow
2015-06-23 20:59:57 -07:00
*
* $HEADER$
*
* When this component is used, this file is included in the rest of
* the OPAL/ORTE/OMPI code base via ompi/mca/rte/rte.h. As such,
* this header represents the public interface to this static component.
*/
#ifndef MCA_OMPI_RTE_ORTE_H
#define MCA_OMPI_RTE_ORTE_H
#include "ompi_config.h"
#include "ompi/constants.h"
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-) WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic. This commit was SVN r32317.
2014-07-26 00:47:28 +00:00
struct opal_proc_t;
#include "opal/threads/threads.h"
#include "orte/types.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/routed/routed.h"
#include "orte/runtime/orte_data_server.h"
#include "orte/runtime/runtime.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "ompi/info/info.h"
struct ompi_proc_t;
struct ompi_communicator_t;
BEGIN_C_DECLS
/* Process name objects and operations */
typedef orte_process_name_t ompi_process_name_t;
typedef orte_jobid_t ompi_jobid_t;
typedef orte_vpid_t ompi_vpid_t;
typedef orte_ns_cmp_bitmask_t ompi_rte_cmp_bitmask_t;
#define OMPI_PROC_MY_NAME ORTE_PROC_MY_NAME
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-) WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have express interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK with support from Sandia, developped a version of Open MPI where the entire communication infrastucture has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic. This commit was SVN r32317.
2014-07-26 00:47:28 +00:00
#define OMPI_NAME_PRINT(a) ORTE_NAME_PRINT((const orte_process_name_t*)a)
#define ompi_rte_compare_name_fields(a, b, c) orte_util_compare_name_fields(a, (const orte_process_name_t*)(b), (const orte_process_name_t*)(c))
#define ompi_rte_convert_string_to_process_name(a,b) orte_util_convert_string_to_process_name(a,b)
#define ompi_rte_convert_process_name_to_string(a,b) orte_util_convert_process_name_to_string(a,b)
#define OMPI_NAME_WILDCARD ORTE_NAME_WILDCARD
#define OMPI_NODE_RANK_INVALID ORTE_NODE_RANK_INVALID
#define OMPI_LOCAL_RANK_INVALID ORTE_LOCAL_RANK_INVALID
#define OMPI_RTE_CMP_JOBID ORTE_NS_CMP_JOBID
#define OMPI_RTE_CMP_VPID ORTE_NS_CMP_VPID
#define OMPI_RTE_CMP_ALL ORTE_NS_CMP_ALL
/* This is the DSS tag to serialize a proc name */
#define OMPI_NAME ORTE_NAME
#define OMPI_PROCESS_NAME_HTON ORTE_PROCESS_NAME_HTON
#define OMPI_PROCESS_NAME_NTOH ORTE_PROCESS_NAME_NTOH
#define OMPI_RTE_MY_NODEID ORTE_PROC_MY_DAEMON->vpid
/* database keys */
#define OMPI_RTE_NODE_ID ORTE_DB_DAEMON_VPID
#define OMPI_RTE_HOST_ID ORTE_DB_HOSTID
#if OPAL_ENABLE_DEBUG
static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * name);
#else
#define OMPI_CAST_RTE_NAME(a) ((orte_process_name_t*)(a))
#endif
/* Process info struct and values */
typedef orte_node_rank_t ompi_node_rank_t;
typedef orte_local_rank_t ompi_local_rank_t;
#define ompi_process_info orte_process_info
#define ompi_rte_proc_is_bound orte_proc_is_bound
/* Error handling objects and operations */
2015-06-23 20:59:57 -07:00
OMPI_DECLSPEC void __opal_attribute_noreturn__
ompi_rte_abort(int error_code, char *fmt, ...);
#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
/* Init and finalize objects and operations */
#define ompi_rte_init(a, b) orte_init(a, b, ORTE_PROC_MPI)
#define ompi_rte_finalize() orte_finalize()
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);
typedef struct {
ompi_rte_component_t super;
opal_mutex_t lock;
opal_list_t modx_reqs;
} ompi_rte_orte_component_t;
typedef struct {
opal_list_item_t super;
opal_mutex_t lock;
opal_condition_t cond;
bool active;
orte_process_name_t peer;
} ompi_orte_tracker_t;
OBJ_CLASS_DECLARATION(ompi_orte_tracker_t);
For large scale systems, we would like to avoid doing a full modex during MPI_Init so that launch will scale a little better. At the moment, our options are somewhat limited as only a few BTLs don't immediately call modex_recv on all procs during startup. However, for those situations where someone can take advantage of it, add the ability to do a "modex on demand" retrieval of data from remote procs when we launch via mpirun. NOTE: launch performance will be absolutely awful if you do this with BTLs that aren't configured to modex_recv on first message! Even with "modex on demand", we still have to do a barrier in place of the modex - we simply don't move any data around, which does reduce the time impact. The barrier is required to ensure that the other proc has in fact registered all its BTL info and therefore is prepared to hand over a complete data package. Otherwise, you may not get the info you need. In addition, the shared memory BTL can fail to properly rendezvous as it expects the barrier to be in place. This behavior will *only* take effect under the following conditions: 1. launched via mpirun 2. #procs is greater than ompi_hostname_cutoff, which defaults to UINT32_MAX 3. mca param rte_orte_direct_modex is set to 1. At the moment, we are having problems getting this param to register properly, so only the first two conditions are in effect. Still, the bottom line is you have to *want* this behavior to get it. The planned next evolution of this will be to make the direct modex be non-blocking - this will require two fixes: 1. if the remote proc doesn't have the required info, then let it delay its response until it does. This means we need a way for the MPI layer to tell the RTE "I am done entering modex data". 2. adjust the SM rendezvous logic to loop until the required file has been created Creating a placeholder to bring this over to 1.7.5 when ready. cmr=v1.7.5:reviewer=hjelmn:subject=Enable direct modex at scale This commit was SVN r30259.
2014-01-11 17:36:06 +00:00
#if OPAL_ENABLE_DEBUG
static inline orte_process_name_t * OMPI_CAST_RTE_NAME(opal_process_name_t * name) {
return (orte_process_name_t *)name;
}
#endif
Per the PMIx RFC: WHAT: Merge the PMIx branch into the devel repo, creating a new OPAL “lmix” framework to abstract PMI support for all RTEs. Replace the ORTE daemon-level collectives with a new PMIx server and update the ORTE grpcomm framework to support server-to-server collectives WHY: We’ve had problems dealing with variations in PMI implementations, and need to extend the existing PMI definitions to meet exascale requirements. WHEN: Mon, Aug 25 WHERE: https://github.com/rhc54/ompi-svn-mirror.git Several community members have been working on a refactoring of the current PMI support within OMPI. Although the APIs are common, Slurm and Cray implement a different range of capabilities, and package them differently. For example, Cray provides an integrated PMI-1/2 library, while Slurm separates the two and requires the user to specify the one to be used at runtime. In addition, several bugs in the Slurm implementations have caused problems requiring extra coding. All this has led to a slew of #if’s in the PMI code and bugs when the corner-case logic for one implementation accidentally traps the other. Extending this support to other implementations would have increased this complexity to an unacceptable level. Accordingly, we have: * created a new OPAL “pmix” framework to abstract the PMI support, with separate components for Cray, Slurm PMI-1, and Slurm PMI-2 implementations. * Replaced the current ORTE grpcomm daemon-based collective operation with an integrated PMIx server, and updated the grpcomm APIs to provide more flexible, multi-algorithm support for collective operations. At this time, only the xcast and allgather operations are supported. * Replaced the current global collective id with a signature based on the names of the participating procs. The allows an unlimited number of collectives to be executed by any group of processes, subject to the requirement that only one collective can be active at a time for a unique combination of procs. Note that a proc can be involved in any number of simultaneous collectives - it is the specific combination of procs that is subject to the constraint * removed the prior OMPI/OPAL modex code * added new macros for executing modex send/recv to simplify use of the new APIs. The send macros allow the caller to specify whether or not the BTL supports async modex operations - if so, then the non-blocking “fence” operation is used, if the active PMIx component supports it. Otherwise, the default is a full blocking modex exchange as we currently perform. * retained the current flag that directs us to use a blocking fence operation, but only to retrieve data upon demand This commit was SVN r32570.
2014-08-21 18:56:47 +00:00
END_C_DECLS
#endif /* MCA_OMPI_RTE_ORTE_H */