
WHAT: Merge the PMIx branch into the devel repo, creating a new OPAL “pmix” framework to abstract PMI support for all RTEs. Replace the ORTE daemon-level collectives with a new PMIx server and update the ORTE grpcomm framework to support server-to-server collectives. WHY: We’ve had problems dealing with variations in PMI implementations, and need to extend the existing PMI definitions to meet exascale requirements. WHEN: Mon, Aug 25 WHERE: https://github.com/rhc54/ompi-svn-mirror.git Several community members have been working on a refactoring of the current PMI support within OMPI. Although the APIs are common, Slurm and Cray implement a different range of capabilities, and package them differently. For example, Cray provides an integrated PMI-1/2 library, while Slurm separates the two and requires the user to specify the one to be used at runtime. In addition, several bugs in the Slurm implementations have caused problems requiring extra coding. All this has led to a slew of #if’s in the PMI code and bugs when the corner-case logic for one implementation accidentally traps the other. Extending this support to other implementations would have increased this complexity to an unacceptable level. Accordingly, we have: * created a new OPAL “pmix” framework to abstract the PMI support, with separate components for Cray, Slurm PMI-1, and Slurm PMI-2 implementations. * Replaced the current ORTE grpcomm daemon-based collective operation with an integrated PMIx server, and updated the grpcomm APIs to provide more flexible, multi-algorithm support for collective operations. At this time, only the xcast and allgather operations are supported. * Replaced the current global collective id with a signature based on the names of the participating procs. This allows an unlimited number of collectives to be executed by any group of processes, subject to the requirement that only one collective can be active at a time for a unique combination of procs. 
Note that a proc can be involved in any number of simultaneous collectives - it is the specific combination of procs that is subject to the constraint * removed the prior OMPI/OPAL modex code * added new macros for executing modex send/recv to simplify use of the new APIs. The send macros allow the caller to specify whether or not the BTL supports async modex operations - if so, then the non-blocking “fence” operation is used, if the active PMIx component supports it. Otherwise, the default is a full blocking modex exchange as we currently perform. * retained the current flag that directs us to use a blocking fence operation, but only to retrieve data upon demand This commit was SVN r32570.
428 строки
16 KiB
C
428 строки
16 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/dstore/dstore.h"
|
|
#include "opal/util/argv.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/state/state.h"
|
|
#include "orte/mca/rml/rml.h"
|
|
|
|
#include "orte/mca/oob/base/base.h"
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
#include "orte/mca/state/base/base.h"
|
|
#endif
|
|
|
|
static void process_uri(char *uri);
|
|
|
|
void orte_oob_base_send_nb(int fd, short args, void *cbdata)
|
|
{
|
|
orte_oob_send_t *cd = (orte_oob_send_t*)cbdata;
|
|
orte_rml_send_t *msg = cd->msg;
|
|
mca_base_component_list_item_t *cli;
|
|
orte_oob_base_peer_t *pr;
|
|
int rc;
|
|
uint64_t ui64;
|
|
bool msg_sent;
|
|
mca_oob_base_component_t *component;
|
|
bool reachable;
|
|
opal_list_t myvals;
|
|
opal_value_t *kv;
|
|
|
|
/* done with this. release it now */
|
|
OBJ_RELEASE(cd);
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:base:send to target %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&msg->dst));
|
|
|
|
/* check if we have this peer in our hash table */
|
|
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));
|
|
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
|
|
ui64, (void**)&pr) ||
|
|
NULL == pr) {
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:base:send unknown peer %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&msg->dst));
|
|
/* for direct launched procs, the URI might be in the database,
|
|
* so check there next - if it is, the peer object will be added
|
|
* to our hash table
|
|
*/
|
|
OBJ_CONSTRUCT(&myvals, opal_list_t);
|
|
if (OPAL_SUCCESS == opal_dstore.fetch(opal_dstore_internal,
|
|
(opal_identifier_t*)&msg->dst,
|
|
OPAL_DSTORE_URI, &myvals)) {
|
|
kv = (opal_value_t*)opal_list_get_first(&myvals);
|
|
if (NULL != kv) {
|
|
process_uri(kv->data.string);
|
|
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
|
|
ui64, (void**)&pr) ||
|
|
NULL == pr) {
|
|
/* that is just plain wrong */
|
|
ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
|
|
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
|
|
ORTE_RML_SEND_COMPLETE(msg);
|
|
OPAL_LIST_DESTRUCT(&myvals);
|
|
return;
|
|
}
|
|
} else {
|
|
ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
|
|
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
|
|
ORTE_RML_SEND_COMPLETE(msg);
|
|
OPAL_LIST_DESTRUCT(&myvals);
|
|
return;
|
|
}
|
|
OPAL_LIST_DESTRUCT(&myvals);
|
|
} else {
|
|
/* even though we don't know about this peer yet, we still might
|
|
* be able to get to it via routing, so ask each component if
|
|
* it can reach it
|
|
*/
|
|
reachable = false;
|
|
pr = NULL;
|
|
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
|
|
component = (mca_oob_base_component_t*)cli->cli_component;
|
|
if (NULL != component->is_reachable) {
|
|
if (component->is_reachable(&msg->dst)) {
|
|
/* there is a way to reach this peer - record it
|
|
* so we don't waste this time again
|
|
*/
|
|
if (NULL == pr) {
|
|
pr = OBJ_NEW(orte_oob_base_peer_t);
|
|
if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
|
|
ORTE_RML_SEND_COMPLETE(msg);
|
|
return;
|
|
}
|
|
}
|
|
/* mark that this component can reach the peer */
|
|
opal_bitmap_set_bit(&pr->addressable, component->idx);
|
|
/* flag that at least one component can reach this peer */
|
|
reachable = true;
|
|
}
|
|
}
|
|
}
|
|
/* if nobody could reach it, then that's an error */
|
|
if (!reachable) {
|
|
msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN;
|
|
ORTE_RML_SEND_COMPLETE(msg);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* if we already have a connection to this peer, use it */
|
|
if (NULL != pr->component) {
|
|
/* post this msg for send by this transport - the component
|
|
* runs on our event base, so we can just call their function
|
|
*/
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:base:send known transport for peer %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&msg->dst));
|
|
if (ORTE_SUCCESS == (rc = pr->component->send_nb(msg))) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
/* if we haven't identified a transport to this peer,
|
|
* loop across all available components in priority order until
|
|
* one replies that it has a module that can reach this peer.
|
|
* Let it try to make the connection
|
|
*/
|
|
msg_sent = false;
|
|
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
|
|
component = (mca_oob_base_component_t*)cli->cli_component;
|
|
/* is this peer addressable by this component? */
|
|
if (!opal_bitmap_is_set_bit(&pr->addressable, component->idx)) {
|
|
continue;
|
|
}
|
|
/* it is addressable, so attempt to send via that transport */
|
|
if (ORTE_SUCCESS == (rc = component->send_nb(msg))) {
|
|
/* the msg status will be set upon send completion/failure */
|
|
msg_sent = true;
|
|
/* point to this transport for any future messages */
|
|
pr->component = component;
|
|
break;
|
|
} else if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
|
|
/* components return "next option" if they can't connect
|
|
* to this peer. anything else is a true error.
|
|
*/
|
|
ORTE_ERROR_LOG(rc);
|
|
msg->status = rc;
|
|
ORTE_RML_SEND_COMPLETE(msg);
|
|
return;
|
|
}
|
|
}
|
|
|
|
/* if no component can reach this peer, that's an error - post
|
|
* it back to the RML for handling
|
|
*/
|
|
if (!msg_sent) {
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:base:send no path to target %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&msg->dst));
|
|
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
|
|
ORTE_RML_SEND_COMPLETE(msg);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Obtain a uri for initial connection purposes
|
|
*
|
|
* During initial wireup, we can only transfer contact info on the daemon
|
|
* command line. This limits what we can send to a string representation of
|
|
* the actual contact info, which gets sent in a uri-like form. Not every
|
|
* oob module can support this transaction, so this function will loop
|
|
* across all oob components/modules, letting each add to the uri string if
|
|
* it supports bootstrap operations. An error will be returned in the cbfunc
|
|
* if NO component can successfully provide a contact.
|
|
*
|
|
* Note: since there is a limit to what an OS will allow on a cmd line, we
|
|
* impose a limit on the length of the resulting uri via an MCA param. The
|
|
* default value of -1 implies unlimited - however, users with large numbers
|
|
* of interfaces on their nodes may wish to restrict the size.
|
|
*/
|
|
void orte_oob_base_get_addr(char **uri)
|
|
{
|
|
char *turi, *final=NULL, *tmp;
|
|
size_t len = 0;
|
|
int rc=ORTE_SUCCESS;
|
|
bool one_added = false;
|
|
mca_base_component_list_item_t *cli;
|
|
mca_oob_base_component_t *component;
|
|
|
|
/* start with our process name */
|
|
if (ORTE_SUCCESS != (rc = orte_util_convert_process_name_to_string(&final, ORTE_PROC_MY_NAME))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
goto unblock;
|
|
}
|
|
len = strlen(final);
|
|
|
|
/* loop across all available modules to get their input
|
|
* up to the max length
|
|
*/
|
|
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
|
|
component = (mca_oob_base_component_t*)cli->cli_component;
|
|
/* ask the component for its input, obtained when it
|
|
* opened its modules
|
|
*/
|
|
if (NULL == component->get_addr) {
|
|
/* doesn't support this ability */
|
|
continue;
|
|
}
|
|
/* the components operate within our event base, so we
|
|
* can directly call their get_uri function to get the
|
|
* pointer to the uri - this is not a copy, so
|
|
* do NOT free it!
|
|
*/
|
|
turi = component->get_addr();
|
|
if (NULL != turi) {
|
|
/* check overall length for limits */
|
|
if (0 < orte_oob_base.max_uri_length &&
|
|
orte_oob_base.max_uri_length < (int)(len + strlen(turi))) {
|
|
/* cannot accept the payload */
|
|
continue;
|
|
}
|
|
/* add new value to final one */
|
|
asprintf(&tmp, "%s;%s", final, turi);
|
|
free(turi);
|
|
free(final);
|
|
final = tmp;
|
|
len = strlen(final);
|
|
/* flag that at least one contributed */
|
|
one_added = true;
|
|
}
|
|
}
|
|
|
|
if (!one_added) {
|
|
/* nobody could contribute */
|
|
if (NULL != final) {
|
|
free(final);
|
|
final = NULL;
|
|
}
|
|
}
|
|
|
|
unblock:
|
|
*uri = final;
|
|
}
|
|
|
|
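/**
 * Setting a peer's contact info (see orte_oob_base_set_addr below):
 * that function will loop
 * across all oob components, letting each look at the uri and extract
 * info from it if it can. An error is to be returned if NO component
 * can successfully extract a contact.
 *
 * The definitions that immediately follow are the constructor and
 * destructor of the request object that carries the uri.
 */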
|
|
/* constructor for a uri request: no uri attached yet */
static void req_cons(mca_oob_uri_req_t *req)
{
    req->uri = NULL;
}

/* destructor for a uri request: give back the uri string, if any */
static void req_des(mca_oob_uri_req_t *req)
{
    /* free() accepts NULL, so an unset uri needs no special casing */
    free(req->uri);
}

/* register the class with its constructor/destructor pair */
OBJ_CLASS_INSTANCE(mca_oob_uri_req_t,
                   opal_object_t,
                   req_cons, req_des);
|
|
|
|
void orte_oob_base_set_addr(int fd, short args, void *cbdata)
|
|
{
|
|
mca_oob_uri_req_t *req = (mca_oob_uri_req_t*)cbdata;
|
|
char *uri = req->uri;
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s: set_addr to uri %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
(NULL == uri) ? "NULL" : uri);
|
|
|
|
/* if the request doesn't contain a URI, then we
|
|
* have an error
|
|
*/
|
|
if (NULL == uri) {
|
|
opal_output(0, "%s: NULL URI", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
|
ORTE_FORCED_TERMINATE(1);
|
|
OBJ_RELEASE(req);
|
|
return;
|
|
}
|
|
|
|
process_uri(uri);
|
|
OBJ_RELEASE(req);
|
|
}
|
|
|
|
static void process_uri(char *uri)
|
|
{
|
|
orte_process_name_t peer;
|
|
char *cptr;
|
|
mca_base_component_list_item_t *cli;
|
|
mca_oob_base_component_t *component;
|
|
char **uris=NULL;
|
|
int rc;
|
|
uint64_t ui64;
|
|
orte_oob_base_peer_t *pr;
|
|
|
|
/* find the first semi-colon in the string */
|
|
cptr = strchr(uri, ';');
|
|
if (NULL == cptr) {
|
|
/* got a problem - there must be at least two fields,
|
|
* the first containing the process name of our peer
|
|
* and all others containing the OOB contact info
|
|
*/
|
|
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
|
return;
|
|
}
|
|
*cptr = '\0';
|
|
cptr++;
|
|
|
|
/* the first field is the process name, so convert it */
|
|
orte_util_convert_string_to_process_name(&peer, uri);
|
|
|
|
/* if the peer is us, no need to go further as we already
|
|
* know our own contact info
|
|
*/
|
|
if (peer.jobid == ORTE_PROC_MY_NAME->jobid &&
|
|
peer.vpid == ORTE_PROC_MY_NAME->vpid) {
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s:set_addr peer %s is me",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&peer));
|
|
return;
|
|
}
|
|
|
|
/* split the rest of the uri into component parts */
|
|
uris = opal_argv_split(cptr, ';');
|
|
|
|
/* get the peer object for this process */
|
|
memcpy(&ui64, (char*)&peer, sizeof(uint64_t));
|
|
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
|
|
ui64, (void**)&pr) ||
|
|
NULL == pr) {
|
|
pr = OBJ_NEW(orte_oob_base_peer_t);
|
|
if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
opal_argv_free(uris);
|
|
return;
|
|
}
|
|
}
|
|
|
|
/* loop across all available components and let them extract
|
|
* whatever piece(s) of the uri they find relevant - they
|
|
* are all operating on our event base, so we can just
|
|
* directly call their functions
|
|
*/
|
|
rc = ORTE_ERR_UNREACH;
|
|
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
|
|
component = (mca_oob_base_component_t*)cli->cli_component;
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s:set_addr checking if peer %s is reachable via component %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
|
|
if (NULL != component->set_addr) {
|
|
if (ORTE_SUCCESS == component->set_addr(&peer, uris)) {
|
|
/* this component found reachable addresses
|
|
* in the uris
|
|
*/
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s: peer %s is reachable via component %s",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
|
|
opal_bitmap_set_bit(&pr->addressable, component->idx);
|
|
}
|
|
}
|
|
}
|
|
opal_argv_free(uris);
|
|
}
|
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
void orte_oob_base_ft_event(int sd, short argc, void *cbdata)
|
|
{
|
|
int rc;
|
|
mca_base_component_list_item_t *cli;
|
|
mca_oob_base_component_t *component;
|
|
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
|
|
|
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
|
"%s oob:base:ft_event %s(%d)",
|
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
orte_job_state_to_str(state->job_state),
|
|
state->job_state);
|
|
|
|
/* loop across all available modules in priority order
|
|
* and call each one's ft_event handler
|
|
*/
|
|
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
|
|
component = (mca_oob_base_component_t*)cli->cli_component;
|
|
if (NULL == component->ft_event) {
|
|
/* doesn't support this ability */
|
|
continue;
|
|
}
|
|
|
|
if (ORTE_SUCCESS != (rc = component->ft_event(state->job_state))) {
|
|
ORTE_ERROR_LOG(rc);
|
|
}
|
|
}
|
|
OBJ_RELEASE(state);
|
|
}
|
|
|
|
#endif
|