1
1
Ralph Castain 497c7e6abb Fixes trac:2904
The intercomm "merge" function can create a linkage between procs that was not reflected anywhere in a modex, and so at least some of the procs in the resulting communicator don't know how to talk to some of the new communicator's peers.

For example, consider the case where:

1. parent job A comm_spawns a process (job B) - these processes exchange modex and can communicate

2. parent job A now comm_spawns another process (job C) - again, these can communicate, but the proc in C knows nothing of B

3. do an intercomm merge across the communicators created by the two comm_spawns. This puts B and C into the same communicator, but they know nothing about how to talk to each other as they were not involved in any exchange of contact info. Hence, collectives on that communicator now fail. 

This fix adds an API to the ompi/dpm framework that (a) exchanges the modex info across the procs in the merge to ensure all procs know how to communicate, and (b) calls add_procs to give the btl's a chance to select transports to any new procs.

cmr:v1.7.3:reviewer=jsquyres

This commit was SVN r29166.

The following Trac tickets were found above:
  Ticket 2904 --> https://svn.open-mpi.org/trac/ompi/ticket/2904
2013-09-15 15:00:40 +00:00

102 строки
3.8 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_DPM_BASE_H
#define OMPI_MCA_DPM_BASE_H
#include "ompi_config.h"
#include "ompi/constants.h"
#if HAVE_TIME_H
#include <time.h>
#endif
#if HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "ompi/mca/dpm/dpm.h"
/*
* Global functions for MCA overall DPM
*/
BEGIN_C_DECLS
struct ompi_dpm_base_disconnect_obj {
ompi_communicator_t *comm;
int size;
struct ompi_request_t **reqs;
int buf;
};
typedef struct ompi_dpm_base_disconnect_obj ompi_dpm_base_disconnect_obj;
/**
* Select an available component.
*
* @retval OMPI_SUCCESS Upon Success
* @retval OMPI_NOT_FOUND If no component can be selected
* @retval OMPI_ERROR Upon other failure
*
*/
OMPI_DECLSPEC int ompi_dpm_base_select(void);
/* Internal support functions */
OMPI_DECLSPEC char* ompi_dpm_base_dyn_init (void);
OMPI_DECLSPEC int ompi_dpm_base_dyn_finalize (void);
OMPI_DECLSPEC void ompi_dpm_base_mark_dyncomm (ompi_communicator_t *comm);
OMPI_DECLSPEC ompi_dpm_base_disconnect_obj *ompi_dpm_base_disconnect_init ( ompi_communicator_t *comm);
OMPI_DECLSPEC int ompi_dpm_base_disconnect_waitall (int count, ompi_dpm_base_disconnect_obj **objs);
/* NULL component functions */
int ompi_dpm_base_null_connect_accept (ompi_communicator_t *comm, int root,
char *port_string, bool send_first,
ompi_communicator_t **newcomm);
int ompi_dpm_base_null_merge(ompi_communicator_t *comm, int root);
int ompi_dpm_base_null_disconnect(ompi_communicator_t *comm);
int ompi_dpm_base_null_spawn(int count, char **array_of_commands,
char ***array_of_argv,
int *array_of_maxprocs,
MPI_Info *array_of_info,
char *port_name);
int ompi_dpm_base_null_dyn_init(void);
int ompi_dpm_base_null_dyn_finalize (void);
void ompi_dpm_base_null_mark_dyncomm (ompi_communicator_t *comm);
int ompi_dpm_base_null_open_port(char *port_name, ompi_rml_tag_t given_tag);
int ompi_dpm_base_null_parse_port(char *port_name,
char **hnp_uri, char **rml_uri, ompi_rml_tag_t *tag);
int ompi_dpm_base_null_route_to_port(char *rml_uri, ompi_process_name_t *rproc);
int ompi_dpm_base_null_close_port(char *port_name);
int ompi_dpm_base_null_pconnect(char *port,
struct timeval *timeout,
ompi_dpm_base_paccept_connect_callback_fn_t cbfunc,
void *cbdata);
int ompi_dpm_base_null_paccept(char *port,
ompi_dpm_base_paccept_connect_callback_fn_t cbfunc,
void *cbdata);
void ompi_dpm_base_null_pclose(char *port);
/* useful globals */
OMPI_DECLSPEC extern ompi_dpm_base_component_t ompi_dpm_base_selected_component;
OMPI_DECLSPEC extern ompi_dpm_base_module_t ompi_dpm;
OMPI_DECLSPEC extern mca_base_framework_t ompi_dpm_base_framework;
END_C_DECLS
#endif /* OMPI_MCA_DPM_BASE_H */