
***** THIS INCLUDES A SMALL CHANGE IN THE MPI-RTE INTERFACE *****

Fix two problems that surfaced when using direct launch under SLURM:

1. Locally store our own data, because some BTLs want to retrieve
   it during add_procs rather than use what they hold internally

2. Clean up MPI_Abort so it correctly passes the error status all
   the way down to the actual exit. When the "abort_peers" API was
   implemented, the error status was left out, so it was lost at
   that point and we *always* exited with a status of 1. Fixing
   this requires a change to the API to include the status (see
   the sketch after this list).
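
A minimal sketch of the item-2 status flow, using simplified stand-in
functions (rte_abort_peers and rte_abort below are illustrative only,
not the real OMPI/ORTE entry points):

    /* Sketch: the error status handed to MPI_Abort must survive down to exit(). */
    #include <stdlib.h>

    static void rte_abort(int status)
    {
        exit(status);              /* previously this was effectively exit(1)      */
    }

    static int rte_abort_peers(size_t nprocs, int status)
    {
        (void) nprocs;             /* peers omitted in this sketch                 */
        rte_abort(status);         /* forward the status instead of dropping it    */
        return 0;
    }

    int main(void)
    {
        rte_abort_peers(1, 3);     /* e.g. what MPI_Abort(comm, 3) boils down to   */
        return 0;                  /* never reached                                */
    }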

cmr:v1.7.3:reviewer=jsquyres:subject=Fix MPI_Abort and modex_recv for direct launch

This commit was SVN r29405.
Ralph Castain 2013-10-08 18:37:59 +00:00
parent 7de2179866
commit 9902748108
17 changed files with 147 additions and 58 deletions


@ -71,7 +71,7 @@ typedef orte_local_rank_t ompi_local_rank_t;
/* Error handling objects and operations */
OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
-#define ompi_rte_abort_peers(a, b) orte_errmgr.abort_peers(a, b)
+#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
#define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
#define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
#define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND


@ -103,7 +103,7 @@ OMPI_DECLSPEC extern bool ompi_rte_proc_is_bound;
/* Error handling objects and operations */
OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
-OMPI_DECLSPEC int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs);
+OMPI_DECLSPEC int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status);
OMPI_DECLSPEC int ompi_rte_error_log(const char *file, int line,
const char *func, int ret);
#define OMPI_ERROR_LOG(ret) ompi_rte_error_log(__FILE__, __LINE__, __func__, ret)


@ -45,9 +45,9 @@ ompi_rte_abort(int error_code, char *fmt, ...)
int
-ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs)
+ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status)
{
-PMI_Abort(1, "");
+PMI_Abort(status, "");
return OMPI_SUCCESS;
}


@ -194,8 +194,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
/*
* Abort peers in this communicator group. Does not include self.
*/
-if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs)) ) {
-ompi_rte_abort(ret, "Open MPI failed to abort all of the procs requested (%d).", ret);
+if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs, errcode)) ) {
+ompi_rte_abort(errcode, "Open MPI failed to abort all of the procs requested (%d).", ret);
}
}


@ -345,12 +345,17 @@ static int fetch(const opal_identifier_t *uid,
/* lookup the proc data object for this proc */
if (NULL == (proc_data = lookup_opal_proc(&hash_data, id))) {
/* maybe they can find it elsewhere */
+OPAL_OUTPUT_VERBOSE((5, opal_db_base_framework.framework_output,
+"db_hash:fetch data for proc %" PRIu64 " not found", id));
return OPAL_ERR_TAKE_NEXT_OPTION;
}
/* find the value */
if (NULL == (kv = lookup_keyval(proc_data, key))) {
/* let them look globally for it */
+OPAL_OUTPUT_VERBOSE((5, opal_db_base_framework.framework_output,
+"db_hash:fetch key %s for proc %" PRIu64 " not found",
+(NULL == key) ? "NULL" : key, id));
return OPAL_ERR_TAKE_NEXT_OPTION;
}


@ -384,6 +384,7 @@ static int store(const opal_identifier_t *uid,
const char *key, const void *data, opal_data_type_t type)
{
opal_identifier_t proc;
+int rc;
/* to protect alignment, copy the data across */
memcpy(&proc, uid, sizeof(opal_identifier_t));
@ -398,7 +399,16 @@ static int store(const opal_identifier_t *uid,
"db:pmi:store: storing key %s[%s] for proc %" PRIu64 "",
key, opal_dss.lookup_data_type(type), proc));
-return pmi_store_encoded (uid, key, data, type);
+if (OPAL_SUCCESS != (rc = pmi_store_encoded (uid, key, data, type))) {
+OPAL_ERROR_LOG(rc);
+return rc;
+}
+/* we want our internal data to be stored internally
+ * as well since some of the upper layer components
+ * want to retrieve it
+ */
+return OPAL_ERR_TAKE_NEXT_OPTION;
}
static int store_pointer(const opal_identifier_t *uid,
@ -419,8 +429,14 @@ static int store_pointer(const opal_identifier_t *uid,
/* just push this to PMI */
if (OPAL_SUCCESS != (rc = store(uid, kv->scope, kv->key, (void*)&kv->data, kv->type))) {
OPAL_ERROR_LOG(rc);
+return rc;
}
-return rc;
+/* we want our internal data to be stored internally
+ * as well since some of the upper layer components
+ * want to retrieve it
+ */
+return OPAL_ERR_TAKE_NEXT_OPTION;
}
static void commit(const opal_identifier_t *proc)
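
The store() change above relies on the return code: returning
OPAL_ERR_TAKE_NEXT_OPTION instead of OPAL_SUCCESS presumably tells the db
framework to keep iterating over its components, so the value just pushed to
PMI is also cached locally (e.g. by the hash component) and can be handed back
to the BTLs during add_procs. A rough sketch of that fall-through pattern,
with made-up names (pmi_store, hash_store, the driver loop) standing in for
the real opal db machinery:

    /* Sketch only: the return-code convention is the point, not the names. */
    #include <stdio.h>

    #define OK               0
    #define TAKE_NEXT_OPTION 1        /* stand-in for OPAL_ERR_TAKE_NEXT_OPTION */

    typedef int (*store_fn)(const char *key, const char *val);

    static int pmi_store(const char *key, const char *val)
    {
        printf("pmi:  publish %s=%s\n", key, val);
        return TAKE_NEXT_OPTION;      /* also let the next component store it   */
    }

    static int hash_store(const char *key, const char *val)
    {
        printf("hash: cache %s=%s locally\n", key, val);
        return OK;                    /* stored locally; stop iterating         */
    }

    int main(void)
    {
        store_fn modules[] = { pmi_store, hash_store };
        size_t i;
        for (i = 0; i < sizeof(modules) / sizeof(modules[0]); i++) {
            int rc = modules[i]("btl.tcp.addr", "10.0.0.1:1024");
            if (TAKE_NEXT_OPTION != rc) {
                break;                /* success or hard error: stop            */
            }
        }
        return 0;
    }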


@ -240,7 +240,9 @@ void orte_errmgr_base_register_migration_warning(struct timeval *tv)
return;
}
-int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
+int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
+orte_std_cntr_t num_procs,
+int error_code)
{
return ORTE_ERR_NOT_IMPLEMENTED;
}


@ -72,7 +72,8 @@ ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line
ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
__opal_attribute_format__(__printf__, 2, 3);
ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
-orte_std_cntr_t num_procs);
+orte_std_cntr_t num_procs,
+int error_code);
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);


@ -48,7 +48,8 @@ static int init(void);
static int finalize(void);
static int abort_peers(orte_process_name_t *procs,
-orte_std_cntr_t num_procs);
+orte_std_cntr_t num_procs,
+int error_code);
/******************
* HNP module
@ -131,14 +132,16 @@ static void proc_errors(int fd, short args, void *cbdata)
OBJ_RELEASE(caddy);
}
-static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
+static int abort_peers(orte_process_name_t *procs,
+orte_std_cntr_t num_procs,
+int error_code)
{
/* just abort */
if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, "%s called abort_peers",
orte_errmgr_base_abort(error_code, "%s called abort_peers",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
} else {
-orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
+orte_errmgr_base_abort(error_code, NULL);
}
return ORTE_SUCCESS;
}


@ -13,6 +13,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
+* Copyright (c) 2013 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -177,7 +178,8 @@ __opal_attribute_format_funcptr__(__printf__, 2, 3);
* communicator group before aborting itself.
*/
typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
-orte_std_cntr_t num_procs);
+orte_std_cntr_t num_procs,
+int error_code);
/**
* Predicted process/node failure notification


@ -290,30 +290,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
goto error;
}
-/* if we are an ORTE app - and not an MPI app - then
-* we need to exchange our connection info here.
-* MPI_Init has its own modex, so we don't need to do
-* two of them. However, if we don't do a modex at all,
-* then processes have no way to communicate
-*
-* NOTE: only do this when the process originally launches.
-* Cannot do this on a restart as the rest of the processes
-* in the job won't be executing this step, so we would hang
-*/
-if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
-orte_grpcomm_collective_t coll;
-OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
-coll.id = orte_process_info.peer_modex;
-if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
-ORTE_ERROR_LOG(ret);
-error = "orte modex";
-goto error;
-}
-coll.active = true;
-ORTE_WAIT_FOR_COMPLETION(coll.active);
-OBJ_DESTRUCT(&coll);
-}
return ORTE_SUCCESS;
error:

orte/mca/ess/env/ess_env_module.c vendored

@ -162,6 +162,30 @@ static int rte_init(void)
goto error;
}
+/* if we are an ORTE app - and not an MPI app - then
+* we need to exchange our connection info here.
+* MPI_Init has its own modex, so we don't need to do
+* two of them. However, if we don't do a modex at all,
+* then processes have no way to communicate
+*
+* NOTE: only do this when the process originally launches.
+* Cannot do this on a restart as the rest of the processes
+* in the job won't be executing this step, so we would hang
+*/
+if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
+orte_grpcomm_collective_t coll;
+OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
+coll.id = orte_process_info.peer_modex;
+coll.active = true;
+if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
+ORTE_ERROR_LOG(ret);
+error = "orte modex";
+goto error;
+}
+ORTE_WAIT_FOR_COMPLETION(coll.active);
+OBJ_DESTRUCT(&coll);
+}
return ORTE_SUCCESS;
error:


@ -53,6 +53,7 @@
#include "opal/mca/db/db.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
@ -387,6 +388,30 @@ static int rte_init(void)
goto error;
}
+/* if we are an ORTE app - and not an MPI app - then
+* we need to exchange our connection info here.
+* MPI_Init has its own modex, so we don't need to do
+* two of them. However, if we don't do a modex at all,
+* then processes have no way to communicate
+*
+* NOTE: only do this when the process originally launches.
+* Cannot do this on a restart as the rest of the processes
+* in the job won't be executing this step, so we would hang
+*/
+if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
+orte_grpcomm_collective_t coll;
+OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
+coll.id = orte_process_info.peer_modex;
+coll.active = true;
+if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
+ORTE_ERROR_LOG(ret);
+error = "orte modex";
+goto error;
+}
+ORTE_WAIT_FOR_COMPLETION(coll.active);
+OBJ_DESTRUCT(&coll);
+}
/* flag that we completed init */
app_init_complete = true;
@ -446,7 +471,26 @@ static int rte_finalize(void)
return ORTE_SUCCESS;
}
-static void rte_abort(int error_code, bool report)
+static void rte_abort(int status, bool report)
{
-orte_ess_base_app_abort(error_code, report);
+OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
+"%s ess:pmi:abort: abort with status %d",
+ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+status));
+/* PMI doesn't like NULL messages, but our interface
+ * doesn't provide one - so rig one up here
+ */
+#if WANT_PMI2_SUPPORT
+PMI2_Abort(status, "N/A");
+#else
+PMI_Abort(status, "N/A");
+#endif
+/* - Clean out the global structures
+ * (not really necessary, but good practice) */
+orte_proc_info_finalize();
+/* Now Exit */
+exit(status);
}


@ -80,8 +80,8 @@ int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority)
{
/* only use PMI when direct launched */
if (NULL == orte_process_info.my_hnp_uri &&
-ORTE_PROC_IS_MPI &&
-mca_common_pmi_init ()) {
+ORTE_PROC_IS_APP &&
+mca_common_pmi_init ()) {
/* if PMI is available, make it available for use by MPI procs */
*priority = my_priority;
*module = (mca_base_module_t *)&orte_grpcomm_pmi_module;


@ -6,11 +6,20 @@
*/
#include <stdio.h>
+#include <stdlib.h>
#include "mpi.h"
int main(int argc, char* argv[])
{
int rank, size;
+int errcode;
+if (1 < argc) {
+errcode = strtol(argv[1], NULL, 10);
+} else {
+errcode = 2;
+}
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@ -18,8 +27,12 @@ int main(int argc, char* argv[])
printf("Hello, World, I am %d of %d\n", rank, size);
-if (1 == rank) MPI_Abort(MPI_COMM_WORLD, 2);
+if (1 == rank) {
+MPI_Abort(MPI_COMM_WORLD, errcode);
+} else {
+errcode = 0;
+}
MPI_Finalize();
-return 0;
+return errcode;
}


@ -38,7 +38,8 @@ int main(int argc, char* argv[])
printf("orte_abort: Name %s Host: %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
hostname, (long)pid);
fflush(stdout);
+i = 0;
while (1) {
i++;


@ -51,20 +51,22 @@ int main(int argc, char **argv, char **envp)
goto done;
}
-done:
-if (PMI_TRUE == pmi_initialized) {
-i = 0;
-while (1) {
-i++;
-pi = i / 3.14159256;
-if (i > 10000) i = 0;
-if ((pmi_rank == 3 ||
-(pmi_process_group_size <= 3 && pmi_rank == 0))
-&& i == 9995) {
-PMI_Abort(rc, "RANK0 CALLED ABORT");
-}
+i = 0;
+while (1) {
+i++;
+pi = i / 3.14159256;
+if (i > 10000) i = 0;
+if ((pmi_rank == 3 ||
+(pmi_process_group_size <= 3 && pmi_rank == 0))
+&& i == 9995) {
+asprintf(&err, "RANK%d CALLED ABORT", pmi_rank);
+fprintf(stderr, "%s\n", err);
+fflush(stderr);
+PMI_Abort(rc, err);
+}
+}
+done:
if (NULL != err) {
fprintf(stderr, "=== ERROR [rank:%d] %s\n", pmi_rank, err);
rc = EXIT_FAILURE;