
***** THIS INCLUDES A SMALL CHANGE IN THE MPI-RTE INTERFACE *****

Fix two problems that surfaced when using direct launch under SLURM:

1. Locally store our own data, because some BTLs want to retrieve
   it during add_procs rather than use what they have internally
   (see the fall-through sketch after the PMI store hunks below).

2. Clean up MPI_Abort so it correctly passes the error status all
   the way down to the actual exit. When the "abort_peers" API was
   implemented, the error status was left out, so it was lost at
   that point and we *always* exited with a status of 1. This
   forces a change to the API to include the status (a minimal
   sketch of the new call path follows this message).

cmr:v1.7.3:reviewer=jsquyres:subject=Fix MPI_Abort and modex_recv for direct launch

This commit was SVN r29405.
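
A minimal, self-contained sketch of the path the status now takes under
direct launch, pieced together from the diffs below. The stub functions
and names here are illustrative stand-ins for the real OMPI/PMI symbols,
not code from the tree:

/* Illustrative only: models how the code passed to MPI_Abort now reaches
 * the final exit under direct launch
 * (MPI_Abort -> ompi_mpi_abort -> ompi_rte_abort_peers -> PMI_Abort -> exit).
 */
#include <stdio.h>
#include <stdlib.h>

/* stand-in for PMI_Abort(status, msg) */
static void stub_pmi_abort(int status, const char *msg)
{
    fprintf(stderr, "aborting: %s (status %d)\n", msg, status);
    exit(status);   /* the exit status now carries the MPI error code */
}

/* stand-in for the new ompi_rte_abort_peers(procs, nprocs, status);
 * before this commit the status argument did not exist, so this hop
 * always ended in an exit status of 1 */
static int stub_abort_peers(size_t nprocs, int status)
{
    (void)nprocs;
    stub_pmi_abort(status, "peer abort requested");
    return 0;   /* not reached */
}

int main(void)
{
    int errcode = 2;   /* e.g. the code handed to MPI_Abort */
    return stub_abort_peers(0, errcode);
}

With the real code paths, the user-visible effect is that a job aborted with
MPI_Abort(MPI_COMM_WORLD, 2) under a direct srun launch should now report an
exit status of 2 rather than 1, which is what the updated hello test below
exercises.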
This commit is contained in:
Ralph Castain 2013-10-08 18:37:59 +00:00
parent 7de2179866
commit 9902748108
17 changed files with 147 additions and 58 deletions


@@ -71,7 +71,7 @@ typedef orte_local_rank_t ompi_local_rank_t;
 /* Error handling objects and operations */
 OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
-#define ompi_rte_abort_peers(a, b) orte_errmgr.abort_peers(a, b)
+#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
 #define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
 #define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
 #define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND


@@ -103,7 +103,7 @@ OMPI_DECLSPEC extern bool ompi_rte_proc_is_bound;
 /* Error handling objects and operations */
 OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
-OMPI_DECLSPEC int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs);
+OMPI_DECLSPEC int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status);
 OMPI_DECLSPEC int ompi_rte_error_log(const char *file, int line,
                                      const char *func, int ret);
 #define OMPI_ERROR_LOG(ret) ompi_rte_error_log(__FILE__, __LINE__, __func__, ret)


@@ -45,9 +45,9 @@ ompi_rte_abort(int error_code, char *fmt, ...)
 int
-ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs)
+ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status)
 {
-    PMI_Abort(1, "");
+    PMI_Abort(status, "");
     return OMPI_SUCCESS;
 }


@@ -194,8 +194,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
         /*
          * Abort peers in this communicator group. Does not include self.
          */
-        if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs)) ) {
-            ompi_rte_abort(ret, "Open MPI failed to abort all of the procs requested (%d).", ret);
+        if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs, errcode)) ) {
+            ompi_rte_abort(errcode, "Open MPI failed to abort all of the procs requested (%d).", ret);
         }
     }


@@ -345,12 +345,17 @@ static int fetch(const opal_identifier_t *uid,
     /* lookup the proc data object for this proc */
     if (NULL == (proc_data = lookup_opal_proc(&hash_data, id))) {
         /* maybe they can find it elsewhere */
+        OPAL_OUTPUT_VERBOSE((5, opal_db_base_framework.framework_output,
+                             "db_hash:fetch data for proc %" PRIu64 " not found", id));
         return OPAL_ERR_TAKE_NEXT_OPTION;
     }
 
     /* find the value */
     if (NULL == (kv = lookup_keyval(proc_data, key))) {
         /* let them look globally for it */
+        OPAL_OUTPUT_VERBOSE((5, opal_db_base_framework.framework_output,
+                             "db_hash:fetch key %s for proc %" PRIu64 " not found",
+                             (NULL == key) ? "NULL" : key, id));
         return OPAL_ERR_TAKE_NEXT_OPTION;
     }


@@ -384,6 +384,7 @@ static int store(const opal_identifier_t *uid,
                  const char *key, const void *data, opal_data_type_t type)
 {
     opal_identifier_t proc;
+    int rc;
 
     /* to protect alignment, copy the data across */
     memcpy(&proc, uid, sizeof(opal_identifier_t));
@@ -398,7 +399,16 @@ static int store(const opal_identifier_t *uid,
                          "db:pmi:store: storing key %s[%s] for proc %" PRIu64 "",
                          key, opal_dss.lookup_data_type(type), proc));
 
-    return pmi_store_encoded (uid, key, data, type);
+    if (OPAL_SUCCESS != (rc = pmi_store_encoded (uid, key, data, type))) {
+        OPAL_ERROR_LOG(rc);
+        return rc;
+    }
+    /* we want our internal data to be stored internally
+     * as well since some of the upper layer components
+     * want to retrieve it
+     */
+    return OPAL_ERR_TAKE_NEXT_OPTION;
 }
 
 static int store_pointer(const opal_identifier_t *uid,
@@ -419,8 +429,14 @@ static int store_pointer(const opal_identifier_t *uid,
     /* just push this to PMI */
     if (OPAL_SUCCESS != (rc = store(uid, kv->scope, kv->key, (void*)&kv->data, kv->type))) {
         OPAL_ERROR_LOG(rc);
+        return rc;
     }
-    return rc;
+    /* we want our internal data to be stored internally
+     * as well since some of the upper layer components
+     * want to retrieve it
+     */
+    return OPAL_ERR_TAKE_NEXT_OPTION;
 }
 
 static void commit(const opal_identifier_t *proc)
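For context on why the store path above now ends in OPAL_ERR_TAKE_NEXT_OPTION
after a successful PMI put: in the db framework that return value tells the
base layer to keep cascading to the remaining components, so the same data
also lands in the local hash component and can be fetched later (e.g. during
add_procs) without going back out to PMI. A rough sketch of that fall-through
convention, using made-up names rather than the actual opal_db base code:

/* Hypothetical illustration: a component that returns "take next option"
 * neither succeeds nor fails outright - it asks the framework to keep
 * cascading so later components can act on the same data too. */
#include <stdio.h>

#define SKETCH_SUCCESS            0
#define SKETCH_TAKE_NEXT_OPTION (-1)   /* stands in for OPAL_ERR_TAKE_NEXT_OPTION */

typedef int (*sketch_store_fn)(const char *key, const void *data);

static int cascade_store(sketch_store_fn *mods, size_t nmods,
                         const char *key, const void *data)
{
    for (size_t i = 0; i < nmods; i++) {
        int rc = mods[i](key, data);
        if (SKETCH_TAKE_NEXT_OPTION != rc) {
            return rc;   /* definitive success or hard error - stop here */
        }
        /* otherwise fall through to the next component */
    }
    return SKETCH_SUCCESS;   /* every component deferred - treat as stored */
}

static int pmi_like_store(const char *key, const void *data)
{
    (void)data;
    printf("pmi-like component pushed %s, asking for fall-through\n", key);
    return SKETCH_TAKE_NEXT_OPTION;   /* mirrors the change in this commit */
}

static int hash_like_store(const char *key, const void *data)
{
    (void)data;
    printf("hash-like component cached %s locally\n", key);
    return SKETCH_TAKE_NEXT_OPTION;
}

int main(void)
{
    sketch_store_fn mods[] = { pmi_like_store, hash_like_store };
    return cascade_store(mods, 2, "example.key", "example-value");
}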

@@ -240,7 +240,9 @@ void orte_errmgr_base_register_migration_warning(struct timeval *tv)
     return;
 }
 
-int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
+int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
+                                 orte_std_cntr_t num_procs,
+                                 int error_code)
 {
     return ORTE_ERR_NOT_IMPLEMENTED;
 }


@@ -72,7 +72,8 @@ ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line
 ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
     __opal_attribute_format__(__printf__, 2, 3);
 ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
-                                               orte_std_cntr_t num_procs);
+                                               orte_std_cntr_t num_procs,
+                                               int error_code);
 ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);


@@ -48,7 +48,8 @@ static int init(void);
 static int finalize(void);
 
 static int abort_peers(orte_process_name_t *procs,
-                       orte_std_cntr_t num_procs);
+                       orte_std_cntr_t num_procs,
+                       int error_code);
 
 /******************
  * HNP module
@@ -131,14 +132,16 @@ static void proc_errors(int fd, short args, void *cbdata)
     OBJ_RELEASE(caddy);
 }
 
-static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
+static int abort_peers(orte_process_name_t *procs,
+                       orte_std_cntr_t num_procs,
+                       int error_code)
 {
     /* just abort */
     if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
-        orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, "%s called abort_peers",
+        orte_errmgr_base_abort(error_code, "%s called abort_peers",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
     } else {
-        orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
+        orte_errmgr_base_abort(error_code, NULL);
     }
     return ORTE_SUCCESS;
 }


@@ -13,6 +13,7 @@
  * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -177,7 +178,8 @@ __opal_attribute_format_funcptr__(__printf__, 2, 3);
  * communicator group before aborting itself.
  */
 typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
-                                                        orte_std_cntr_t num_procs);
+                                                        orte_std_cntr_t num_procs,
+                                                        int error_code);
 
 /**
  * Predicted process/node failure notification


@@ -290,30 +290,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
         goto error;
     }
 
-    /* if we are an ORTE app - and not an MPI app - then
-     * we need to exchange our connection info here.
-     * MPI_Init has its own modex, so we don't need to do
-     * two of them. However, if we don't do a modex at all,
-     * then processes have no way to communicate
-     *
-     * NOTE: only do this when the process originally launches.
-     * Cannot do this on a restart as the rest of the processes
-     * in the job won't be executing this step, so we would hang
-     */
-    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
-        orte_grpcomm_collective_t coll;
-        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
-        coll.id = orte_process_info.peer_modex;
-        if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
-            ORTE_ERROR_LOG(ret);
-            error = "orte modex";
-            goto error;
-        }
-        coll.active = true;
-        ORTE_WAIT_FOR_COMPLETION(coll.active);
-        OBJ_DESTRUCT(&coll);
-    }
-
     return ORTE_SUCCESS;
 
 error:

orte/mca/ess/env/ess_env_module.c

@@ -162,6 +162,30 @@ static int rte_init(void)
         goto error;
     }
 
+    /* if we are an ORTE app - and not an MPI app - then
+     * we need to exchange our connection info here.
+     * MPI_Init has its own modex, so we don't need to do
+     * two of them. However, if we don't do a modex at all,
+     * then processes have no way to communicate
+     *
+     * NOTE: only do this when the process originally launches.
+     * Cannot do this on a restart as the rest of the processes
+     * in the job won't be executing this step, so we would hang
+     */
+    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
+        orte_grpcomm_collective_t coll;
+        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
+        coll.id = orte_process_info.peer_modex;
+        coll.active = true;
+        if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
+            ORTE_ERROR_LOG(ret);
+            error = "orte modex";
+            goto error;
+        }
+        ORTE_WAIT_FOR_COMPLETION(coll.active);
+        OBJ_DESTRUCT(&coll);
+    }
+
     return ORTE_SUCCESS;
 
 error:


@@ -53,6 +53,7 @@
 #include "opal/mca/db/db.h"
 
 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
 #include "orte/mca/rml/rml.h"
 #include "orte/util/proc_info.h"
 #include "orte/util/show_help.h"
@@ -387,6 +388,30 @@ static int rte_init(void)
         goto error;
     }
 
+    /* if we are an ORTE app - and not an MPI app - then
+     * we need to exchange our connection info here.
+     * MPI_Init has its own modex, so we don't need to do
+     * two of them. However, if we don't do a modex at all,
+     * then processes have no way to communicate
+     *
+     * NOTE: only do this when the process originally launches.
+     * Cannot do this on a restart as the rest of the processes
+     * in the job won't be executing this step, so we would hang
+     */
+    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
+        orte_grpcomm_collective_t coll;
+        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
+        coll.id = orte_process_info.peer_modex;
+        coll.active = true;
+        if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
+            ORTE_ERROR_LOG(ret);
+            error = "orte modex";
+            goto error;
+        }
+        ORTE_WAIT_FOR_COMPLETION(coll.active);
+        OBJ_DESTRUCT(&coll);
+    }
+
     /* flag that we completed init */
     app_init_complete = true;
 
@@ -446,7 +471,26 @@ static int rte_finalize(void)
     return ORTE_SUCCESS;
 }
 
-static void rte_abort(int error_code, bool report)
+static void rte_abort(int status, bool report)
 {
-    orte_ess_base_app_abort(error_code, report);
+    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
+                         "%s ess:pmi:abort: abort with status %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         status));
+
+    /* PMI doesn't like NULL messages, but our interface
+     * doesn't provide one - so rig one up here
+     */
+#if WANT_PMI2_SUPPORT
+    PMI2_Abort(status, "N/A");
+#else
+    PMI_Abort(status, "N/A");
+#endif
+
+    /* - Clean out the global structures
+     * (not really necessary, but good practice) */
+    orte_proc_info_finalize();
+
+    /* Now Exit */
+    exit(status);
 }


@@ -80,8 +80,8 @@ int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority)
 {
     /* only use PMI when direct launched */
     if (NULL == orte_process_info.my_hnp_uri &&
-        ORTE_PROC_IS_MPI &&
+        ORTE_PROC_IS_APP &&
         mca_common_pmi_init ()) {
         /* if PMI is available, make it available for use by MPI procs */
         *priority = my_priority;
         *module = (mca_base_module_t *)&orte_grpcomm_pmi_module;


@@ -6,11 +6,20 @@
  */
 
 #include <stdio.h>
+#include <stdlib.h>
 #include "mpi.h"
 
 int main(int argc, char* argv[])
 {
     int rank, size;
+    int errcode;
+
+    if (1 < argc) {
+        errcode = strtol(argv[1], NULL, 10);
+    } else {
+        errcode = 2;
+    }
 
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -18,8 +27,12 @@ int main(int argc, char* argv[])
     printf("Hello, World, I am %d of %d\n", rank, size);
 
-    if (1 == rank) MPI_Abort(MPI_COMM_WORLD, 2);
+    if (1 == rank) {
+        MPI_Abort(MPI_COMM_WORLD, errcode);
+    } else {
+        errcode = 0;
+    }
 
     MPI_Finalize();
-    return 0;
+    return errcode;
 }


@@ -38,7 +38,8 @@ int main(int argc, char* argv[])
     printf("orte_abort: Name %s Host: %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
            hostname, (long)pid);
+    fflush(stdout);
 
     i = 0;
     while (1) {
         i++;


@@ -51,20 +51,22 @@ int main(int argc, char **argv, char **envp)
         goto done;
     }
 
-done:
-    if (PMI_TRUE == pmi_initialized) {
-        i = 0;
-        while (1) {
-            i++;
-            pi = i / 3.14159256;
-            if (i > 10000) i = 0;
-            if ((pmi_rank == 3 ||
-                 (pmi_process_group_size <= 3 && pmi_rank == 0))
-                && i == 9995) {
-                PMI_Abort(rc, "RANK0 CALLED ABORT");
-            }
-        }
-    }
+    i = 0;
+    while (1) {
+        i++;
+        pi = i / 3.14159256;
+        if (i > 10000) i = 0;
+        if ((pmi_rank == 3 ||
+             (pmi_process_group_size <= 3 && pmi_rank == 0))
+            && i == 9995) {
+            asprintf(&err, "RANK%d CALLED ABORT", pmi_rank);
+            fprintf(stderr, "%s\n", err);
+            fflush(stderr);
+            PMI_Abort(rc, err);
+        }
+    }
 
+done:
     if (NULL != err) {
         fprintf(stderr, "=== ERROR [rank:%d] %s\n", pmi_rank, err);
         rc = EXIT_FAILURE;