***** THIS INCLUDES A SMALL CHANGE IN THE MPI-RTE INTERFACE *****
Fix two problems that surfaced when using direct launch under SLURM:

1. Locally store our own data, because some BTLs want to retrieve it
   during add_procs rather than use what they have internally.

2. Clean up MPI_Abort so it correctly passes the error status all the
   way down to the actual exit. When someone implemented the
   "abort_peers" API, they left out the error status, so we lost it at
   that point and *always* exited with a status of 1. This forces a
   change to the API to include the status.

cmr:v1.7.3:reviewer=jsquyres:subject=Fix MPI_Abort and modex_recv for direct launch

This commit was SVN r29405.
Parent: 7de2179866
Commit: 9902748108
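The heart of the second fix is visible in the hunks that follow: the user's error code now rides through every layer that previously dropped it. A toy model of the repaired call chain, with stand-in names rather than actual OMPI code:

    /* Toy model (NOT OMPI code) of the chain this commit repairs:
     * MPI_Abort(comm, errcode)
     *   -> ompi_rte_abort_peers(procs, nprocs, errcode)   (status added here)
     *   -> orte_errmgr.abort_peers(procs, nprocs, errcode)
     *   -> PMI_Abort(errcode, msg) / exit(errcode)
     */
    #include <stdio.h>
    #include <stdlib.h>

    /* stand-in for PMI_Abort(): whatever status lands here is what the
     * resource manager reports for the job */
    static void toy_pmi_abort(int status, const char *msg)
    {
        fprintf(stderr, "abort: %s (status %d)\n", msg, status);
        exit(status);              /* was effectively hard-coded to 1 */
    }

    /* stand-in for the errmgr abort_peers entry; error_code is the
     * parameter the original API left out */
    static int toy_abort_peers(size_t nprocs, int error_code)
    {
        (void)nprocs;
        toy_pmi_abort(error_code, "peer abort");
        return 0;                  /* not reached */
    }

    int main(void)
    {
        toy_abort_peers(1, 42);    /* stand-in for MPI_Abort(comm, 42);
                                    * the shell should now see status 42 */
    }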
@@ -71,7 +71,7 @@ typedef orte_local_rank_t ompi_local_rank_t;
 
 /* Error handling objects and operations */
 OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
-#define ompi_rte_abort_peers(a, b) orte_errmgr.abort_peers(a, b)
+#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
 #define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
 #define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
 #define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND
@@ -103,7 +103,7 @@ OMPI_DECLSPEC extern bool ompi_rte_proc_is_bound;
 
 /* Error handling objects and operations */
 OMPI_DECLSPEC void ompi_rte_abort(int error_code, char *fmt, ...);
-OMPI_DECLSPEC int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs);
+OMPI_DECLSPEC int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status);
 OMPI_DECLSPEC int ompi_rte_error_log(const char *file, int line,
                                      const char *func, int ret);
 #define OMPI_ERROR_LOG(ret) ompi_rte_error_log(__FILE__, __LINE__, __func__, ret)
@@ -45,9 +45,9 @@ ompi_rte_abort(int error_code, char *fmt, ...)
 
 
 int
-ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs)
+ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs, int status)
 {
-    PMI_Abort(1, "");
+    PMI_Abort(status, "");
     return OMPI_SUCCESS;
 }
 
@@ -194,8 +194,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
     /*
      * Abort peers in this communicator group. Does not include self.
      */
-    if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs)) ) {
-        ompi_rte_abort(ret, "Open MPI failed to abort all of the procs requested (%d).", ret);
+    if( OMPI_SUCCESS != (ret = ompi_rte_abort_peers(abort_procs, nabort_procs, errcode)) ) {
+        ompi_rte_abort(errcode, "Open MPI failed to abort all of the procs requested (%d).", ret);
     }
 }
 
@@ -345,12 +345,17 @@ static int fetch(const opal_identifier_t *uid,
     /* lookup the proc data object for this proc */
     if (NULL == (proc_data = lookup_opal_proc(&hash_data, id))) {
         /* maybe they can find it elsewhere */
+        OPAL_OUTPUT_VERBOSE((5, opal_db_base_framework.framework_output,
+                             "db_hash:fetch data for proc %" PRIu64 " not found", id));
         return OPAL_ERR_TAKE_NEXT_OPTION;
     }
 
     /* find the value */
     if (NULL == (kv = lookup_keyval(proc_data, key))) {
         /* let them look globally for it */
+        OPAL_OUTPUT_VERBOSE((5, opal_db_base_framework.framework_output,
+                             "db_hash:fetch key %s for proc %" PRIu64 " not found",
+                             (NULL == key) ? "NULL" : key, id));
         return OPAL_ERR_TAKE_NEXT_OPTION;
     }
 
@@ -384,6 +384,7 @@ static int store(const opal_identifier_t *uid,
                  const char *key, const void *data, opal_data_type_t type)
 {
     opal_identifier_t proc;
+    int rc;
 
     /* to protect alignment, copy the data across */
     memcpy(&proc, uid, sizeof(opal_identifier_t));
@@ -398,7 +399,16 @@ static int store(const opal_identifier_t *uid,
                          "db:pmi:store: storing key %s[%s] for proc %" PRIu64 "",
                          key, opal_dss.lookup_data_type(type), proc));
 
-    return pmi_store_encoded (uid, key, data, type);
+    if (OPAL_SUCCESS != (rc = pmi_store_encoded (uid, key, data, type))) {
+        OPAL_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* we want our internal data to be stored internally
+     * as well since some of the upper layer components
+     * want to retrieve it
+     */
+    return OPAL_ERR_TAKE_NEXT_OPTION;
 }
 
 static int store_pointer(const opal_identifier_t *uid,
@@ -419,8 +429,14 @@ static int store_pointer(const opal_identifier_t *uid,
     /* just push this to PMI */
     if (OPAL_SUCCESS != (rc = store(uid, kv->scope, kv->key, (void*)&kv->data, kv->type))) {
         OPAL_ERROR_LOG(rc);
+        return rc;
     }
-    return rc;
+
+    /* we want our internal data to be stored internally
+     * as well since some of the upper layer components
+     * want to retrieve it
+     */
+    return OPAL_ERR_TAKE_NEXT_OPTION;
 }
 
 static void commit(const opal_identifier_t *proc)
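The OPAL_ERR_TAKE_NEXT_OPTION return in the two hunks above is what implements fix #1: after pushing the key to PMI, the pmi component declines to claim the store, so the db framework falls through to the next active component (the hash component), which keeps the local copy that BTLs read during add_procs. A self-contained sketch of that cascade convention, with toy names and return codes rather than the actual opal/mca/db base code:

    #include <stdio.h>

    enum { OK = 0, TAKE_NEXT_OPTION = 1 };   /* toy return codes */

    typedef int (*store_fn)(const char *key);

    /* toy pmi component: pushes to PMI, then declines to claim the store */
    static int pmi_store(const char *key)
    {
        printf("pmi:  pushed %s to PMI\n", key);
        return TAKE_NEXT_OPTION;
    }

    /* toy hash component: keeps the local copy the BTLs read in add_procs */
    static int hash_store(const char *key)
    {
        printf("hash: cached %s locally\n", key);
        return OK;
    }

    /* base framework: try modules in priority order until one claims it */
    static int db_base_store(store_fn *modules, int n, const char *key)
    {
        int rc = TAKE_NEXT_OPTION;
        for (int i = 0; i < n && TAKE_NEXT_OPTION == rc; i++) {
            rc = modules[i](key);
        }
        return rc;
    }

    int main(void)
    {
        store_fn active[] = { pmi_store, hash_store };
        return db_base_store(active, 2, "btl.addr");  /* both components run */
    }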
@@ -240,7 +240,9 @@ void orte_errmgr_base_register_migration_warning(struct timeval *tv)
     return;
 }
 
-int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
+int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
+                                 orte_std_cntr_t num_procs,
+                                 int error_code)
 {
     return ORTE_ERR_NOT_IMPLEMENTED;
 }
@@ -72,7 +72,8 @@ ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line
 ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
     __opal_attribute_format__(__printf__, 2, 3);
 ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
-                                               orte_std_cntr_t num_procs);
+                                               orte_std_cntr_t num_procs,
+                                               int error_code);
 
 ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
 
@@ -48,7 +48,8 @@ static int init(void);
 static int finalize(void);
 
 static int abort_peers(orte_process_name_t *procs,
-                       orte_std_cntr_t num_procs);
+                       orte_std_cntr_t num_procs,
+                       int error_code);
 
 /******************
  * HNP module
@@ -131,14 +132,16 @@ static void proc_errors(int fd, short args, void *cbdata)
     OBJ_RELEASE(caddy);
 }
 
-static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
+static int abort_peers(orte_process_name_t *procs,
+                       orte_std_cntr_t num_procs,
+                       int error_code)
 {
     /* just abort */
     if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
-        orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, "%s called abort_peers",
+        orte_errmgr_base_abort(error_code, "%s called abort_peers",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
     } else {
-        orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
+        orte_errmgr_base_abort(error_code, NULL);
     }
     return ORTE_SUCCESS;
 }
@@ -13,6 +13,7 @@
  * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -177,7 +178,8 @@ __opal_attribute_format_funcptr__(__printf__, 2, 3);
  * communicator group before aborting itself.
  */
 typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
-                                                        orte_std_cntr_t num_procs);
+                                                        orte_std_cntr_t num_procs,
+                                                        int error_code);
 
 /**
  * Predicted process/node failure notification
@@ -290,30 +290,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
         goto error;
     }
 
-    /* if we are an ORTE app - and not an MPI app - then
-     * we need to exchange our connection info here.
-     * MPI_Init has its own modex, so we don't need to do
-     * two of them. However, if we don't do a modex at all,
-     * then processes have no way to communicate
-     *
-     * NOTE: only do this when the process originally launches.
-     * Cannot do this on a restart as the rest of the processes
-     * in the job won't be executing this step, so we would hang
-     */
-    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
-        orte_grpcomm_collective_t coll;
-        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
-        coll.id = orte_process_info.peer_modex;
-        if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
-            ORTE_ERROR_LOG(ret);
-            error = "orte modex";
-            goto error;
-        }
-        coll.active = true;
-        ORTE_WAIT_FOR_COMPLETION(coll.active);
-        OBJ_DESTRUCT(&coll);
-    }
-
     return ORTE_SUCCESS;
 
 error:
orte/mca/ess/env/ess_env_module.c (24 lines changed)
@@ -162,6 +162,30 @@ static int rte_init(void)
         goto error;
     }
 
+    /* if we are an ORTE app - and not an MPI app - then
+     * we need to exchange our connection info here.
+     * MPI_Init has its own modex, so we don't need to do
+     * two of them. However, if we don't do a modex at all,
+     * then processes have no way to communicate
+     *
+     * NOTE: only do this when the process originally launches.
+     * Cannot do this on a restart as the rest of the processes
+     * in the job won't be executing this step, so we would hang
+     */
+    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
+        orte_grpcomm_collective_t coll;
+        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
+        coll.id = orte_process_info.peer_modex;
+        coll.active = true;
+        if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
+            ORTE_ERROR_LOG(ret);
+            error = "orte modex";
+            goto error;
+        }
+        ORTE_WAIT_FOR_COMPLETION(coll.active);
+        OBJ_DESTRUCT(&coll);
+    }
+
     return ORTE_SUCCESS;
 
 error:
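One detail of the relocated block is worth noting: coll.active is now set to true before orte_grpcomm.modex() is called, where the removed base version set it afterward. Presumably this arms the flag before the collective can complete, since a completion that fired first would be overwritten and leave ORTE_WAIT_FOR_COMPLETION spinning forever.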
@@ -53,6 +53,7 @@
 
 #include "opal/mca/db/db.h"
 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
 #include "orte/mca/rml/rml.h"
 #include "orte/util/proc_info.h"
 #include "orte/util/show_help.h"
@@ -387,6 +388,30 @@ static int rte_init(void)
         goto error;
     }
 
+    /* if we are an ORTE app - and not an MPI app - then
+     * we need to exchange our connection info here.
+     * MPI_Init has its own modex, so we don't need to do
+     * two of them. However, if we don't do a modex at all,
+     * then processes have no way to communicate
+     *
+     * NOTE: only do this when the process originally launches.
+     * Cannot do this on a restart as the rest of the processes
+     * in the job won't be executing this step, so we would hang
+     */
+    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
+        orte_grpcomm_collective_t coll;
+        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
+        coll.id = orte_process_info.peer_modex;
+        coll.active = true;
+        if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) {
+            ORTE_ERROR_LOG(ret);
+            error = "orte modex";
+            goto error;
+        }
+        ORTE_WAIT_FOR_COMPLETION(coll.active);
+        OBJ_DESTRUCT(&coll);
+    }
+
     /* flag that we completed init */
     app_init_complete = true;
 
@@ -446,7 +471,26 @@ static int rte_finalize(void)
     return ORTE_SUCCESS;
 }
 
-static void rte_abort(int error_code, bool report)
+static void rte_abort(int status, bool report)
 {
-    orte_ess_base_app_abort(error_code, report);
+    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
+                         "%s ess:pmi:abort: abort with status %d",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         status));
+
+    /* PMI doesn't like NULL messages, but our interface
+     * doesn't provide one - so rig one up here
+     */
+#if WANT_PMI2_SUPPORT
+    PMI2_Abort(status, "N/A");
+#else
+    PMI_Abort(status, "N/A");
+#endif
+
+    /* - Clean out the global structures
+     * (not really necessary, but good practice) */
+    orte_proc_info_finalize();
+
+    /* Now Exit */
+    exit(status);
 }
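The design change above: instead of delegating to orte_ess_base_app_abort(), the pmi ess module now reports the abort through PMI itself (PMI2_Abort or PMI_Abort, depending on the build) and exits with the real status, so a direct-launched job hands the MPI_Abort code straight to the resource manager.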
@@ -80,8 +80,8 @@ int orte_grpcomm_pmi_component_query(mca_base_module_t **module, int *priority)
 {
     /* only use PMI when direct launched */
     if (NULL == orte_process_info.my_hnp_uri &&
-        ORTE_PROC_IS_MPI &&
+        ORTE_PROC_IS_APP &&
         mca_common_pmi_init ()) {
         /* if PMI is available, make it available for use by MPI procs */
         *priority = my_priority;
         *module = (mca_base_module_t *)&orte_grpcomm_pmi_module;
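Loosening ORTE_PROC_IS_MPI to ORTE_PROC_IS_APP widens the selection to any direct-launched application process, which the non-MPI ORTE apps need now that they run their own modex (see the rte_init hunks above).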
@@ -6,11 +6,20 @@
  */
 
 #include <stdio.h>
+#include <stdlib.h>
 
 #include "mpi.h"
 
 int main(int argc, char* argv[])
 {
     int rank, size;
+    int errcode;
+
+    if (1 < argc) {
+        errcode = strtol(argv[1], NULL, 10);
+    } else {
+        errcode = 2;
+    }
 
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -18,8 +27,12 @@ int main(int argc, char* argv[])
 
     printf("Hello, World, I am %d of %d\n", rank, size);
 
-    if (1 == rank) MPI_Abort(MPI_COMM_WORLD, 2);
+    if (1 == rank) {
+        MPI_Abort(MPI_COMM_WORLD, errcode);
+    } else {
+        errcode = 0;
+    }
 
     MPI_Finalize();
-    return 0;
+    return errcode;
 }
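With the updated test, a quick direct-launch check (assuming the binary builds as ./abort; the actual target name depends on the test makefile) would be: srun -n 2 ./abort 3; echo $? -- the reported status should now be 3 rather than the hard-coded 1.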
@@ -38,6 +38,7 @@ int main(int argc, char* argv[])
 
     printf("orte_abort: Name %s Host: %s Pid %ld\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
            hostname, (long)pid);
+    fflush(stdout);
 
     i = 0;
     while (1) {
@@ -51,20 +51,22 @@ int main(int argc, char **argv, char **envp)
         goto done;
     }
 
-done:
-    if (PMI_TRUE == pmi_initialized) {
-        i = 0;
-        while (1) {
-            i++;
-            pi = i / 3.14159256;
-            if (i > 10000) i = 0;
-            if ((pmi_rank == 3 ||
-                 (pmi_process_group_size <= 3 && pmi_rank == 0))
-                && i == 9995) {
-                PMI_Abort(rc, "RANK0 CALLED ABORT");
-            }
-        }
-    }
+    i = 0;
+    while (1) {
+        i++;
+        pi = i / 3.14159256;
+        if (i > 10000) i = 0;
+        if ((pmi_rank == 3 ||
+             (pmi_process_group_size <= 3 && pmi_rank == 0))
+            && i == 9995) {
+            asprintf(&err, "RANK%d CALLED ABORT", pmi_rank);
+            fprintf(stderr, "%s\n", err);
+            fflush(stderr);
+            PMI_Abort(rc, err);
+        }
+    }
 
+done:
     if (NULL != err) {
         fprintf(stderr, "=== ERROR [rank:%d] %s\n", pmi_rank, err);
         rc = EXIT_FAILURE;