1
1

Fix missing functionality in MPI_Abort: before the local process exits, it now asks the runtime to abort the group of peer processes, defined by the communicator, that should be aborted along with it.

Per RFC:
  http://www.open-mpi.org/community/lists/devel/2011/06/9335.php

This commit was SVN r24775.
Этот коммит содержится в:
Josh Hursey 2011-06-15 13:10:13 +00:00
родитель 83154af74d
Коммит 0eb3b3b7b0
14 изменённых файлов: 231 добавлений и 21 удалений

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2006 University of Houston. All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -31,6 +32,7 @@
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "ompi/errhandler/errhandler_predefined.h"
#include "ompi/errhandler/errcode.h"
#include "ompi/communicator/communicator.h"
@ -197,13 +199,15 @@ static void backend_fatal_aggregate(char *type,
"mpi_errors_are_fatal", false,
prefix, (NULL == arg) ? "" : "in",
(NULL == arg) ? "" : arg,
prefix, type, name, prefix, err_msg, prefix);
prefix, ORTE_PROC_MY_NAME->jobid, ORTE_PROC_MY_NAME->vpid,
prefix, type, name, prefix, err_msg, prefix, type, prefix);
} else if (NULL == name) {
orte_show_help("help-mpi-errors.txt",
"mpi_errors_are_fatal unknown handle", false,
prefix, (NULL == arg) ? "" : "in",
(NULL == arg) ? "" : arg,
prefix, type, prefix, err_msg, prefix);
prefix, ORTE_PROC_MY_NAME->jobid, ORTE_PROC_MY_NAME->vpid,
prefix, type, prefix, err_msg, prefix, type, prefix);
}
if (err_msg_need_free) {
@ -302,7 +306,10 @@ static void backend_fatal_no_aggregate(char *type,
out("*** Error code: %d (no associated error message)\n", intbuf);
}
}
out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL);
/* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
out("*** and potentially your MPI job)\n", NULL);
}
va_end(arglist);
}

Просмотреть файл

@ -11,6 +11,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -23,13 +24,17 @@
#
[mpi_errors_are_fatal]
%s *** An error occurred %s %s
%s *** reported by process [%lu,%lu]
%s *** on %s %s
%s *** %s
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
%s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,
%s *** and potentially your MPI job)
#
[mpi_errors_are_fatal unknown handle]
%s *** An error occurred %s %s
%s *** reported by process [%lu,%lu]
%s *** on a NULL %s
%s *** %s
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
%s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,
%s *** and potentially your MPI job)
#

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -52,7 +53,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
int errcode,
bool kill_remote_of_intercomm)
{
int count = 0, i;
int count = 0, i, ret;
char *msg, *host, hostname[MAXHOSTNAMELEN];
pid_t pid = 0;
orte_process_name_t *abort_procs;
@ -200,12 +201,20 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
if (nabort_procs > 0) {
#if 0
int ret = orte_errmgr.abort_procs_request(abort_procs, nabort_procs);
if (OMPI_SUCCESS != ret) {
orte_errmgr.abort(ret, "Open MPI failed to abort procs as requested (%d). Exiting.", ret);
/* This must be implemented for MPI_Abort() to work according to the
* standard language for a 'high-quality' implementation.
* It would be nifty if we could differentiate between the
* abort scenarios:
* - MPI_Abort()
* - MPI_ERRORS_ARE_FATAL
* - Victim of MPI_Abort()
*/
/*
* Abort peers in this communicator group. Does not include self.
*/
if( OMPI_SUCCESS != (ret = orte_errmgr.abort_peers(abort_procs, nabort_procs)) ) {
orte_errmgr.abort(ret, "Open MPI failed to abort all of the procs requested (%d).", ret);
}
#endif
}
/* now that we've aborted everyone else, gracefully die. */

Просмотреть файл

@ -1,8 +1,9 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,6 +28,9 @@
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
@ -44,6 +48,8 @@ static int update_state(orte_jobid_t job,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
/******************
* APP module
@ -53,6 +59,7 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_app_abort_peers,
update_state,
NULL,
NULL,
@ -112,3 +119,50 @@ static int update_state(orte_jobid_t job,
}
return ORTE_SUCCESS;
}
/*
 * Request termination of a set of peer processes.
 *
 * Builds a single message containing, in order, the
 * ORTE_DAEMON_ABORT_PROCS_CALLED command flag, the number of names,
 * and each process name, then sends it to the HNP on the daemon tag.
 *
 * procs     - array of process names to be terminated
 * num_procs - number of entries in procs
 *
 * Returns ORTE_SUCCESS if the message was packed and handed to the RML,
 * otherwise the error code of the first pack/send call that failed.
 */
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_PROCS_CALLED;
    opal_buffer_t msg;
    orte_std_cntr_t idx;
    int status = ORTE_SUCCESS;
    int rc;

    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* command flag goes first so the receiver can dispatch on it */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

    /* then the number of names that follow */
    rc = opal_dss.pack(&msg, &num_procs, 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

    /* then the names themselves, one at a time */
    for (idx = 0; idx < num_procs; ++idx) {
        rc = opal_dss.pack(&msg, &procs[idx], 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            status = rc;
            goto done;
        }
    }

    /* hand the request to the HNP; only a negative return is treated as failure */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_DAEMON, 0);
    if (rc < 0) {
        ORTE_ERROR_LOG(rc);
        status = rc;
    }

 done:
    OBJ_DESTRUCT(&msg);
    return status;
}

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -256,6 +257,12 @@ void orte_errmgr_base_register_migration_warning(struct timeval *tv)
return;
}
/*
 * Base (fallback) abort_peers entry point.
 *
 * Performs no work and ignores both arguments; it always reports that
 * peer abort is not implemented.
 *
 * procs     - array of process names (unused)
 * num_procs - number of entries in procs (unused)
 *
 * Returns ORTE_ERR_NOT_IMPLEMENTED.
 */
int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    /* arguments are deliberately ignored by this stub */
    (void) procs;
    (void) num_procs;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
/********************
* Utility functions
********************/

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -60,6 +61,7 @@ orte_errmgr_base_module_t orte_errmgr = {
NULL, /* finalize */
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_base_abort_peers,
orte_errmgr_base_update_state,
NULL, /* predicted_fault */
NULL, /* suggest_map_targets */

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -67,6 +68,8 @@ ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line
ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
__opal_attribute_format__(__printf__, 2, 3)
__opal_attribute_noreturn__;
ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
orte_job_state_t jobstate,

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -172,6 +173,15 @@ typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename,
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
__opal_attribute_format_funcptr__(__printf__, 2, 3);
/**
 * Alert - abort peers
 * This function is called when a process wants to abort one or more peer processes.
 * For example, MPI_Abort(comm) will use this function to terminate peers in the
 * communicator group before aborting itself.
 *
 * @param procs      Array of names of the peer processes to abort.
 * @param num_procs  Number of entries in the procs array.
 *
 * @return ORTE_SUCCESS if the request was issued, otherwise an ORTE error
 *         code (for example ORTE_ERR_NOT_IMPLEMENTED from a module that
 *         does not provide this service).
 */
typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
/**
* Alert - process aborted
* This function is called by the PLM when a remote process aborts during execution. Actions taken
@ -254,6 +264,7 @@ struct orte_errmgr_base_module_2_3_0_t {
orte_errmgr_base_module_log_fn_t log;
orte_errmgr_base_module_abort_fn_t abort;
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
/** Actual process failure notification */
orte_errmgr_base_module_update_state_fn_t update_state;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -65,6 +65,8 @@ static orte_errmgr_base_module_t global_module = {
orte_errmgr_base_log,
/** Forced Abort */
orte_errmgr_base_abort,
/** Peer Force Abort */
orte_errmgr_base_abort_peers,
/** Update State */
orte_errmgr_hnp_global_update_state,
/* Predicted Fault */

Просмотреть файл

@ -1,8 +1,8 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -89,6 +89,7 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_base_abort_peers,
update_state,
predicted_fault,
suggest_map_targets,

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -279,7 +280,9 @@ int orte_ess_base_app_finalize(void)
orte_filem_base_close();
orte_wait_finalize();
orte_errmgr_base_close();
/* now can close the rml and its friendly group comm */
orte_grpcomm_base_close();
/* close the multicast */

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -862,22 +863,30 @@ static orte_proc_t* find_proc(orte_process_name_t *proc)
/*
 * Return the vpid of the daemon that hosts the given process.
 *
 * proc - name of the process to look up (may be NULL)
 *
 * Returns:
 *   - proc->vpid if proc is itself a daemon,
 *   - the vpid of the daemon on the node the process is mapped to,
 *   - ORTE_VPID_INVALID if proc is NULL, the process is unknown, it has
 *     no node assigned, or that node has no daemon.
 */
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
{
    orte_proc_t *pdata;

    if (NULL == proc) {
        return ORTE_VPID_INVALID;
    }

    /* a daemon hosts itself */
    if (ORTE_JOBID_IS_DAEMON(proc->jobid)) {
        return proc->vpid;
    }

    /* get the proc data (looked up exactly once) */
    pdata = find_proc(proc);
    if (NULL == pdata) {
        return ORTE_VPID_INVALID;
    }

    /* guard both links of the chain: a proc without a node, or a node
     * without a daemon, cannot be resolved to a daemon vpid
     */
    if (NULL == pdata->node || NULL == pdata->node->daemon) {
        return ORTE_VPID_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:hnp: proc %s is hosted by daemon %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         ORTE_VPID_PRINT(pdata->node->daemon->name.vpid)));

    return pdata->node->daemon->name.vpid;
}

Просмотреть файл

@ -8,6 +8,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -84,6 +85,8 @@ typedef uint8_t orte_daemon_cmd_flag_t;
/* process called "errmgr.abort" */
#define ORTE_DAEMON_ABORT_CALLED (orte_daemon_cmd_flag_t) 27
/* process called "errmgr.abort_peers" to request termination of a list of peer processes */
#define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28
/*
* List object to locally store the process names and pids of

Просмотреть файл

@ -13,6 +13,7 @@
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -85,6 +86,8 @@ static char *get_orted_comm_cmd_str(int command);
/* instantiate this - it is shared via orted.h */
struct timeval orte_daemon_msg_recvd;
/* Processes already ordered to terminate in response to an
 * ORTE_DAEMON_ABORT_PROCS_CALLED command. Created lazily on the first
 * such command and consulted on later ones so the same process is not
 * ordered to die more than once.
 */
static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL;
static struct timeval mesg_recvd={0,0};
static void send_relay(opal_buffer_t *buf)
@ -364,6 +367,10 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
opal_pointer_array_t procarray;
orte_proc_t *proct;
char *cmd_str = NULL;
opal_pointer_array_t *procs_to_kill = NULL;
orte_std_cntr_t num_procs, num_new_procs = 0, p;
orte_proc_t *cur_proc = NULL, *prev_proc = NULL;
bool found = false;
/* unpack the command */
n = 1;
@ -498,6 +505,89 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
orte_odls_base_default_report_abort(sender);
break;
case ORTE_DAEMON_ABORT_PROCS_CALLED:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received abort_procs report",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* Number of processes */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_procs, &n, ORTE_STD_CNTR)) ) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* Retrieve list of processes */
procs_to_kill = OBJ_NEW(opal_pointer_array_t);
opal_pointer_array_init(procs_to_kill, num_procs, INT32_MAX, 2);
/* Keep track of previously terminated, so we don't keep ordering the
* same processes to die.
*/
if( NULL == procs_prev_ordered_to_terminate ) {
procs_prev_ordered_to_terminate = OBJ_NEW(opal_pointer_array_t);
opal_pointer_array_init(procs_prev_ordered_to_terminate, num_procs+1, INT32_MAX, 8);
}
num_new_procs = 0;
for( i = 0; i < num_procs; ++i) {
cur_proc = OBJ_NEW(orte_proc_t);
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(cur_proc->name), &n, ORTE_NAME)) ) {
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* See if duplicate */
found = false;
for( p = 0; p < procs_prev_ordered_to_terminate->size; ++p) {
if( NULL == (prev_proc = (orte_proc_t*)opal_pointer_array_get_item(procs_prev_ordered_to_terminate, p))) {
continue;
}
if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
&cur_proc->name,
&prev_proc->name) ) {
found = true;
break;
}
}
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s orted:comm:abort_procs Application %s requests term. of %s (%2d of %2d) %3s.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender),
ORTE_NAME_PRINT(&(cur_proc->name)), i, num_procs,
(found ? "Dup" : "New") ));
/* If not a duplicate, then add to the to_kill list */
if( !found ) {
opal_pointer_array_add(procs_to_kill, (void*)cur_proc);
OBJ_RETAIN(cur_proc);
opal_pointer_array_add(procs_prev_ordered_to_terminate, (void*)cur_proc);
num_new_procs++;
}
}
/*
* Send the request to terminate
*/
if( num_new_procs > 0 ) {
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s orted:comm:abort_procs Terminating application requested processes (%2d / %2d).",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
num_new_procs, num_procs));
orte_plm.terminate_procs(procs_to_kill);
} else {
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
"%s orted:comm:abort_procs No new application processes to terminating from request (%2d / %2d).",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
num_new_procs, num_procs));
}
break;
/**** TREE_SPAWN ****/
case ORTE_DAEMON_TREE_SPAWN:
if (orte_debug_daemons_flag) {
@ -1270,6 +1360,10 @@ static char *get_orted_comm_cmd_str(int command)
return strdup("ORTE_DAEMON_SYNC_WANT_NIDMAP");
case ORTE_DAEMON_TOP_CMD:
return strdup("ORTE_DAEMON_TOP_CMD");
case ORTE_DAEMON_ABORT_CALLED:
return strdup("ORTE_DAEMON_ABORT_CALLED");
case ORTE_DAEMON_ABORT_PROCS_CALLED:
return strdup("ORTE_DAEMON_ABORT_PROCS_CALLED");
default:
return strdup("Unknown Command!");
}