Fix missing functionality in MPI_Abort: before the local process exits, ask the runtime to abort the group of peers, defined by the communicator, that should be aborted along with this process.
Per RFC: http://www.open-mpi.org/community/lists/devel/2011/06/9335.php This commit was SVN r24775.
Этот коммит содержится в:
родитель
83154af74d
Коммит
0eb3b3b7b0
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -31,6 +32,7 @@
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "ompi/errhandler/errhandler_predefined.h"
|
||||
#include "ompi/errhandler/errcode.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
@ -197,13 +199,15 @@ static void backend_fatal_aggregate(char *type,
|
||||
"mpi_errors_are_fatal", false,
|
||||
prefix, (NULL == arg) ? "" : "in",
|
||||
(NULL == arg) ? "" : arg,
|
||||
prefix, type, name, prefix, err_msg, prefix);
|
||||
prefix, ORTE_PROC_MY_NAME->jobid, ORTE_PROC_MY_NAME->vpid,
|
||||
prefix, type, name, prefix, err_msg, prefix, type, prefix);
|
||||
} else if (NULL == name) {
|
||||
orte_show_help("help-mpi-errors.txt",
|
||||
"mpi_errors_are_fatal unknown handle", false,
|
||||
prefix, (NULL == arg) ? "" : "in",
|
||||
(NULL == arg) ? "" : arg,
|
||||
prefix, type, prefix, err_msg, prefix);
|
||||
prefix, ORTE_PROC_MY_NAME->jobid, ORTE_PROC_MY_NAME->vpid,
|
||||
prefix, type, prefix, err_msg, prefix, type, prefix);
|
||||
}
|
||||
|
||||
if (err_msg_need_free) {
|
||||
@ -302,7 +306,10 @@ static void backend_fatal_no_aggregate(char *type,
|
||||
out("*** Error code: %d (no associated error message)\n", intbuf);
|
||||
}
|
||||
}
|
||||
out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL);
|
||||
/* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
|
||||
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
|
||||
out("*** and potentially your MPI job)\n", NULL);
|
||||
|
||||
}
|
||||
va_end(arglist);
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -23,13 +24,17 @@
|
||||
#
|
||||
[mpi_errors_are_fatal]
|
||||
%s *** An error occurred %s %s
|
||||
%s *** reported by process [%lu,%lu]
|
||||
%s *** on %s %s
|
||||
%s *** %s
|
||||
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
|
||||
%s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,
|
||||
%s *** and potentially your MPI job)
|
||||
#
|
||||
[mpi_errors_are_fatal unknown handle]
|
||||
%s *** An error occurred %s %s
|
||||
%s *** reported by process [%lu,%lu]
|
||||
%s *** on a NULL %s
|
||||
%s *** %s
|
||||
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
|
||||
%s *** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,
|
||||
%s *** and potentially your MPI job)
|
||||
#
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -52,7 +53,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
int errcode,
|
||||
bool kill_remote_of_intercomm)
|
||||
{
|
||||
int count = 0, i;
|
||||
int count = 0, i, ret;
|
||||
char *msg, *host, hostname[MAXHOSTNAMELEN];
|
||||
pid_t pid = 0;
|
||||
orte_process_name_t *abort_procs;
|
||||
@ -200,12 +201,20 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
|
||||
if (nabort_procs > 0) {
|
||||
#if 0
|
||||
int ret = orte_errmgr.abort_procs_request(abort_procs, nabort_procs);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
orte_errmgr.abort(ret, "Open MPI failed to abort procs as requested (%d). Exiting.", ret);
|
||||
/* This must be implemented for MPI_Abort() to work according to the
|
||||
* standard language for a 'high-quality' implementation.
|
||||
* It would be nifty if we could differentiate between the
|
||||
* abort scenarios:
|
||||
* - MPI_Abort()
|
||||
* - MPI_ERRORS_ARE_FATAL
|
||||
* - Victim of MPI_Abort()
|
||||
*/
|
||||
/*
|
||||
* Abort peers in this communicator group. Does not include self.
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = orte_errmgr.abort_peers(abort_procs, nabort_procs)) ) {
|
||||
orte_errmgr.abort(ret, "Open MPI failed to abort all of the procs requested (%d).", ret);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* now that we've aborted everyone else, gracefully die. */
|
||||
|
@ -1,8 +1,9 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,6 +28,9 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
@ -44,6 +48,8 @@ static int update_state(orte_jobid_t job,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
/******************
|
||||
* App module
|
||||
@ -53,6 +59,7 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_app_abort_peers,
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
@ -112,3 +119,50 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Ask the HNP to terminate a list of peer processes.
 *
 * Builds a single message containing, in order: the
 * ORTE_DAEMON_ABORT_PROCS_CALLED command flag, the number of processes,
 * and each process name. The message is then sent to ORTE_PROC_MY_HNP on
 * ORTE_RML_TAG_DAEMON.
 *
 * procs      array of process names to be aborted
 * num_procs  number of entries in procs
 *
 * Returns ORTE_SUCCESS when every pack call succeeded and the send did
 * not return a negative value; otherwise returns the value reported by
 * the failing call. The message buffer is destructed on every path.
 */
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_PROCS_CALLED;
    opal_buffer_t msg;
    orte_std_cntr_t idx;
    int rc;
    int status = ORTE_SUCCESS;

    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* the command flag leads the message */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

    /* followed by how many names are in the list */
    rc = opal_dss.pack(&msg, &num_procs, 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

    /* and then the names themselves, one at a time */
    idx = 0;
    while (idx < num_procs) {
        rc = opal_dss.pack(&msg, &procs[idx], 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            status = rc;
            goto done;
        }
        ++idx;
    }

    /* hand the request to the HNP; only a negative return is a failure */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_DAEMON, 0);
    if (rc < 0) {
        ORTE_ERROR_LOG(rc);
        status = rc;
    }

 done:
    OBJ_DESTRUCT(&msg);

    return status;
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -256,6 +257,12 @@ void orte_errmgr_base_register_migration_warning(struct timeval *tv)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Base implementation of the abort_peers entry point.
 *
 * It performs no action on the supplied process list and always reports
 * ORTE_ERR_NOT_IMPLEMENTED to the caller.
 */
int orte_errmgr_base_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    /* neither argument is used by the base version */
    (void)procs;
    (void)num_procs;

    return ORTE_ERR_NOT_IMPLEMENTED;
}
|
||||
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
********************/
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -60,6 +61,7 @@ orte_errmgr_base_module_t orte_errmgr = {
|
||||
NULL, /* finalize */
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
orte_errmgr_base_update_state,
|
||||
NULL, /* predicted_fault */
|
||||
NULL, /* suggest_map_targets */
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -67,6 +68,8 @@ ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line
|
||||
ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
__opal_attribute_noreturn__;
|
||||
ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -172,6 +173,15 @@ typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename,
|
||||
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
|
||||
__opal_attribute_format_funcptr__(__printf__, 2, 3);
|
||||
|
||||
/**
|
||||
* Alert - abort peers
|
||||
* This function is called when a process wants to abort one or more peer processes.
|
||||
* For example, MPI_Abort(comm) will use this function to terminate peers in the
|
||||
* communicator group before aborting itself.
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
/**
|
||||
* Alert - process aborted
|
||||
* This function is called by the PLM when a remote process aborts during execution. Actions taken
|
||||
@ -254,6 +264,7 @@ struct orte_errmgr_base_module_2_3_0_t {
|
||||
|
||||
orte_errmgr_base_module_log_fn_t log;
|
||||
orte_errmgr_base_module_abort_fn_t abort;
|
||||
orte_errmgr_base_module_abort_peers_fn_t abort_peers;
|
||||
|
||||
/** Actual process failure notification */
|
||||
orte_errmgr_base_module_update_state_fn_t update_state;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -65,6 +65,8 @@ static orte_errmgr_base_module_t global_module = {
|
||||
orte_errmgr_base_log,
|
||||
/** Forced Abort */
|
||||
orte_errmgr_base_abort,
|
||||
/** Peer Force Abort */
|
||||
orte_errmgr_base_abort_peers,
|
||||
/** Update State */
|
||||
orte_errmgr_hnp_global_update_state,
|
||||
/* Predicted Fault */
|
||||
|
@ -1,8 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -89,6 +89,7 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
update_state,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -279,7 +280,9 @@ int orte_ess_base_app_finalize(void)
|
||||
orte_filem_base_close();
|
||||
|
||||
orte_wait_finalize();
|
||||
|
||||
|
||||
orte_errmgr_base_close();
|
||||
|
||||
/* now can close the rml and its friendly group comm */
|
||||
orte_grpcomm_base_close();
|
||||
/* close the multicast */
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -862,22 +863,30 @@ static orte_proc_t* find_proc(orte_process_name_t *proc)
|
||||
/*
 * Return the vpid of the daemon that hosts the given process.
 *
 * proc  name of the process to look up (may be NULL)
 *
 * Returns:
 *   - ORTE_VPID_INVALID if proc is NULL, if find_proc() cannot locate
 *     the process, or if the process has no node or the node has no
 *     daemon recorded;
 *   - proc->vpid if the name belongs to the daemon job (a daemon is
 *     reported as its own host);
 *   - otherwise the vpid stored in the hosting node's daemon name.
 */
static orte_vpid_t proc_get_daemon(orte_process_name_t *proc)
{
    orte_proc_t *pdata;

    if( NULL == proc ) {
        return ORTE_VPID_INVALID;
    }

    if( ORTE_JOBID_IS_DAEMON(proc->jobid) ) {
        return proc->vpid;
    }

    /* look up the process object; one lookup is sufficient */
    if (NULL == (pdata = find_proc(proc))) {
        return ORTE_VPID_INVALID;
    }

    /* guard both links before dereferencing node->daemon */
    if( NULL == pdata->node || NULL == pdata->node->daemon ) {
        return ORTE_VPID_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:hnp: proc %s is hosted by daemon %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         ORTE_VPID_PRINT(pdata->node->daemon->name.vpid)));

    return pdata->node->daemon->name.vpid;
}
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -84,6 +85,8 @@ typedef uint8_t orte_daemon_cmd_flag_t;
|
||||
|
||||
/* process called "errmgr.abort" */
|
||||
#define ORTE_DAEMON_ABORT_CALLED (orte_daemon_cmd_flag_t) 27
|
||||
/* process called "errmgr.abort_peers" */
|
||||
#define ORTE_DAEMON_ABORT_PROCS_CALLED (orte_daemon_cmd_flag_t) 28
|
||||
|
||||
/*
|
||||
* List object to locally store the process names and pids of
|
||||
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -85,6 +86,8 @@ static char *get_orted_comm_cmd_str(int command);
|
||||
/* instantiate this - it is shared via orted.h */
|
||||
struct timeval orte_daemon_msg_recvd;
|
||||
|
||||
static opal_pointer_array_t *procs_prev_ordered_to_terminate = NULL;
|
||||
|
||||
static struct timeval mesg_recvd={0,0};
|
||||
|
||||
static void send_relay(opal_buffer_t *buf)
|
||||
@ -364,6 +367,10 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
||||
opal_pointer_array_t procarray;
|
||||
orte_proc_t *proct;
|
||||
char *cmd_str = NULL;
|
||||
opal_pointer_array_t *procs_to_kill = NULL;
|
||||
orte_std_cntr_t num_procs, num_new_procs = 0, p;
|
||||
orte_proc_t *cur_proc = NULL, *prev_proc = NULL;
|
||||
bool found = false;
|
||||
|
||||
/* unpack the command */
|
||||
n = 1;
|
||||
@ -498,6 +505,89 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
||||
orte_odls_base_default_report_abort(sender);
|
||||
break;
|
||||
|
||||
case ORTE_DAEMON_ABORT_PROCS_CALLED:
|
||||
if (orte_debug_daemons_flag) {
|
||||
opal_output(0, "%s orted_cmd: received abort_procs report",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
|
||||
/* Number of processes */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_procs, &n, ORTE_STD_CNTR)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* Retrieve list of processes */
|
||||
procs_to_kill = OBJ_NEW(opal_pointer_array_t);
|
||||
opal_pointer_array_init(procs_to_kill, num_procs, INT32_MAX, 2);
|
||||
|
||||
/* Keep track of previously terminated, so we don't keep ordering the
|
||||
* same processes to die.
|
||||
*/
|
||||
if( NULL == procs_prev_ordered_to_terminate ) {
|
||||
procs_prev_ordered_to_terminate = OBJ_NEW(opal_pointer_array_t);
|
||||
opal_pointer_array_init(procs_prev_ordered_to_terminate, num_procs+1, INT32_MAX, 8);
|
||||
}
|
||||
|
||||
num_new_procs = 0;
|
||||
for( i = 0; i < num_procs; ++i) {
|
||||
cur_proc = OBJ_NEW(orte_proc_t);
|
||||
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(cur_proc->name), &n, ORTE_NAME)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* See if duplicate */
|
||||
found = false;
|
||||
for( p = 0; p < procs_prev_ordered_to_terminate->size; ++p) {
|
||||
if( NULL == (prev_proc = (orte_proc_t*)opal_pointer_array_get_item(procs_prev_ordered_to_terminate, p))) {
|
||||
continue;
|
||||
}
|
||||
if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
|
||||
&cur_proc->name,
|
||||
&prev_proc->name) ) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s orted:comm:abort_procs Application %s requests term. of %s (%2d of %2d) %3s.",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(sender),
|
||||
ORTE_NAME_PRINT(&(cur_proc->name)), i, num_procs,
|
||||
(found ? "Dup" : "New") ));
|
||||
|
||||
/* If not a duplicate, then add to the to_kill list */
|
||||
if( !found ) {
|
||||
opal_pointer_array_add(procs_to_kill, (void*)cur_proc);
|
||||
OBJ_RETAIN(cur_proc);
|
||||
opal_pointer_array_add(procs_prev_ordered_to_terminate, (void*)cur_proc);
|
||||
num_new_procs++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Send the request to terminate
|
||||
*/
|
||||
if( num_new_procs > 0 ) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s orted:comm:abort_procs Terminating application requested processes (%2d / %2d).",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
num_new_procs, num_procs));
|
||||
orte_plm.terminate_procs(procs_to_kill);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
|
||||
"%s orted:comm:abort_procs No new application processes to terminating from request (%2d / %2d).",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
num_new_procs, num_procs));
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
/**** TREE_SPAWN ****/
|
||||
case ORTE_DAEMON_TREE_SPAWN:
|
||||
if (orte_debug_daemons_flag) {
|
||||
@ -1270,6 +1360,10 @@ static char *get_orted_comm_cmd_str(int command)
|
||||
return strdup("ORTE_DAEMON_SYNC_WANT_NIDMAP");
|
||||
case ORTE_DAEMON_TOP_CMD:
|
||||
return strdup("ORTE_DAEMON_TOP_CMD");
|
||||
case ORTE_DAEMON_ABORT_CALLED:
|
||||
return strdup("ORTE_DAEMON_ABORT_CALLED");
|
||||
case ORTE_DAEMON_ABORT_PROCS_CALLED:
|
||||
return strdup("ORTE_DAEMON_ABORT_PROCS_CALLED");
|
||||
default:
|
||||
return strdup("Unknown Command!");
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user