Sorry for the mid-day commit, but I had promised on the call to do this upon my return.
Roll in the ORTE state machine. Remove last traces of opal_sos. Remove UTK epoch code. Please see the various emails about the state machine change for details. I'll send something out later with more info on the new arch. This commit was SVN r26242.
Этот коммит содержится в:
родитель
55d9c71ce0
Коммит
bd8b4f7f1e
@ -32,6 +32,8 @@ my @globals = qw/.libs
|
||||
*.orig
|
||||
*.rej
|
||||
*.class
|
||||
*.xcscheme
|
||||
*.plist
|
||||
.git*
|
||||
.DS_Store
|
||||
stamp-h[1-9]
|
||||
|
28
contrib/platform/iu/odin/debug-nopmi
Обычный файл
28
contrib/platform/iu/odin/debug-nopmi
Обычный файл
@ -0,0 +1,28 @@
|
||||
enable_opal_multi_threads=no
|
||||
enable_dlopen=no
|
||||
enable_pty_support=no
|
||||
with_blcr=no
|
||||
with_openib=no
|
||||
with_memory_manager=no
|
||||
enable_mem_debug=yes
|
||||
enable_mem_profile=no
|
||||
enable_debug_symbols=yes
|
||||
enable_binaries=yes
|
||||
with_devel_headers=yes
|
||||
enable_heterogeneous=no
|
||||
enable_picky=yes
|
||||
enable_debug=yes
|
||||
enable_shared=yes
|
||||
enable_static=no
|
||||
with_slurm=yes
|
||||
with_pmi=no
|
||||
enable_contrib_no_build=libnbc,vt
|
||||
enable_visibility=yes
|
||||
enable_memchecker=no
|
||||
enable_ipv6=no
|
||||
enable_mpi_f77=no
|
||||
enable_mpi_f90=no
|
||||
enable_mpi_cxx=no
|
||||
enable_mpi_cxx_seek=no
|
||||
enable_mca_no_build=pml-dr,pml-crcp2,crcp
|
||||
enable_io_romio=no
|
85
contrib/platform/iu/odin/debug-nopmi.conf
Обычный файл
85
contrib/platform/iu/odin/debug-nopmi.conf
Обычный файл
@ -0,0 +1,85 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# This is the default system-wide MCA parameters file.
|
||||
# Specifically, the MCA parameter "mca_param_files" defaults to a
|
||||
# value of
|
||||
# "$HOME/.openmpi/mca-params.conf:$sysconf/openmpi-mca-params.conf"
|
||||
# (this file is the latter of the two). So if the default value of
|
||||
# mca_param_files is not changed, this file is used to set system-wide
|
||||
# MCA parameters. This file can therefore be used to set system-wide
|
||||
# default MCA parameters for all users. Of course, users can override
|
||||
# these values if they want, but this file is an excellent location
|
||||
# for setting system-specific MCA parameters for those users who don't
|
||||
# know / care enough to investigate the proper values for them.
|
||||
|
||||
# Note that this file is only applicable where it is visible (in a
|
||||
# filesystem sense). Specifically, MPI processes each read this file
|
||||
# during their startup to determine what default values for MCA
|
||||
# parameters should be used. mpirun does not bundle up the values in
|
||||
# this file from the node where it was run and send them to all nodes;
|
||||
# the default value decisions are effectively distributed. Hence,
|
||||
# these values are only applicable on nodes that "see" this file. If
|
||||
# $sysconf is a directory on a local disk, it is likely that changes
|
||||
# to this file will need to be propagated to other nodes. If $sysconf
|
||||
# is a directory that is shared via a networked filesystem, changes to
|
||||
# this file will be visible to all nodes that share this $sysconf.
|
||||
|
||||
# The format is straightforward: one per line, mca_param_name =
|
||||
# rvalue. Quoting is ignored (so if you use quotes or escape
|
||||
# characters, they'll be included as part of the value). For example:
|
||||
|
||||
# Disable run-time MPI parameter checking
|
||||
# mpi_param_check = 0
|
||||
|
||||
# Note that the value "~/" will be expanded to the current user's home
|
||||
# directory. For example:
|
||||
|
||||
# Change component loading path
|
||||
# component_path = /usr/local/lib/openmpi:~/my_openmpi_components
|
||||
|
||||
# See "ompi_info --param all all" for a full listing of Open MPI MCA
|
||||
# parameters available and their default values.
|
||||
#
|
||||
|
||||
# Basic behavior to smooth startup
|
||||
mca_component_show_load_errors = 0
|
||||
mpi_param_check = 0
|
||||
orte_abort_timeout = 10
|
||||
hwloc_base_mem_bind_failure_action = silent
|
||||
|
||||
## Protect the shared file systems
|
||||
|
||||
## Add the interface for out-of-band communication
|
||||
## and set it up
|
||||
oob_tcp_peer_retries = 120
|
||||
oob_tcp_disable_family = IPv6
|
||||
#oob_tcp_connect_timeout=600
|
||||
|
||||
## Define the MPI interconnects
|
||||
btl = sm,tcp,self
|
||||
|
||||
## Setup shared memory
|
||||
btl_sm_free_list_max = 768
|
||||
|
||||
## Setup TCP
|
||||
btl_tcp_if_include = ib0
|
||||
|
||||
## Configure the PML
|
||||
pml_ob1_use_early_completion = 0
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -195,7 +197,6 @@
|
||||
#include "ompi/attribute/attribute.h"
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
@ -1151,7 +1152,7 @@ static int get_value(opal_hash_table_t *attr_hash, int key,
|
||||
(void**) &keyval);
|
||||
OPAL_THREAD_UNLOCK(&keyval_hash_lock);
|
||||
|
||||
if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_FOUND == ret) {
|
||||
return MPI_KEYVAL_INVALID;
|
||||
}
|
||||
|
||||
|
@ -14,6 +14,8 @@
|
||||
* Copyright (c) 2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -32,7 +34,6 @@
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "ompi/request/request.h"
|
||||
@ -145,7 +146,7 @@ int ompi_comm_cid_init (void)
|
||||
ompi_comm_world_thread_level_mult = 1;
|
||||
break;
|
||||
}
|
||||
} else if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
} else if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
|
||||
if (ompi_mpi_thread_multiple) {
|
||||
ompi_comm_world_thread_level_mult = 1;
|
||||
}
|
||||
|
@ -32,6 +32,9 @@
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#ifdef HAVE_LIMITS_H
|
||||
#include <limits.h>
|
||||
#endif
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
@ -12,6 +12,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,7 +31,6 @@
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#define OMPI_MAX_ERROR_STRING 64
|
||||
|
||||
@ -51,18 +52,13 @@ OMPI_DECLSPEC extern opal_pointer_array_t ompi_errcodes_intern;
|
||||
OMPI_DECLSPEC extern int ompi_errcode_intern_lastused;
|
||||
|
||||
/**
|
||||
* Return the MPI errcode for a given internal error code. This
|
||||
* function guarantees to return a non-OPAL_SOS-encoded error code.
|
||||
*/
|
||||
* Return the MPI errcode for a given internal error code. */
|
||||
static inline int ompi_errcode_get_mpi_code(int errcode)
|
||||
{
|
||||
int ret = MPI_ERR_UNKNOWN;
|
||||
int i;
|
||||
ompi_errcode_intern_t *errc;
|
||||
|
||||
/* Transmogrify, if necessary */
|
||||
errcode = OPAL_SOS_GET_ERROR_CODE(errcode);
|
||||
|
||||
/* If the errcode is >= 0, then it's already an MPI error code, so
|
||||
just return it. */
|
||||
if (errcode >= 0) {
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -31,7 +33,6 @@
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
|
||||
@ -273,7 +274,7 @@ static inline int mca_bml_base_send( mca_bml_base_btl_t* bml_btl,
|
||||
|
||||
des->des_context = (void*) bml_btl;
|
||||
rc = btl->btl_send(btl, bml_btl->btl_endpoint, des, tag);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_RESOURCE_BUSY)
|
||||
if (rc == OMPI_ERR_RESOURCE_BUSY)
|
||||
rc = OMPI_SUCCESS;
|
||||
|
||||
return rc;
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -404,7 +404,7 @@ static int mca_bml_r2_add_procs( size_t nprocs,
|
||||
}
|
||||
|
||||
if (mca_bml_r2.show_unreach_errors &&
|
||||
OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_UNREACH == ret) {
|
||||
orte_show_help("help-mca-bml-r2.txt",
|
||||
"unreachable proc",
|
||||
true,
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -25,12 +25,16 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include "ompi/mca/bml/base/bml_base_btl.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#include "bml_r2.h"
|
||||
@ -47,6 +51,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
int loc_state;
|
||||
int param_type = -1;
|
||||
char *param_list = NULL;
|
||||
orte_grpcomm_collective_t coll;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Do nothing for now */
|
||||
@ -153,10 +158,15 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make sure all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
|
||||
coll.id = orte_process_info.peer_init_barrier;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
while (coll.active) {
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
@ -226,10 +236,15 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make sure all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
|
||||
coll.id = orte_process_info.peer_init_barrier;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
while (coll.active) {
|
||||
opal_progress();
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
|
@ -12,6 +12,8 @@
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -34,7 +36,6 @@
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "opal/mca/timer/base/base.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -34,7 +34,6 @@
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/include/opal_stdint.h"
|
||||
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
@ -303,7 +302,7 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
|
||||
|
||||
/* Check if our device supports modify srq ability */
|
||||
rc = check_if_device_support_modify_srq(openib_btl);
|
||||
if(OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_NOT_SUPPORTED == rc) {
|
||||
device_support_modify_srq = false;
|
||||
} else if(OMPI_SUCCESS != rc) {
|
||||
mca_btl_openib_show_init_error(__FILE__, __LINE__,
|
||||
@ -494,7 +493,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
endpoint->rem_info.rem_vendor_part_id, &values);
|
||||
|
||||
if (OMPI_SUCCESS != ret &&
|
||||
OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_NOT_FOUND != ret) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"error in device init", true,
|
||||
orte_process_info.nodename,
|
||||
@ -1625,7 +1624,7 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_RESOURCE_BUSY == rc)
|
||||
return OMPI_SUCCESS;
|
||||
if(OMPI_SUCCESS != rc)
|
||||
return rc;
|
||||
@ -1696,7 +1695,7 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl,
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_RESOURCE_BUSY == rc)
|
||||
return OMPI_SUCCESS;
|
||||
if(OMPI_SUCCESS != rc)
|
||||
return rc;
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -1125,8 +1125,8 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device)
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
/* If we're "out of memory", this usually means that we ran
|
||||
out of registered memory, so show that error message */
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) {
|
||||
errno = ENOMEM;
|
||||
mca_btl_openib_show_init_error(__FILE__, __LINE__,
|
||||
"ompi_free_list_init_ex_new",
|
||||
@ -1161,8 +1161,8 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device)
|
||||
/* If we're "out of memory", this usually means that we
|
||||
ran out of registered memory, so show that error
|
||||
message */
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) {
|
||||
errno = ENOMEM;
|
||||
mca_btl_openib_show_init_error(__FILE__, __LINE__,
|
||||
"ompi_free_list_init_ex_new",
|
||||
@ -1658,11 +1658,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
device->ib_dev_attr.vendor_part_id,
|
||||
&values);
|
||||
if (OMPI_SUCCESS != ret &&
|
||||
OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_NOT_FOUND != ret) {
|
||||
/* If we get a serious error, propagate it upwards */
|
||||
goto error;
|
||||
}
|
||||
if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_FOUND == ret) {
|
||||
/* If we didn't find a matching device in the INI files, output a
|
||||
warning that we're using default values (unless overridden
|
||||
to indicate that we don't want to see these warnings) */
|
||||
@ -1679,7 +1679,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
be set indicating that it does not have good values */
|
||||
ret = ompi_btl_openib_ini_query(0, 0, &default_values);
|
||||
if (OMPI_SUCCESS != ret &&
|
||||
OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_NOT_FOUND != ret) {
|
||||
/* If we get a serious error, propagate it upwards */
|
||||
goto error;
|
||||
}
|
||||
@ -1841,7 +1841,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
device, &mpool_resources);
|
||||
if(NULL == device->mpool){
|
||||
/* Don't print an error message here -- we'll get one from
|
||||
mpool_create anyway (OPAL_SOS would be good here...) */
|
||||
mpool_create anyway */
|
||||
goto error;
|
||||
}
|
||||
|
||||
@ -1899,7 +1899,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
/* Out of bounds error indicates that we hit max btl number
|
||||
* don't propagate the error to the caller */
|
||||
if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) {
|
||||
ret = OMPI_SUCCESS;
|
||||
}
|
||||
break;
|
||||
@ -2830,7 +2830,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
/* If we get NOT_SUPPORTED, then no CPC was found for this
|
||||
port. But that's not a fatal error -- just keep going;
|
||||
let's see if we find any usable openib modules or not. */
|
||||
if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_SUPPORTED == ret) {
|
||||
continue;
|
||||
} else if (OMPI_SUCCESS != ret) {
|
||||
/* All others *are* fatal. Note that we already did a
|
||||
@ -2994,7 +2994,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep)
|
||||
error upward. */
|
||||
rc = mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc &&
|
||||
OMPI_ERR_RESOURCE_BUSY != OPAL_SOS_GET_ERROR_CODE(rc))) {
|
||||
OMPI_ERR_RESOURCE_BUSY != rc)) {
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return rc;
|
||||
}
|
||||
@ -3023,7 +3023,7 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
break;
|
||||
rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
|
||||
@ -3036,7 +3036,7 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
break;
|
||||
rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
|
||||
@ -36,7 +36,6 @@
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
@ -714,7 +713,7 @@ int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep,
|
||||
rc = mca_btl_openib_endpoint_post_send(ep, frag);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))) {
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == rc)) {
|
||||
rc = OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -898,7 +897,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
|
||||
));
|
||||
}
|
||||
rc = mca_btl_openib_endpoint_send(endpoint, frag);
|
||||
if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc)
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
|
@ -1,6 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -28,8 +30,6 @@
|
||||
#include "btl_openib_proc.h"
|
||||
#include "btl_openib_failover.h"
|
||||
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_base_module_t* module,
|
||||
bool errout);
|
||||
@ -691,7 +691,7 @@ static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, ui
|
||||
BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
|
||||
}
|
||||
rc = mca_btl_openib_endpoint_send(newep, frag);
|
||||
if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,7 +31,6 @@
|
||||
#endif
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "btl_openib.h"
|
||||
@ -133,13 +134,13 @@ int ompi_btl_openib_ini_init(void)
|
||||
/* Note that NOT_FOUND and SUCCESS are not fatal errors
|
||||
and we keep going. Other errors are treated as
|
||||
fatal */
|
||||
if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) {
|
||||
if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) {
|
||||
break;
|
||||
}
|
||||
str = colon + 1;
|
||||
}
|
||||
/* Parse the last file if we didn't have a fatal error above */
|
||||
if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) {
|
||||
if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) {
|
||||
ret = parse_file(str);
|
||||
}
|
||||
|
||||
@ -150,7 +151,7 @@ int ompi_btl_openib_ini_init(void)
|
||||
/* Return SUCCESS unless we got a fatal error */
|
||||
|
||||
initialized = true;
|
||||
return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) ?
|
||||
return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == ret) ?
|
||||
OMPI_SUCCESS : ret;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -28,7 +30,6 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
/*
|
||||
* Array of all possible connection functions
|
||||
@ -219,7 +220,7 @@ int ompi_btl_openib_connect_base_init(void)
|
||||
opal_output(-1, "found available cpc (SUCCESS init): %s",
|
||||
all[i]->cbc_name);
|
||||
continue;
|
||||
} else if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
} else if (OMPI_ERR_NOT_SUPPORTED == rc) {
|
||||
continue;
|
||||
} else {
|
||||
return rc;
|
||||
@ -265,8 +266,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *
|
||||
strcat(msg, available[i]->cbc_name);
|
||||
|
||||
rc = available[i]->cbc_query(btl, &cpcs[cpc_index]);
|
||||
if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc) ||
|
||||
OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_NOT_SUPPORTED == rc || OMPI_ERR_UNREACH == rc) {
|
||||
continue;
|
||||
} else if (OMPI_SUCCESS != rc) {
|
||||
free(cpcs);
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2011 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2009-2011 IBM Corporation. All rights reserved.
|
||||
@ -30,7 +30,6 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -4,6 +4,8 @@
|
||||
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -46,7 +48,6 @@
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
#include "btl_openib_fd.h"
|
||||
@ -1932,7 +1933,7 @@ out3:
|
||||
out1:
|
||||
free(*cpc);
|
||||
out:
|
||||
if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_NOT_SUPPORTED == rc) {
|
||||
opal_output_verbose(5, mca_btl_base_output,
|
||||
"openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped",
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
|
@ -5,6 +5,8 @@
|
||||
* Copyright (c) 2010-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -21,7 +23,6 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -698,10 +699,8 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
|
||||
|
||||
BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
|
||||
"jobid %d, vpid %d, "
|
||||
"epoch %d, "
|
||||
"sid %" PRIx64 ", lid %d",
|
||||
process_name->jobid, process_name->vpid,
|
||||
ORTE_EPOCH_GET(process_name),
|
||||
subnet_id, lid));
|
||||
|
||||
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,7 +31,6 @@
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "btl_portals.h"
|
||||
#include "btl_portals_endpoint.h"
|
||||
|
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -67,7 +69,7 @@ OBJ_CLASS_DECLARATION(mca_btl_portals_frag_recv_t);
|
||||
ompi_free_list_item_t *item; \
|
||||
OMPI_FREE_LIST_GET(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \
|
||||
frag = (mca_btl_portals_frag_t*) item; \
|
||||
if (OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \
|
||||
if (rc == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { \
|
||||
OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc); \
|
||||
} \
|
||||
}
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -24,7 +26,6 @@
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "btl_portals.h"
|
||||
#include "btl_portals_send.h"
|
||||
|
@ -12,6 +12,8 @@
|
||||
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Oak Ridge National Laboratory
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,7 +55,6 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "orte/types.h"
|
||||
@ -1055,7 +1056,7 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules,
|
||||
}
|
||||
#if OPAL_WANT_IPV6
|
||||
if((ret = mca_btl_tcp_component_create_listen(AF_INET6)) != OMPI_SUCCESS) {
|
||||
if (!(OMPI_ERR_IN_ERRNO == OPAL_SOS_GET_ERROR_CODE(ret) &&
|
||||
if (!(OMPI_ERR_IN_ERRNO == ret &&
|
||||
EAFNOSUPPORT == opal_socket_errno)) {
|
||||
opal_output (0, "mca_btl_tcp_component: IPv6 listening socket failed\n");
|
||||
return 0;
|
||||
|
@ -13,7 +13,6 @@
|
||||
#include "opal/include/opal_stdint.h"
|
||||
|
||||
#include "btl_ugni_rdma.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
/**
|
||||
* Initiate a put operation.
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -32,7 +32,6 @@
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
@ -309,7 +308,7 @@ static int mca_btl_wv_tune_endpoint(mca_btl_wv_module_t* wv_btl,
|
||||
endpoint->rem_info.rem_vendor_part_id, &values);
|
||||
|
||||
if (OMPI_SUCCESS != ret &&
|
||||
OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_NOT_FOUND != ret) {
|
||||
orte_show_help("help-mpi-btl-wv.txt",
|
||||
"error in device init", true,
|
||||
orte_process_info.nodename,
|
||||
@ -1347,7 +1346,7 @@ int mca_btl_wv_put(mca_btl_base_module_t* btl,
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_RESOURCE_BUSY == rc)
|
||||
return OMPI_SUCCESS;
|
||||
if(OMPI_SUCCESS != rc)
|
||||
return rc;
|
||||
@ -1406,7 +1405,7 @@ int mca_btl_wv_get(mca_btl_base_module_t* btl,
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags);
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_RESOURCE_BUSY == rc)
|
||||
return OMPI_SUCCESS;
|
||||
if(OMPI_SUCCESS != rc)
|
||||
return rc;
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -784,8 +784,8 @@ static int prepare_device_for_use(mca_btl_wv_device_t *device)
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
/* If we're "out of memory", this usually means that we ran
|
||||
out of registered memory, so show that error message */
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) {
|
||||
errno = ENOMEM;
|
||||
mca_btl_wv_show_init_error(__FILE__, __LINE__,
|
||||
"ompi_free_list_init_ex_new",
|
||||
@ -820,8 +820,8 @@ static int prepare_device_for_use(mca_btl_wv_device_t *device)
|
||||
/* If we're "out of memory", this usually means that we
|
||||
ran out of registered memory, so show that error
|
||||
message */
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc) ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc ||
|
||||
OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) {
|
||||
errno = ENOMEM;
|
||||
mca_btl_wv_show_init_error(__FILE__, __LINE__,
|
||||
"ompi_free_list_init_ex_new",
|
||||
@ -1312,11 +1312,11 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev)
|
||||
device->ib_dev_attr.VendorPartId,
|
||||
&values);
|
||||
if (OMPI_SUCCESS != ret &&
|
||||
OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_NOT_FOUND != ret) {
|
||||
/* If we get a serious error, propagate it upwards */
|
||||
goto error;
|
||||
}
|
||||
if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_FOUND == ret) {
|
||||
/* If we didn't find a matching device in the INI files, output a
|
||||
warning that we're using default values (unless overridden
|
||||
that we don't want to see these warnings) */
|
||||
@ -1333,7 +1333,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev)
|
||||
be set indicating that it does not have good values */
|
||||
ret = ompi_btl_wv_ini_query(0, 0, &default_values);
|
||||
if (OMPI_SUCCESS != ret &&
|
||||
OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
OMPI_ERR_NOT_FOUND != ret) {
|
||||
/* If we get a serious error, propagate it upwards */
|
||||
goto error;
|
||||
}
|
||||
@ -1429,7 +1429,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev)
|
||||
device, &mpool_resources);
|
||||
if(NULL == device->mpool){
|
||||
/* Don't print an error message here -- we'll get one from
|
||||
mpool_create anyway (OPAL_SOS would be good here...) */
|
||||
mpool_create anyway */
|
||||
goto error;
|
||||
}
|
||||
|
||||
@ -1481,7 +1481,7 @@ static int init_one_device(opal_list_t *btl_list, struct wv_device* ib_dev)
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
/* Out of bounds error indicates that we hit max btl number
|
||||
* don't propagate the error to the caller */
|
||||
if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) {
|
||||
ret = OMPI_SUCCESS;
|
||||
}
|
||||
break;
|
||||
@ -2313,7 +2313,7 @@ btl_wv_component_init(int *num_btl_modules,
|
||||
/* If we get NOT_SUPPORTED, then no CPC was found for this
|
||||
port. But that's not a fatal error -- just keep going;
|
||||
let's see if we find any usable wv modules or not. */
|
||||
if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_SUPPORTED == ret) {
|
||||
continue;
|
||||
} else if (OMPI_SUCCESS != ret) {
|
||||
/* All others *are* fatal. Note that we already did a
|
||||
@ -2469,7 +2469,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep)
|
||||
error upward. */
|
||||
rc = mca_btl_wv_endpoint_post_send(ep, to_send_frag(frag));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc &&
|
||||
OMPI_ERR_RESOURCE_BUSY != OPAL_SOS_GET_ERROR_CODE(rc))) {
|
||||
OMPI_ERR_RESOURCE_BUSY != rc)) {
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
return rc;
|
||||
}
|
||||
@ -2497,7 +2497,7 @@ void mca_btl_wv_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
break;
|
||||
rc = mca_btl_wv_get((mca_btl_base_module_t *)wv_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2510,7 +2510,7 @@ void mca_btl_wv_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
||||
break;
|
||||
rc = mca_btl_wv_put((mca_btl_base_module_t *)wv_btl, ep,
|
||||
&to_base_frag(frag)->base);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
|
||||
@ -33,7 +33,6 @@
|
||||
|
||||
#include "opal_stdint.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
@ -617,7 +616,7 @@ int mca_btl_wv_endpoint_send(mca_btl_base_endpoint_t* ep,
|
||||
rc = mca_btl_wv_endpoint_post_send(ep, frag);
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))) {
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_RESOURCE_BUSY == rc)) {
|
||||
rc = OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -801,7 +800,7 @@ static int mca_btl_wv_endpoint_send_eager_rdma(
|
||||
));
|
||||
}
|
||||
rc = mca_btl_wv_endpoint_send(endpoint, frag);
|
||||
if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if (OMPI_SUCCESS == rc || OMPI_ERR_RESOURCE_BUSY == rc)
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,7 +31,6 @@
|
||||
#endif
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
|
||||
#include "btl_wv.h"
|
||||
@ -127,13 +128,13 @@ int ompi_btl_wv_ini_init(void)
|
||||
/* Note that NOT_FOUND and SUCCESS are not fatal errors
|
||||
and we keep going. Other errors are treated as
|
||||
fatal */
|
||||
if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) {
|
||||
if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) {
|
||||
break;
|
||||
}
|
||||
str = colon + 1;
|
||||
}
|
||||
/* Parse the last file if we didn't have a fatal error above */
|
||||
if (OMPI_ERR_NOT_FOUND != OPAL_SOS_GET_ERROR_CODE(ret) && OMPI_SUCCESS != ret) {
|
||||
if (OMPI_ERR_NOT_FOUND != ret && OMPI_SUCCESS != ret) {
|
||||
ret = parse_file(str);
|
||||
}
|
||||
|
||||
@ -144,7 +145,7 @@ int ompi_btl_wv_ini_init(void)
|
||||
/* Return SUCCESS unless we got a fatal error */
|
||||
|
||||
initialized = true;
|
||||
return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) ?
|
||||
return (OMPI_SUCCESS == ret || OMPI_ERR_NOT_FOUND == ret) ?
|
||||
OMPI_SUCCESS : ret;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -19,7 +21,6 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
/*
|
||||
* Array of all possible connection functions
|
||||
@ -183,7 +184,7 @@ int ompi_btl_wv_connect_base_init(void)
|
||||
opal_output(-1, "found available cpc (SUCCESS init): %s",
|
||||
all[i]->cbc_name);
|
||||
continue;
|
||||
} else if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
} else if (OMPI_ERR_NOT_SUPPORTED == rc) {
|
||||
continue;
|
||||
} else {
|
||||
return rc;
|
||||
@ -229,8 +230,8 @@ int ompi_btl_wv_connect_base_select_for_local_port(mca_btl_wv_module_t *btl)
|
||||
strcat(msg, available[i]->cbc_name);
|
||||
|
||||
rc = available[i]->cbc_query(btl, &cpcs[cpc_index]);
|
||||
if (OMPI_ERR_NOT_SUPPORTED == OPAL_SOS_GET_ERROR_CODE(rc) ||
|
||||
OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_NOT_SUPPORTED == rc ||
|
||||
OMPI_ERR_UNREACH == rc) {
|
||||
continue;
|
||||
} else if (OMPI_SUCCESS != rc) {
|
||||
free(cpcs);
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
@ -28,7 +28,6 @@
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -702,7 +702,6 @@ OBJ_CLASS_INSTANCE(ompi_crcp_bkmrk_pml_peer_ref_t,
|
||||
void ompi_crcp_bkmrk_pml_peer_ref_construct(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref) {
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
OBJ_CONSTRUCT(&peer_ref->send_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&peer_ref->isend_list, opal_list_t);
|
||||
@ -730,7 +729,6 @@ void ompi_crcp_bkmrk_pml_peer_ref_destruct( ompi_crcp_bkmrk_pml_peer_ref_t *peer
|
||||
|
||||
peer_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
peer_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(peer_ref->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
while( NULL != (item = opal_list_remove_first(&peer_ref->send_list)) ) {
|
||||
HOKE_TRAFFIC_MSG_REF_RETURN(item);
|
||||
@ -840,7 +838,6 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_construct(ompi_crcp_bkmrk_pml_traff
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
msg_ref->matched = INVALID_INT;
|
||||
msg_ref->done = INVALID_INT;
|
||||
@ -868,7 +865,6 @@ void ompi_crcp_bkmrk_pml_traffic_message_ref_destruct( ompi_crcp_bkmrk_pml_traff
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
msg_ref->matched = INVALID_INT;
|
||||
msg_ref->done = INVALID_INT;
|
||||
@ -902,7 +898,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_construct(ompi_crcp_bkmrk_pml_drain_m
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
msg_ref->done = INVALID_INT;
|
||||
msg_ref->active = INVALID_INT;
|
||||
@ -934,7 +929,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ref_destruct( ompi_crcp_bkmrk_pml_drain_m
|
||||
|
||||
msg_ref->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ref->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
msg_ref->done = INVALID_INT;
|
||||
msg_ref->active = INVALID_INT;
|
||||
@ -954,7 +948,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_construct(ompi_crcp_bkmrk_pml_dra
|
||||
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN);
|
||||
}
|
||||
|
||||
void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_drain_message_ack_ref_t *msg_ack_ref) {
|
||||
@ -962,7 +955,6 @@ void ompi_crcp_bkmrk_pml_drain_message_ack_ref_destruct( ompi_crcp_bkmrk_pml_dra
|
||||
|
||||
msg_ack_ref->peer.jobid = ORTE_JOBID_INVALID;
|
||||
msg_ack_ref->peer.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(msg_ack_ref->peer.epoch,ORTE_EPOCH_MIN);
|
||||
}
|
||||
|
||||
|
||||
@ -1034,7 +1026,6 @@ do { \
|
||||
\
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \
|
||||
\
|
||||
msg_ref->matched = 0; \
|
||||
msg_ref->done = 0; \
|
||||
@ -1063,7 +1054,6 @@ do { \
|
||||
\
|
||||
msg_ref->proc_name.jobid = p_jobid; \
|
||||
msg_ref->proc_name.vpid = p_vpid; \
|
||||
ORTE_EPOCH_SET(msg_ref->proc_name.epoch,orte_ess.proc_get_epoch(&(msg_ref->proc_name))); \
|
||||
}
|
||||
|
||||
|
||||
@ -1466,7 +1456,6 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
|
||||
|
||||
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
|
||||
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
|
||||
ORTE_EPOCH_SET(new_peer_ref->proc_name.epoch,procs[i]->proc_name.epoch);
|
||||
|
||||
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
|
||||
}
|
||||
@ -3375,7 +3364,6 @@ static int traffic_message_move(ompi_crcp_bkmrk_pml_traffic_message_ref_t *old_m
|
||||
if( NULL == from_peer_ref && NULL != to_peer_ref ) {
|
||||
(*new_msg_ref)->proc_name.jobid = to_peer_ref->proc_name.jobid;
|
||||
(*new_msg_ref)->proc_name.vpid = to_peer_ref->proc_name.vpid;
|
||||
ORTE_EPOCH_SET((*new_msg_ref)->proc_name.epoch,to_peer_ref->proc_name.epoch);
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
@ -5281,7 +5269,6 @@ static int send_bookmarks(int peer_idx)
|
||||
*/
|
||||
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name));
|
||||
|
||||
if( NULL == (peer_ref = find_peer(peer_name))) {
|
||||
opal_output(mca_crcp_bkmrk_component.super.output_handle,
|
||||
@ -5342,7 +5329,6 @@ static int recv_bookmarks(int peer_idx)
|
||||
|
||||
peer_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
peer_name.vpid = peer_idx;
|
||||
ORTE_EPOCH_SET(peer_name.epoch,orte_ess.proc_get_epoch(&peer_name));
|
||||
|
||||
if ( 0 > (ret = orte_rml.recv_buffer_nb(&peer_name,
|
||||
OMPI_CRCP_COORD_BOOKMARK_TAG,
|
||||
@ -5524,7 +5510,6 @@ static int send_msg_details(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
|
||||
HOKE_DRAIN_ACK_MSG_REF_ALLOC(d_msg_ack, ret);
|
||||
d_msg_ack->peer.jobid = peer_ref->proc_name.jobid;
|
||||
d_msg_ack->peer.vpid = peer_ref->proc_name.vpid;
|
||||
ORTE_EPOCH_SET(d_msg_ack->peer.epoch,peer_ref->proc_name.epoch);
|
||||
|
||||
d_msg_ack->complete = false;
|
||||
opal_list_append(&drained_msg_ack_list, &(d_msg_ack->super));
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -18,7 +20,6 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/mca/base/mca_base_component_repository.h"
|
||||
@ -41,7 +42,7 @@ int ompi_dpm_base_select(void)
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component))) {
|
||||
/* it is okay not to find any executable components */
|
||||
if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_FOUND == ret) {
|
||||
ret = OPAL_SUCCESS;
|
||||
}
|
||||
goto cleanup;
|
||||
|
@ -12,6 +12,8 @@
|
||||
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -28,7 +30,6 @@
|
||||
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_getcwd.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -65,7 +66,6 @@ static orte_process_name_t carport;
|
||||
static void recv_cb(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata);
|
||||
static void process_cb(int fd, short event, void *data);
|
||||
|
||||
/* API functions */
|
||||
static int init(void);
|
||||
@ -104,6 +104,13 @@ ompi_dpm_base_module_t ompi_dpm_orte_module = {
|
||||
finalize
|
||||
};
|
||||
|
||||
static void rml_cbfunc(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buffer);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Init the module
|
||||
@ -136,7 +143,11 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
int i,j, new_proc_len;
|
||||
ompi_group_t *new_group_pointer;
|
||||
|
||||
|
||||
orte_grpcomm_coll_id_t id;
|
||||
orte_grpcomm_collective_t modex;
|
||||
opal_list_item_t *item;
|
||||
orte_namelist_t *nm;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept with port %s %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -179,6 +190,65 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
opal_progress_event_users_increment();
|
||||
|
||||
if ( rank == root ) {
|
||||
if (send_first) {
|
||||
/* Get a collective id for the modex we need later on - we
|
||||
* have to get a globally unique id for this purpose as
|
||||
* multiple threads can do simultaneous connect/accept,
|
||||
* and the same processes can be engaged in multiple
|
||||
* connect/accepts at the same time. Only one side
|
||||
* needs to do this, so have it be send_first
|
||||
*/
|
||||
nbuf = OBJ_NEW(opal_buffer_t);
|
||||
if (NULL == nbuf) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
/* send the request - doesn't have to include any data */
|
||||
rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, nbuf, ORTE_RML_TAG_COLL_ID_REQ, 0, rml_cbfunc, NULL);
|
||||
/* wait for the id */
|
||||
recv_completed = false;
|
||||
cabuf = OBJ_NEW(opal_buffer_t);
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID,
|
||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||
/* wait for response */
|
||||
while (!recv_completed) {
|
||||
opal_progress();
|
||||
}
|
||||
i=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cabuf);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
OBJ_RELEASE(cabuf);
|
||||
/* send it to my peer on the other side */
|
||||
nbuf = OBJ_NEW(opal_buffer_t);
|
||||
if (NULL == nbuf) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(nbuf, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
rc = orte_rml.send_buffer_nb(&port, nbuf, tag, 0, rml_cbfunc, NULL);
|
||||
} else {
|
||||
/* wait to recv the collective id */
|
||||
recv_completed = false;
|
||||
cabuf = OBJ_NEW(opal_buffer_t);
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||
/* wait for response */
|
||||
while (!recv_completed) {
|
||||
opal_progress();
|
||||
}
|
||||
i=1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(cabuf, &id, &i, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
OBJ_RELEASE(cabuf);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
OBJ_RELEASE(cabuf);
|
||||
}
|
||||
|
||||
/* Generate the message buffer containing the number of processes and the list of
|
||||
participating processes */
|
||||
nbuf = OBJ_NEW(opal_buffer_t);
|
||||
@ -186,6 +256,12 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* pass the collective id so we can all use it */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(nbuf, &id, 1, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.pack(nbuf, &size, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
@ -244,7 +320,9 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||
/* wait for response */
|
||||
ORTE_PROGRESSED_WAIT(recv_completed, 0, 1);
|
||||
while (!recv_completed) {
|
||||
opal_progress();
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept got data from %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -259,7 +337,9 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, tag,
|
||||
ORTE_RML_NON_PERSISTENT, recv_cb, NULL);
|
||||
/* wait for response */
|
||||
ORTE_PROGRESSED_WAIT(recv_completed, 0, 1);
|
||||
while (!recv_completed) {
|
||||
opal_progress();
|
||||
}
|
||||
/* now send our info */
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept sending info to %s",
|
||||
@ -324,6 +404,13 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* unload the collective id */
|
||||
num_vals = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(nrbuf, &id, &num_vals, ORTE_GRPCOMM_COLL_ID_T))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
num_vals = 1;
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(nrbuf, &rsize, &num_vals, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -360,7 +447,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
for (i = 0 ; i < rsize ; ++i) {
|
||||
name = OBJ_NEW(orte_namelist_t);
|
||||
name->name = rprocs[i]->proc_name;
|
||||
opal_list_append(&all_procs, &name->item);
|
||||
opal_list_append(&all_procs, &name->super);
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept send first adding %s to allgather list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -369,7 +456,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
for (i = 0 ; i < group->grp_proc_count ; ++i) {
|
||||
name = OBJ_NEW(orte_namelist_t);
|
||||
name->name = ompi_group_peer_lookup(group, i)->proc_name;
|
||||
opal_list_append(&all_procs, &name->item);
|
||||
opal_list_append(&all_procs, &name->super);
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept send first adding %s to allgather list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -380,7 +467,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
for (i = 0 ; i < group->grp_proc_count ; ++i) {
|
||||
name = OBJ_NEW(orte_namelist_t);
|
||||
name->name = ompi_group_peer_lookup(group, i)->proc_name;
|
||||
opal_list_append(&all_procs, &name->item);
|
||||
opal_list_append(&all_procs, &name->super);
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept recv first adding %s to allgather list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -389,7 +476,7 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
for (i = 0 ; i < rsize ; ++i) {
|
||||
name = OBJ_NEW(orte_namelist_t);
|
||||
name->name = rprocs[i]->proc_name;
|
||||
opal_list_append(&all_procs, &name->item);
|
||||
opal_list_append(&all_procs, &name->super);
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept recv first adding %s to allgather list",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -402,10 +489,28 @@ static int connect_accept ( ompi_communicator_t *comm, int root,
|
||||
"%s dpm:orte:connect_accept executing modex",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_grpcomm.modex(&all_procs))) {
|
||||
/* setup the modex */
|
||||
OBJ_CONSTRUCT(&modex, orte_grpcomm_collective_t);
|
||||
modex.id = id;
|
||||
/* copy across the list of participants */
|
||||
for (item = opal_list_get_first(&all_procs);
|
||||
item != opal_list_get_end(&all_procs);
|
||||
item = opal_list_get_next(item)) {
|
||||
nm = (orte_namelist_t*)item;
|
||||
name = OBJ_NEW(orte_namelist_t);
|
||||
name->name = nm->name;
|
||||
opal_list_append(&modex.participants, &name->super);
|
||||
}
|
||||
|
||||
/* perform it */
|
||||
if (OMPI_SUCCESS != (rc = orte_grpcomm.modex(&modex))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto exit;
|
||||
}
|
||||
while (modex.active) {
|
||||
opal_progress();
|
||||
}
|
||||
OBJ_DESTRUCT(&modex);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((3, ompi_dpm_base_output,
|
||||
"%s dpm:orte:connect_accept modex complete",
|
||||
@ -1521,33 +1626,12 @@ static void recv_cb(int status, orte_process_name_t* sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag, void *cbdata)
|
||||
{
|
||||
/* don't process this right away - we need to get out of the recv before
|
||||
* we process the message as it may ask us to do something that involves
|
||||
* more messaging! Instead, setup an event so that the message gets processed
|
||||
* as soon as we leave the recv.
|
||||
*
|
||||
* The macro makes a copy of the buffer, which we release when processed - the incoming
|
||||
* buffer, however, is NOT released here, although its payload IS transferred
|
||||
* to the message buffer for later processing
|
||||
*/
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, process_cb);
|
||||
|
||||
|
||||
}
|
||||
static void process_cb(int fd, short event, void *data)
|
||||
{
|
||||
orte_message_event_t *mev = (orte_message_event_t*)data;
|
||||
|
||||
/* copy the payload to the global buffer */
|
||||
opal_dss.copy_payload(cabuf, mev->buffer);
|
||||
opal_dss.copy_payload(cabuf, buffer);
|
||||
|
||||
/* flag the identity of the remote proc */
|
||||
carport.jobid = mev->sender.jobid;
|
||||
carport.vpid = mev->sender.vpid;
|
||||
ORTE_EPOCH_SET(carport.epoch,mev->sender.epoch);
|
||||
|
||||
/* release the event */
|
||||
OBJ_RELEASE(mev);
|
||||
carport.jobid = sender->jobid;
|
||||
carport.vpid = sender->vpid;
|
||||
|
||||
/* flag complete */
|
||||
recv_completed = true;
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -24,7 +26,6 @@
|
||||
#include "mpi.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
|
||||
@ -122,7 +123,7 @@ ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win)
|
||||
|
||||
ret = ompi_osc_pt2pt_sendreq_send(module, req);
|
||||
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
opal_output_verbose(5, ompi_osc_base_output,
|
||||
"complete: failure in starting sendreq (%d). Will try later.",
|
||||
ret);
|
||||
@ -267,7 +268,7 @@ ompi_osc_pt2pt_module_complete(ompi_win_t *win)
|
||||
|
||||
ret = ompi_osc_pt2pt_sendreq_send(module, req);
|
||||
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
opal_output_verbose(5, ompi_osc_base_output,
|
||||
"complete: failure in starting sendreq (%d). Will try later.",
|
||||
ret);
|
||||
@ -490,7 +491,7 @@ ompi_osc_pt2pt_module_unlock(int target,
|
||||
|
||||
ret = ompi_osc_pt2pt_sendreq_send(module, req);
|
||||
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret) ) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
opal_output_verbose(5, ompi_osc_base_output,
|
||||
"complete: failure in starting sendreq (%d). Will try later.",
|
||||
ret);
|
||||
|
@ -7,7 +7,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -137,7 +137,7 @@ ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count,
|
||||
|
||||
ret = ompi_osc_rdma_sendreq_send(module, sendreq);
|
||||
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
OPAL_THREAD_LOCK(&module->m_lock);
|
||||
sendreq->req_module->m_num_pending_out -= 1;
|
||||
opal_list_append(&(module->m_pending_sendreqs),
|
||||
@ -209,7 +209,7 @@ ompi_osc_rdma_module_get(void *origin_addr,
|
||||
|
||||
ret = ompi_osc_rdma_sendreq_send(module, sendreq);
|
||||
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
OPAL_THREAD_LOCK(&module->m_lock);
|
||||
sendreq->req_module->m_num_pending_out -= 1;
|
||||
opal_list_append(&(module->m_pending_sendreqs),
|
||||
@ -278,7 +278,7 @@ ompi_osc_rdma_module_put(void *origin_addr, int origin_count,
|
||||
|
||||
ret = ompi_osc_rdma_sendreq_send(module, sendreq);
|
||||
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
OPAL_THREAD_LOCK(&module->m_lock);
|
||||
sendreq->req_module->m_num_pending_out -= 1;
|
||||
opal_list_append(&(module->m_pending_sendreqs),
|
||||
|
@ -7,7 +7,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -140,7 +140,7 @@ ompi_osc_rdma_module_fence(int assert, ompi_win_t *win)
|
||||
opal_list_remove_first(&(module->m_copy_pending_sendreqs));
|
||||
|
||||
ret = ompi_osc_rdma_sendreq_send(module, req);
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
opal_list_append(&(module->m_copy_pending_sendreqs), (opal_list_item_t*)req);
|
||||
} else if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
@ -355,7 +355,7 @@ ompi_osc_rdma_module_complete(ompi_win_t *win)
|
||||
(ompi_osc_rdma_sendreq_t*) item;
|
||||
|
||||
ret = ompi_osc_rdma_sendreq_send(module, req);
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
opal_list_append(&(module->m_copy_pending_sendreqs), item);
|
||||
break;
|
||||
} else if (OMPI_SUCCESS != ret) {
|
||||
@ -589,7 +589,7 @@ ompi_osc_rdma_module_unlock(int target,
|
||||
(ompi_osc_rdma_sendreq_t*) item;
|
||||
|
||||
ret = ompi_osc_rdma_sendreq_send(module, req);
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret) {
|
||||
opal_list_append(&(module->m_copy_pending_sendreqs), item);
|
||||
break;
|
||||
} else if (OMPI_SUCCESS != ret) {
|
||||
|
@ -10,6 +10,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -25,7 +27,6 @@
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/mca/mca.h"
|
||||
@ -354,7 +355,7 @@ mca_pml_base_pml_check_selected(const char *my_pml,
|
||||
(void**) &remote_pml, &size);
|
||||
|
||||
/* if modex isn't implemented, then just assume all is well... */
|
||||
if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
|
||||
opal_output_verbose( 10, mca_pml_base_output,
|
||||
"check:select: modex not implemented");
|
||||
return OMPI_SUCCESS;
|
||||
|
@ -14,6 +14,8 @@
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -587,7 +589,7 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_ack.hdr_dst_req.pval,
|
||||
pckt->hdr.hdr_ack.hdr_send_offset,
|
||||
pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||||
opal_list_append(&mca_pml_bfo.pckt_pending,
|
||||
(opal_list_item_t*)pckt);
|
||||
@ -608,7 +610,7 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
#else /* PML_BFO */
|
||||
pckt->hdr.hdr_fin.hdr_fail);
|
||||
#endif /* PML_BFO */
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@ -640,7 +642,7 @@ void mca_pml_bfo_process_pending_rdma(void)
|
||||
} else {
|
||||
rc = mca_pml_bfo_recv_request_get_frag(frag);
|
||||
}
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -839,7 +841,7 @@ void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
|
||||
/* select a btl */
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||||
if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)))
|
||||
if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc))
|
||||
return;
|
||||
}
|
||||
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
|
||||
@ -897,7 +899,7 @@ void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
|
||||
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||||
if (OMPI_SUCCESS == rc) {
|
||||
return;
|
||||
} else if (OMPI_ERR_OUT_OF_RESOURCE == (OPAL_SOS_GET_ERROR_CODE(rc))) {
|
||||
} else if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
opal_output_verbose(30, mca_pml_bfo_output,
|
||||
"Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d",
|
||||
(int)sendreq->req_endpoint->btl_eager.arr_size);
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -56,7 +58,7 @@ void mca_pml_bfo_recv_request_process_pending(void)
|
||||
break;
|
||||
recvreq->req_pending = false;
|
||||
rc = mca_pml_bfo_recv_request_schedule_exclusive(recvreq, NULL);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -433,7 +435,7 @@ int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag )
|
||||
/* queue up get request */
|
||||
rc = mca_bml_base_get(bml_btl,descriptor);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
mca_bml_base_free(bml_btl, descriptor);
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||||
opal_list_append(&mca_pml_bfo.rdma_pending,
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -369,7 +371,7 @@ static inline int mca_pml_bfo_recv_request_schedule_exclusive(
|
||||
|
||||
do {
|
||||
rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_recv_request(req));
|
||||
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -57,7 +59,7 @@ void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
switch(pending_type) {
|
||||
case MCA_PML_BFO_SEND_PENDING_SCHEDULE:
|
||||
rc = mca_pml_bfo_send_request_schedule_exclusive(sendreq);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@ -70,7 +72,7 @@ void mca_pml_bfo_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
MCA_PML_BFO_SEND_PENDING_START, true);
|
||||
} else {
|
||||
rc = mca_pml_bfo_send_request_start_btl(sendreq, send_dst);
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
/* No more resources on this btl so prepend to the pending
|
||||
* list to minimize reordering and give up for now. */
|
||||
add_request_to_send_pending(sendreq,
|
||||
@ -618,8 +620,7 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq,
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_RESOURCE_BUSY == rc) {
|
||||
/* No more resources. Allow the upper level to queue the send */
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
@ -1311,7 +1312,7 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag )
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
frag->rdma_length = save_size;
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||||
opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -293,7 +295,7 @@ mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq)
|
||||
int rc;
|
||||
do {
|
||||
rc = mca_pml_bfo_send_request_schedule_once(sendreq);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_send_request(sendreq));
|
||||
|
||||
@ -458,7 +460,7 @@ mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq )
|
||||
/* select a btl */
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) )
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
|
||||
return rc;
|
||||
}
|
||||
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
@ -586,7 +586,7 @@ void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_ack.hdr_dst_req.pval,
|
||||
pckt->hdr.hdr_ack.hdr_send_offset,
|
||||
pckt->hdr.hdr_common.hdr_flags & MCA_PML_CSUM_HDR_FLAGS_NORDMA);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_csum.lock);
|
||||
opal_list_append(&mca_pml_csum.pckt_pending,
|
||||
(opal_list_item_t*)pckt);
|
||||
@ -599,7 +599,7 @@ void mca_pml_csum_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_fin.hdr_des,
|
||||
pckt->order,
|
||||
pckt->hdr.hdr_fin.hdr_fail);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@ -631,7 +631,7 @@ void mca_pml_csum_process_pending_rdma(void)
|
||||
} else {
|
||||
rc = mca_pml_csum_recv_request_get_frag(frag);
|
||||
}
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -83,7 +83,7 @@ void mca_pml_csum_recv_request_process_pending(void)
|
||||
break;
|
||||
recvreq->req_pending = false;
|
||||
rc = mca_pml_csum_recv_request_schedule_exclusive(recvreq, NULL);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -425,7 +425,7 @@ int mca_pml_csum_recv_request_get_frag( mca_pml_csum_rdma_frag_t* frag )
|
||||
/* queue up get request */
|
||||
rc = mca_bml_base_get(bml_btl,descriptor);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
mca_bml_base_free(bml_btl, descriptor);
|
||||
OPAL_THREAD_LOCK(&mca_pml_csum.lock);
|
||||
opal_list_append(&mca_pml_csum.rdma_pending,
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -346,7 +348,7 @@ static inline int mca_pml_csum_recv_request_schedule_exclusive(
|
||||
|
||||
do {
|
||||
rc = mca_pml_csum_recv_request_schedule_once(req, start_bml_btl);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_recv_request(req));
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -66,7 +66,7 @@ void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
switch(pending_type) {
|
||||
case MCA_PML_CSUM_SEND_PENDING_SCHEDULE:
|
||||
rc = mca_pml_csum_send_request_schedule_exclusive(sendreq);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@ -79,7 +79,7 @@ void mca_pml_csum_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
MCA_PML_CSUM_SEND_PENDING_START, true);
|
||||
} else {
|
||||
rc = mca_pml_csum_send_request_start_btl(sendreq, send_dst);
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
/* No more resources on this btl so prepend to the pending
|
||||
* list to minimize reordering and give up for now. */
|
||||
add_request_to_send_pending(sendreq,
|
||||
@ -590,7 +590,7 @@ int mca_pml_csum_send_request_start_copy( mca_pml_csum_send_request_t* sendreq,
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
switch(OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
switch(rc) {
|
||||
case OMPI_ERR_RESOURCE_BUSY:
|
||||
/* No more resources. Allow the upper level to queue the send */
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -1256,7 +1256,7 @@ int mca_pml_csum_send_request_put_frag( mca_pml_csum_rdma_frag_t* frag )
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
frag->rdma_length = save_size;
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_csum.lock);
|
||||
opal_list_append(&mca_pml_csum.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_csum.lock);
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -287,7 +287,7 @@ mca_pml_csum_send_request_schedule_exclusive(mca_pml_csum_send_request_t* sendre
|
||||
int rc;
|
||||
do {
|
||||
rc = mca_pml_csum_send_request_schedule_once(sendreq);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_send_request(sendreq));
|
||||
|
||||
@ -434,7 +434,7 @@ mca_pml_csum_send_request_start( mca_pml_csum_send_request_t* sendreq )
|
||||
/* select a btl */
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
rc = mca_pml_csum_send_request_start_btl(sendreq, bml_btl);
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) )
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
|
||||
return rc;
|
||||
}
|
||||
add_request_to_send_pending(sendreq, MCA_PML_CSUM_SEND_PENDING_START, true);
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Mellanox Technologies.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -102,7 +104,7 @@ static void mca_pml_dr_error_completion(
|
||||
mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata;
|
||||
mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
|
||||
|
||||
switch(OPAL_SOS_GET_ERROR_CODE(status)) {
|
||||
switch(status) {
|
||||
case OMPI_ERR_UNREACH:
|
||||
/**
|
||||
* peer is no longer reachable through this btl
|
||||
|
@ -14,6 +14,8 @@
|
||||
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -558,7 +560,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_ack.hdr_dst_req.pval,
|
||||
pckt->hdr.hdr_ack.hdr_send_offset,
|
||||
pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.pckt_pending,
|
||||
(opal_list_item_t*)pckt);
|
||||
@ -571,7 +573,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
pckt->hdr.hdr_fin.hdr_des,
|
||||
pckt->order,
|
||||
pckt->hdr.hdr_fin.hdr_fail);
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
|
||||
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@ -603,7 +605,7 @@ void mca_pml_ob1_process_pending_rdma(void)
|
||||
} else {
|
||||
rc = mca_pml_ob1_recv_request_get_frag(frag);
|
||||
}
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,8 @@
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -54,7 +56,7 @@ void mca_pml_ob1_recv_request_process_pending(void)
|
||||
break;
|
||||
recvreq->req_pending = false;
|
||||
rc = mca_pml_ob1_recv_request_schedule_exclusive(recvreq, NULL);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc))
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc)
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -391,7 +393,7 @@ int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag )
|
||||
/* queue up get request */
|
||||
rc = mca_bml_base_get(bml_btl,descriptor);
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
mca_bml_base_free(bml_btl, descriptor);
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending,
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -347,7 +349,7 @@ static inline int mca_pml_ob1_recv_request_schedule_exclusive(
|
||||
|
||||
do {
|
||||
rc = mca_pml_ob1_recv_request_schedule_once(req, start_bml_btl);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_recv_request(req));
|
||||
|
||||
|
@ -58,7 +58,7 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
switch(pending_type) {
|
||||
case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
|
||||
rc = mca_pml_ob1_send_request_schedule_exclusive(sendreq);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@ -71,7 +71,7 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
|
||||
MCA_PML_OB1_SEND_PENDING_START, true);
|
||||
} else {
|
||||
rc = mca_pml_ob1_send_request_start_btl(sendreq, send_dst);
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
/* No more resources on this btl so prepend to the pending
|
||||
* list to minimize reordering and give up for now. */
|
||||
add_request_to_send_pending(sendreq,
|
||||
@ -550,7 +550,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
if (OMPI_ERR_RESOURCE_BUSY == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_RESOURCE_BUSY == rc) {
|
||||
/* No more resources. Allow the upper level to queue the send */
|
||||
rc = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
@ -1192,7 +1192,7 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
|
||||
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
|
||||
mca_bml_base_free(bml_btl, des);
|
||||
frag->rdma_length = save_size;
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -285,7 +287,7 @@ mca_pml_ob1_send_request_schedule_exclusive(mca_pml_ob1_send_request_t* sendreq)
|
||||
int rc;
|
||||
do {
|
||||
rc = mca_pml_ob1_send_request_schedule_once(sendreq);
|
||||
if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
if(rc == OMPI_ERR_OUT_OF_RESOURCE)
|
||||
break;
|
||||
} while(!unlock_send_request(sendreq));
|
||||
|
||||
@ -444,7 +446,7 @@ mca_pml_ob1_send_request_start( mca_pml_ob1_send_request_t* sendreq )
|
||||
/* select a btl */
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) )
|
||||
if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
|
||||
return rc;
|
||||
}
|
||||
add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true);
|
||||
|
@ -7,6 +7,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -18,7 +20,6 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/mca/base/mca_base_component_repository.h"
|
||||
@ -41,7 +42,7 @@ int ompi_pubsub_base_select(void)
|
||||
(mca_base_module_t **) &best_module,
|
||||
(mca_base_component_t **) &best_component))) {
|
||||
/* it is okay not to find any executable components */
|
||||
if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_NOT_FOUND == ret) {
|
||||
ret = OPAL_SUCCESS;
|
||||
}
|
||||
goto cleanup;
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,7 +28,6 @@
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/dss/dss.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -121,7 +123,7 @@ int mca_rcache_rb_insert (
|
||||
if(flags & MCA_MPOOL_FLAGS_CACHE) {
|
||||
rc = mca_rcache_rb_mru_insert( (mca_rcache_rb_module_t*) rcache, reg);
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
if(OMPI_ERR_TEMP_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if(OMPI_ERR_TEMP_OUT_OF_RESOURCE == rc) {
|
||||
/*
|
||||
* If the registration is too big for the rcache,
|
||||
* don't cache it and reset the flags so the upper level
|
||||
|
@ -1,6 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -10,7 +12,6 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "vprotocol_pessimist_eventlog.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/base/rml_contact.h"
|
||||
|
@ -9,6 +9,8 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -25,8 +27,6 @@
|
||||
#include "ompi/info/info.h"
|
||||
#include "ompi/mca/pubsub/pubsub.h"
|
||||
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#if OPAL_HAVE_WEAK_SYMBOLS && OMPI_PROFILING_DEFINES
|
||||
#pragma weak MPI_Unpublish_name = PMPI_Unpublish_name
|
||||
#endif
|
||||
@ -68,13 +68,13 @@ int MPI_Unpublish_name(char *service_name, MPI_Info info,
|
||||
*/
|
||||
rc = ompi_pubsub.unpublish(service_name, info);
|
||||
if ( OMPI_SUCCESS != rc ) {
|
||||
if (OMPI_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_NOT_FOUND == rc) {
|
||||
/* service couldn't be found */
|
||||
OPAL_CR_EXIT_LIBRARY();
|
||||
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_SERVICE,
|
||||
FUNC_NAME);
|
||||
}
|
||||
if (OMPI_ERR_PERM == OPAL_SOS_GET_ERROR_CODE(rc)) {
|
||||
if (OMPI_ERR_PERM == rc) {
|
||||
/* this process didn't own the specified service */
|
||||
OPAL_CR_EXIT_LIBRARY();
|
||||
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ACCESS,
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,7 +28,6 @@
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/arch.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -108,7 +109,6 @@ int ompi_proc_init(void)
|
||||
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
proc->proc_name.vpid = i;
|
||||
ORTE_EPOCH_SET(proc->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
|
||||
if (i == ORTE_PROC_MY_NAME->vpid) {
|
||||
ompi_proc_local_proc = proc;
|
||||
@ -170,7 +170,7 @@ int ompi_proc_complete_init(void)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
} else if (OMPI_ERR_NOT_IMPLEMENTED == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
} else if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
|
||||
proc->proc_arch = opal_local_arch;
|
||||
} else {
|
||||
errcode = ret;
|
||||
@ -362,7 +362,6 @@ int ompi_proc_refresh(void) {
|
||||
|
||||
/* Does not change: proc->proc_name.vpid */
|
||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
ORTE_EPOCH_SET(proc->proc_name.epoch,orte_ess.proc_get_epoch(&proc->proc_name));
|
||||
|
||||
/* Make sure to clear the local flag before we set it below */
|
||||
proc->proc_flags = 0;
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
@ -87,6 +87,7 @@
|
||||
#endif
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
|
||||
int ompi_mpi_finalize(void)
|
||||
{
|
||||
int ret, value;
|
||||
@ -94,6 +95,7 @@ int ompi_mpi_finalize(void)
|
||||
opal_list_item_t *item;
|
||||
struct timeval ompistart, ompistop;
|
||||
bool timing = false;
|
||||
orte_grpcomm_collective_t *coll;
|
||||
|
||||
/* Be a bit social if an erroneous program calls MPI_FINALIZE in
|
||||
two different threads, otherwise we may deadlock in
|
||||
@ -229,11 +231,19 @@ int ompi_mpi_finalize(void)
|
||||
MPI barrier doesn't ensure that all messages have been transmitted
|
||||
before exiting, so the possibility of a stranded message exists.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.peer_fini_barrier;
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* wait for barrier to complete */
|
||||
while (coll->active) {
|
||||
opal_progress(); /* block in progress pending events */
|
||||
}
|
||||
OBJ_RELEASE(coll);
|
||||
|
||||
/* check for timing request - get stop time and report elapsed
|
||||
time if so */
|
||||
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||
|
@ -101,6 +101,7 @@
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
/* This is required for the boundaries of the hash tables used to store
|
||||
* the F90 types returned by the MPI_Type_create_f90_XXX functions.
|
||||
@ -290,6 +291,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
struct timeval ompistart, ompistop;
|
||||
char *event_val = NULL;
|
||||
bool orte_setup = false;
|
||||
orte_grpcomm_collective_t *coll;
|
||||
|
||||
/* bitflag of the thread level support provided. To be used
|
||||
* for the modex in order to work in heterogeneous environments. */
|
||||
@ -547,10 +549,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
/* exchange connection info - this function also acts as a barrier
|
||||
* as it will not return until the exchange is complete
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.peer_modex;
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(coll))) {
|
||||
error = "orte_grpcomm_modex failed";
|
||||
goto error;
|
||||
}
|
||||
/* wait for modex to complete - this may be moved anywhere in mpi_init
|
||||
* so long as it occurs prior to calling a function that needs
|
||||
* the modex info!
|
||||
*/
|
||||
while (coll->active) {
|
||||
opal_progress(); /* block in progress pending events */
|
||||
}
|
||||
OBJ_RELEASE(coll);
|
||||
|
||||
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||
gettimeofday(&ompistop, NULL);
|
||||
@ -897,7 +909,7 @@ MOVEON:
|
||||
/* If we got "unreachable", then print a specific error message.
|
||||
Otherwise, if we got some other failure, fall through to print
|
||||
a generic message. */
|
||||
if (OMPI_ERR_UNREACH == OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
if (OMPI_ERR_UNREACH == ret) {
|
||||
orte_show_help("help-mpi-runtime",
|
||||
"mpi_init:startup:pml-add-procs-fail", true);
|
||||
error = NULL;
|
||||
@ -934,11 +946,18 @@ MOVEON:
|
||||
}
|
||||
|
||||
/* wait for everyone to reach this point */
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.peer_init_barrier;
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
|
||||
error = "orte_grpcomm_barrier failed";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* wait for barrier to complete */
|
||||
while (coll->active) {
|
||||
opal_progress(); /* block in progress pending events */
|
||||
}
|
||||
OBJ_RELEASE(coll);
|
||||
|
||||
/* check for timing request - get stop time and report elapsed
|
||||
time if so, then start the clock again */
|
||||
if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -46,7 +46,6 @@
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/util/cmd_line.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/daemon_init.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
@ -287,7 +286,9 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
/* wait to hear we are done */
|
||||
opal_event_dispatch(opal_event_base);
|
||||
while (orte_event_base_active) {
|
||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||
}
|
||||
|
||||
/* should never get here, but if we do... */
|
||||
|
||||
|
@ -101,6 +101,8 @@
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/grpcomm/base/base.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -396,6 +398,14 @@ void ompi_info_open_components(void)
|
||||
*/
|
||||
orte_process_info.proc_type = ORTE_PROC_HNP;
|
||||
|
||||
if (ORTE_SUCCESS != orte_state_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
map = OBJ_NEW(ompi_info_component_map_t);
|
||||
map->type = strdup("state");
|
||||
map->components = &orte_state_base_components_available;
|
||||
opal_pointer_array_add(&component_map, map);
|
||||
|
||||
if (ORTE_SUCCESS != orte_errmgr_base_open()) {
|
||||
goto error;
|
||||
}
|
||||
@ -789,7 +799,8 @@ void ompi_info_close_components()
|
||||
|
||||
#endif
|
||||
(void) orte_errmgr_base_close();
|
||||
|
||||
(void) orte_state_base_close();
|
||||
|
||||
(void) opal_backtrace_base_close();
|
||||
(void) opal_memory_base_close();
|
||||
(void) opal_memchecker_base_close();
|
||||
|
@ -268,6 +268,7 @@ int main(int argc, char *argv[])
|
||||
opal_pointer_array_add(&mca_types, "filem");
|
||||
#endif
|
||||
/* these are always included */
|
||||
opal_pointer_array_add(&mca_types, "state");
|
||||
opal_pointer_array_add(&mca_types, "errmgr");
|
||||
opal_pointer_array_add(&mca_types, "ess");
|
||||
opal_pointer_array_add(&mca_types, "grpcomm");
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -27,7 +29,6 @@
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_component_repository.h"
|
||||
@ -392,7 +393,7 @@ static int open_components(const char *type_name, int output_id,
|
||||
"mca: base: components_open: "
|
||||
"component %s register function successful",
|
||||
component->mca_component_name);
|
||||
} else if (OPAL_ERR_NOT_AVAILABLE != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
} else if (OPAL_ERR_NOT_AVAILABLE != ret) {
|
||||
/* If the component returns OPAL_ERR_NOT_AVAILABLE,
|
||||
it's a cue to "silently ignore me" -- it's not a
|
||||
failure, it's just a way for the component to say
|
||||
@ -432,7 +433,7 @@ static int open_components(const char *type_name, int output_id,
|
||||
"mca: base: components_open: "
|
||||
"component %s open function successful",
|
||||
component->mca_component_name);
|
||||
} else if (OPAL_ERR_NOT_AVAILABLE != OPAL_SOS_GET_ERROR_CODE(ret)) {
|
||||
} else if (OPAL_ERR_NOT_AVAILABLE != ret) {
|
||||
/* If the component returns OPAL_ERR_NOT_AVAILABLE,
|
||||
it's a cue to "silently ignore me" -- it's not a
|
||||
failure, it's just a way for the component to say
|
||||
|
@ -2,6 +2,8 @@
|
||||
* Copyright (c) 2004-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -18,7 +20,6 @@
|
||||
#include "opal/mca/compress/compress.h"
|
||||
#include "opal/mca/compress/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/mca/compress/base/static-components.h"
|
||||
|
||||
@ -84,7 +85,7 @@ int opal_compress_base_open(void)
|
||||
mca_compress_base_static_components,
|
||||
&opal_compress_base_components_available,
|
||||
true)) ) {
|
||||
if( OPAL_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret) &&
|
||||
if( OPAL_ERR_NOT_FOUND == ret &&
|
||||
NULL != str_value &&
|
||||
0 == strncmp(str_value, "none", strlen("none")) ) {
|
||||
exit_status = OPAL_SUCCESS;
|
||||
|
@ -8,6 +8,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -28,7 +30,6 @@
|
||||
#include "opal/mca/crs/crs.h"
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#include "opal/mca/crs/base/static-components.h"
|
||||
|
||||
@ -95,7 +96,7 @@ int opal_crs_base_open(void)
|
||||
mca_crs_base_static_components,
|
||||
&opal_crs_base_components_available,
|
||||
true)) ) {
|
||||
if( OPAL_ERR_NOT_FOUND == OPAL_SOS_GET_ERROR_CODE(ret) &&
|
||||
if( OPAL_ERR_NOT_FOUND == ret &&
|
||||
NULL != str_value &&
|
||||
0 == strncmp(str_value, "none", strlen("none")) ) {
|
||||
exit_status = OPAL_SUCCESS;
|
||||
|
@ -1,5 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -12,6 +14,8 @@
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
/*
|
||||
@ -95,7 +99,6 @@ OPAL_DECLSPEC int opal_event_base_close(void);
|
||||
OPAL_DECLSPEC extern int opal_event_base_output;
|
||||
OPAL_DECLSPEC extern opal_list_t opal_event_components;
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OPAL_BASE_EVENT_H */
|
||||
|
@ -21,9 +21,6 @@ int opal_event_base_close(void)
|
||||
|
||||
opal_event_base_inited--;
|
||||
|
||||
/* release the event base */
|
||||
opal_event_base_finalize(opal_event_base);
|
||||
|
||||
/* no need to close the component as it was statically opened */
|
||||
|
||||
/* for support of tools such as ompi_info */
|
||||
|
@ -79,7 +79,12 @@ int opal_event_base_open(void)
|
||||
|
||||
/* get our event base */
|
||||
if (NULL == (opal_event_base = opal_event_base_create())) {
|
||||
rc = OPAL_ERROR;
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* set the number of priorities */
|
||||
if (0 < OPAL_EVENT_NUM_PRI) {
|
||||
opal_event_base_priority_init(opal_event_base, OPAL_EVENT_NUM_PRI);
|
||||
}
|
||||
|
||||
return rc;
|
||||
|
@ -25,6 +25,8 @@
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
|
||||
@ -38,6 +40,17 @@ typedef unsigned char u_char;
|
||||
typedef unsigned short u_short;
|
||||
#endif
|
||||
|
||||
/* set the number of event priority levels */
|
||||
#define OPAL_EVENT_NUM_PRI 8
|
||||
|
||||
#define OPAL_EV_ERROR_PRI 0
|
||||
#define OPAL_EV_MSG_HI_PRI 1
|
||||
#define OPAL_EV_SYS_HI_PRI 2
|
||||
#define OPAL_EV_MSG_LO_PRI 3
|
||||
#define OPAL_EV_SYS_LO_PRI 4
|
||||
#define OPAL_EV_INFO_HI_PRI 5
|
||||
#define OPAL_EV_INFO_LO_PRI 6
|
||||
#define OPAL_EV_LOWEST_PRI 7
|
||||
|
||||
#define OPAL_EVENT_SIGNAL(ev) opal_event_get_signal(ev)
|
||||
|
||||
|
@ -87,8 +87,8 @@ AC_DEFUN([MCA_opal_event_libevent2013_CONFIG],[
|
||||
|
||||
AC_ARG_ENABLE(event-debug,
|
||||
AC_HELP_STRING([--enable-event-debug], [enable event library debug output]))
|
||||
if test "$enable_event_debug" = "no"; then
|
||||
event_args="$event_args --disable-debug-mode"
|
||||
if test "$enable_event_debug" = "yes"; then
|
||||
event_args="$event_args --enable-debug-mode"
|
||||
fi
|
||||
|
||||
AC_ARG_ENABLE(event-thread-support,
|
||||
|
@ -1519,9 +1519,6 @@ event_base_loop(struct event_base *base, int flags)
|
||||
* as we invoke user callbacks. */
|
||||
EVBASE_ACQUIRE_LOCK(base, th_base_lock);
|
||||
|
||||
/**** OMPI CHANGE ****/
|
||||
/* Disable reentrant check */
|
||||
#if 0
|
||||
if (base->running_loop) {
|
||||
event_warnx("%s: reentrant invocation. Only one event_base_loop"
|
||||
" can run on each event_base at once.", __func__);
|
||||
@ -1530,8 +1527,6 @@ event_base_loop(struct event_base *base, int flags)
|
||||
}
|
||||
|
||||
base->running_loop = 1;
|
||||
#endif
|
||||
/**** END OMPI CHANGE ****/
|
||||
|
||||
clear_time_cache(base);
|
||||
|
||||
@ -2148,14 +2143,8 @@ event_del(struct event *ev)
|
||||
int res;
|
||||
|
||||
if (EVUTIL_FAILURE_CHECK(!ev->ev_base)) {
|
||||
/**** OMPI CHANGE ****/
|
||||
/* Disable warning and return 0 */
|
||||
return 0;
|
||||
#if 0
|
||||
event_warnx("%s: event has no event_base set.", __func__);
|
||||
return -1;
|
||||
#endif
|
||||
/**** END OMPI CHANGE ****/
|
||||
}
|
||||
|
||||
EVBASE_ACQUIRE_LOCK(ev->ev_base, th_base_lock);
|
||||
|
@ -1,6 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -57,49 +59,14 @@
|
||||
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
typedef struct event opal_event_t;
|
||||
/*** Overload the event_base_t struct ***/
|
||||
/* This may (hopefully) be a temporary change
|
||||
* to deal with cross-base sync. Specifically,
|
||||
* when an event in one base needs to release
|
||||
* a condition_wait in another base, we need
|
||||
* to "wakeup" the event base in the second base
|
||||
* so the condition_wait can be checked
|
||||
*
|
||||
* On a more permanent level, use this to update
|
||||
* the event base when it is being progressed in
|
||||
* a separate thread.
|
||||
*/
|
||||
typedef struct {
|
||||
struct event_base *base;
|
||||
opal_event_t update_event;
|
||||
int update_pipe[2];
|
||||
} opal_event_base_t;
|
||||
typedef event_callback_fn opal_event_cbfunc_t;
|
||||
|
||||
typedef struct {
|
||||
opal_event_t *ev;
|
||||
uint8_t op;
|
||||
} opal_event_update_t;
|
||||
|
||||
#define OPAL_EVENT_NOOP 0x00
|
||||
#define OPAL_EVENT_ADD 0x01
|
||||
#define OPAL_EVENT_DEL 0x02
|
||||
|
||||
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
|
||||
#define OPAL_UPDATE_EVBASE(b, evt, ad)
|
||||
#else
|
||||
#define OPAL_UPDATE_EVBASE(b, evt, ad) \
|
||||
do { \
|
||||
opal_event_update_t up; \
|
||||
up.ev = (evt); \
|
||||
up.op = (ad); \
|
||||
opal_fd_write((b)->update_pipe[1], sizeof(opal_event_update_t), &up); \
|
||||
} while(0);
|
||||
#endif
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* Temporary global - will be replaced by layer-specific event bases */
|
||||
typedef struct event_base opal_event_base_t;
|
||||
typedef struct event opal_event_t;
|
||||
|
||||
OPAL_DECLSPEC extern opal_event_base_t *opal_event_base;
|
||||
|
||||
#define OPAL_EV_TIMEOUT EV_TIMEOUT
|
||||
@ -114,14 +81,19 @@ OPAL_DECLSPEC extern opal_event_base_t *opal_event_base;
|
||||
|
||||
/* Global function to create and release an event base */
|
||||
OPAL_DECLSPEC opal_event_base_t* opal_event_base_create(void);
|
||||
OPAL_DECLSPEC void opal_event_base_finalize(opal_event_base_t *base);
|
||||
|
||||
#define opal_event_base_free(x) event_base_free(x)
|
||||
|
||||
OPAL_DECLSPEC int opal_event_init(void);
|
||||
|
||||
OPAL_DECLSPEC int opal_event_reinit(opal_event_base_t *base);
|
||||
#define opal_event_reinit(b) event_reinit((b))
|
||||
|
||||
OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_base_t *evbase,
|
||||
struct timeval *tv_in);
|
||||
/* Function-like wrapper that forwards the event base (b) and the timeval (t)
 * to event_base_init_common_timeout().  The parameter list must follow the
 * macro name with no intervening whitespace; otherwise the preprocessor
 * defines an object-like macro whose replacement text starts with "(b, t)"
 * and every call site expands to invalid code. */
#define opal_event_base_init_common_timeout(b, t) event_base_init_common_timeout((b), (t))
|
||||
|
||||
/* Event priority APIs */
|
||||
#define opal_event_base_priority_init(b, n) event_base_priority_init((b), (n))
|
||||
|
||||
#define opal_event_set_priority(x, n) event_priority_set((x), (n))
|
||||
|
||||
/* thread support APIs */
|
||||
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
|
||||
@ -135,9 +107,11 @@ OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_ba
|
||||
#endif
|
||||
|
||||
/* Basic event APIs */
|
||||
#define opal_event_enable_debug_mode() event_enable_debug_mode()
|
||||
|
||||
#define opal_event_set_debug_output(x) event_set_debug_output((x))
|
||||
|
||||
#define opal_event_set(b, ev, fd, fg, cb, arg) event_assign((ev), (b)->base, (fd), (fg), (event_callback_fn) (cb), (arg))
|
||||
#define opal_event_set(b, x, fd, fg, cb, arg) event_assign((x), (b), (fd), (fg), (event_callback_fn) (cb), (arg))
|
||||
|
||||
#define opal_event_add(ev, tv) event_add((ev), (tv))
|
||||
|
||||
@ -145,39 +119,39 @@ OPAL_DECLSPEC struct timeval *opal_event_base_init_common_timeout (opal_event_ba
|
||||
|
||||
#define opal_event_active(x, y, z) event_active((x), (y), (z))
|
||||
|
||||
#define opal_event_new(b, fd, fg, cb, arg) event_new((b)->base, (fd), (fg), (event_callback_fn) (cb), (arg))
|
||||
#define opal_event_new(b, fd, fg, cb, arg) event_new((b), (fd), (fg), (event_callback_fn) (cb), (arg))
|
||||
|
||||
OPAL_DECLSPEC opal_event_t* opal_event_alloc(void);
|
||||
|
||||
#define opal_event_free(x) event_free((x))
|
||||
|
||||
/* Timer APIs */
|
||||
#define opal_event_evtimer_new(b, cb, arg) event_new((b)->base, -1, 0, (event_callback_fn) (cb), (arg))
|
||||
#define opal_event_evtimer_new(b, cb, arg) opal_event_new((b), -1, 0, (cb), (arg))
|
||||
|
||||
#define opal_event_evtimer_add(ev, tv) event_add((ev), (tv))
|
||||
#define opal_event_evtimer_add(x, tv) opal_event_add((x), (tv))
|
||||
|
||||
#define opal_event_evtimer_set(b, ev, cb, arg) event_assign((ev), (b)->base, -1, 0, (event_callback_fn) (cb), (arg))
|
||||
#define opal_event_evtimer_set(b, x, cb, arg) event_assign((x), (b), -1, 0, (event_callback_fn) (cb), (arg))
|
||||
|
||||
#define opal_event_evtimer_del(ev) event_del((ev))
|
||||
#define opal_event_evtimer_del(x) opal_event_del((x))
|
||||
|
||||
#define opal_event_evtimer_pending(ev, tv) event_pending((ev), EV_TIMEOUT, (tv))
|
||||
#define opal_event_evtimer_pending(x, tv) event_pending((x), EV_TIMEOUT, (tv))
|
||||
|
||||
#define opal_event_evtimer_initialized(ev) event_initialized((ev))
|
||||
#define opal_event_evtimer_initialized(x) event_initialized((x))
|
||||
|
||||
/* Signal APIs */
|
||||
#define opal_event_signal_add(ev, tv) event_add((ev), (tv))
|
||||
#define opal_event_signal_add(x, tv) event_add((x), (tv))
|
||||
|
||||
#define opal_event_signal_set(b, ev, fd, cb, arg) event_assign((ev), (b)->base, (fd), EV_SIGNAL|EV_PERSIST, (event_callback_fn) (cb), (arg))
|
||||
#define opal_event_signal_set(b, x, fd, cb, arg) event_assign((x), (b), (fd), EV_SIGNAL|EV_PERSIST, (event_callback_fn) (cb), (arg))
|
||||
|
||||
#define opal_event_signal_del(ev) event_del((ev))
|
||||
#define opal_event_signal_del(x) event_del((x))
|
||||
|
||||
#define opal_event_signal_pending(ev, tv) event_pending((ev), EV_SIGNAL, (tv))
|
||||
#define opal_event_signal_pending(x, tv) event_pending((x), EV_SIGNAL, (tv))
|
||||
|
||||
#define opal_event_signal_initalized(ev) event_initialized((ev))
|
||||
#define opal_event_signal_initalized(x) event_initialized((x))
|
||||
|
||||
#define opal_event_get_signal(ev) event_get_signal((ev))
|
||||
#define opal_event_get_signal(x) event_get_signal((x))
|
||||
|
||||
#define opal_event_loop(b, fg) event_base_loop((b->base), (fg))
|
||||
|
||||
#define opal_event_dispatch(b) event_base_loop((b)->base, 0)
|
||||
#define opal_event_loop(b, fg) event_base_loop((b), (fg))
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
|
@ -109,83 +109,16 @@ static const struct eventop *eventops[] = {
|
||||
|
||||
static struct event_config *config=NULL;
|
||||
|
||||
static void update_event(int fd, short flags, void* arg)
|
||||
{
|
||||
opal_event_update_t up;
|
||||
|
||||
/* read the event */
|
||||
opal_fd_read(fd, sizeof(opal_event_update_t), &up);
|
||||
if (NULL == up.ev) {
|
||||
return;
|
||||
}
|
||||
if (OPAL_EVENT_ADD == up.op) {
|
||||
event_add(up.ev, 0);
|
||||
} else if (OPAL_EVENT_DEL == up.op) {
|
||||
event_del(up.ev);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Public function -- not part of the module */
|
||||
/* This includes (hopefully) a temporary change
|
||||
* to deal with cross-base sync. Specifically,
|
||||
* when an event in one base needs to release
|
||||
* a condition_wait in another base, we need
|
||||
* to "wakeup" the event base in the second base
|
||||
* so the condition_wait can be checked
|
||||
*/
|
||||
opal_event_base_t* opal_event_base_create(void)
|
||||
{
|
||||
struct event_base *base;
|
||||
opal_event_base_t *evbase;
|
||||
opal_event_base_t *base;
|
||||
|
||||
base = event_base_new_with_config(config);
|
||||
if (NULL == base) {
|
||||
/* there is no backend method that does what we want */
|
||||
opal_output(0, "No event method available");
|
||||
return NULL;
|
||||
}
|
||||
evbase = (opal_event_base_t*)malloc(sizeof(opal_event_base_t));
|
||||
evbase->base = base;
|
||||
#ifndef __WINDOWS__
|
||||
if (pipe(evbase->update_pipe) < 0) {
|
||||
opal_output(0, "Unable to open update pipe");
|
||||
free(evbase);
|
||||
event_base_free(base);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
if (create_socketpair(AF_UNIX, SOCK_STREAM, 0, evbase->update_pipe) == -1) {
|
||||
opal_output(0, "Unable to open update socket");
|
||||
free(evbase);
|
||||
event_base_free(base);
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
event_assign(&evbase->update_event, base,
|
||||
evbase->update_pipe[0], EV_READ | EV_PERSIST,
|
||||
update_event, NULL);
|
||||
event_add(&evbase->update_event, 0);
|
||||
return evbase;
|
||||
}
|
||||
|
||||
void opal_event_base_finalize(opal_event_base_t *evbase)
|
||||
{
|
||||
/* delete the wakeup event */
|
||||
event_del(&evbase->update_event);
|
||||
#ifndef __WINDOWS__
|
||||
/* close the pipe */
|
||||
close(evbase->update_pipe[0]);
|
||||
close(evbase->update_pipe[1]);
|
||||
#else
|
||||
/* close the socket */
|
||||
closesocket(evbase->update_pipe[0]);
|
||||
closesocket(evbase->update_pipe[1]);
|
||||
#endif
|
||||
/* release the base */
|
||||
event_base_free(evbase->base);
|
||||
/* free the storage */
|
||||
free(evbase);
|
||||
return base;
|
||||
}
|
||||
|
||||
int opal_event_init(void)
|
||||
@ -304,14 +237,10 @@ int opal_event_init(void)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int opal_event_reinit(opal_event_base_t *evbase)
|
||||
opal_event_t* opal_event_alloc(void)
|
||||
{
|
||||
return event_reinit(evbase->base);
|
||||
}
|
||||
opal_event_t *ev;
|
||||
|
||||
struct timeval *opal_event_base_init_common_timeout (opal_event_base_t *evbase,
|
||||
struct timeval *tv_in)
|
||||
{
|
||||
return (struct timeval*)event_base_init_common_timeout (evbase->base, tv_in);
|
||||
ev = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
return ev;
|
||||
}
|
||||
|
||||
|
@ -1456,5 +1456,6 @@ char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality)
|
||||
ptr->buffers[ptr->cntr][idx++] = 'K';
|
||||
ptr->buffers[ptr->cntr][idx++] = '\0';
|
||||
}
|
||||
|
||||
return ptr->buffers[ptr->cntr];
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -31,7 +31,6 @@
|
||||
#include "opal/util/net.h"
|
||||
#include "opal/util/keyval_parse.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/memoryhooks/memory.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
@ -87,9 +86,6 @@ opal_finalize_util(void)
|
||||
/* finalize the trace system */
|
||||
opal_trace_finalize();
|
||||
|
||||
/* finalize the OPAL SOS system */
|
||||
opal_sos_finalize();
|
||||
|
||||
/* finalize the show_help system */
|
||||
opal_show_help_finalize();
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2007-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2010 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -60,7 +60,6 @@
|
||||
#include "opal/util/stacktrace.h"
|
||||
#include "opal/util/keyval_parse.h"
|
||||
#include "opal/util/sys_limits.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
|
||||
#if OPAL_CC_USE_PRAGMA_IDENT
|
||||
#pragma ident OPAL_IDENT_STRING
|
||||
@ -78,7 +77,7 @@ opal_err2str(int errnum, const char **errmsg)
|
||||
{
|
||||
const char *retval;
|
||||
|
||||
switch (OPAL_SOS_GET_ERROR_CODE(errnum)) {
|
||||
switch (errnum) {
|
||||
case OPAL_SUCCESS:
|
||||
retval = "Success";
|
||||
break;
|
||||
@ -255,9 +254,6 @@ opal_init_util(int* pargc, char*** pargv)
|
||||
/* initialize the help system */
|
||||
opal_show_help_init();
|
||||
|
||||
/* initialize the OPAL SOS system */
|
||||
opal_sos_init();
|
||||
|
||||
/* register handler for errnum -> string conversion */
|
||||
if (OPAL_SUCCESS !=
|
||||
(ret = opal_error_register("OPAL",
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
SUBDIRS = keyval
|
||||
|
||||
dist_pkgdata_DATA = help-opal-util.txt opal_sos_reporter.txt
|
||||
dist_pkgdata_DATA = help-opal-util.txt
|
||||
|
||||
AM_LFLAGS = -Popal_show_help_yy
|
||||
LEX_OUTPUT_ROOT = lex.opal_show_help_yy
|
||||
@ -49,7 +49,6 @@ headers = \
|
||||
opal_environ.h \
|
||||
opal_getcwd.h \
|
||||
opal_pty.h \
|
||||
opal_sos.h \
|
||||
os_dirpath.h \
|
||||
os_path.h \
|
||||
output.h \
|
||||
@ -82,7 +81,6 @@ libopalutil_la_SOURCES = \
|
||||
opal_environ.c \
|
||||
opal_getcwd.c \
|
||||
opal_pty.c \
|
||||
opal_sos.c \
|
||||
os_dirpath.c \
|
||||
os_path.c \
|
||||
output.c \
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* Copyright (c) 2007-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -30,7 +30,6 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/constants.h"
|
||||
|
||||
#define MAX_CONVERTERS 5
|
||||
@ -99,12 +98,12 @@ opal_perror(int errnum, const char *msg)
|
||||
const char* errmsg;
|
||||
ret = opal_strerror_int(errnum, &errmsg);
|
||||
|
||||
if (NULL != msg && OPAL_SOS_GET_ERROR_CODE(errnum) != OPAL_ERR_IN_ERRNO) {
|
||||
if (NULL != msg && errnum != OPAL_ERR_IN_ERRNO) {
|
||||
fprintf(stderr, "%s: ", msg);
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
|
||||
if (errnum == OPAL_ERR_IN_ERRNO) {
|
||||
perror(msg);
|
||||
} else {
|
||||
char *ue_msg;
|
||||
@ -129,7 +128,7 @@ opal_strerror(int errnum)
|
||||
int ret;
|
||||
const char* errmsg;
|
||||
|
||||
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
|
||||
if (errnum == OPAL_ERR_IN_ERRNO) {
|
||||
return strerror(errno);
|
||||
}
|
||||
|
||||
@ -156,7 +155,7 @@ opal_strerror_r(int errnum, char *strerrbuf, size_t buflen)
|
||||
|
||||
ret = opal_strerror_int(errnum, &errmsg);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
if (OPAL_SOS_GET_ERROR_CODE(errnum) == OPAL_ERR_IN_ERRNO) {
|
||||
if (errnum == OPAL_ERR_IN_ERRNO) {
|
||||
char *tmp = strerror(errno);
|
||||
strncpy(strerrbuf, tmp, buflen);
|
||||
return OPAL_SUCCESS;
|
||||
|
@ -1,535 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#ifdef HAVE_STDARG_H
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/opal_sos.h"
|
||||
#include "opal/constants.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/util/stacktrace.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
/** Global variables */
|
||||
opal_hash_table_t opal_sos_table;
|
||||
opal_mutex_t opal_sos_table_lock;
|
||||
bool opal_sos_print_low;
|
||||
|
||||
/* Local variables */
|
||||
static bool opal_sos_initialized = false;
|
||||
static const char *dash_line = "--------------------------------------------------------------------------";
|
||||
static const char *stackhdr = "[STACK TRACE]:\n";
|
||||
|
||||
/* Local functions */
|
||||
static void opal_sos_error_construct(opal_sos_error_t *obj);
|
||||
static void opal_sos_error_destruct(opal_sos_error_t *obj);
|
||||
|
||||
/** OPAL SOS callback function pointers */
|
||||
static opal_sos_print_callback_fn_t cur_print_callback;
|
||||
static opal_sos_reporter_callback_fn_t cur_reporter_callback;
|
||||
/* static opal_sos_print_callback_fn_t prev_print_callback; */
|
||||
static opal_sos_reporter_callback_fn_t prev_reporter_callback;
|
||||
|
||||
OBJ_CLASS_INSTANCE(opal_sos_error_t,
|
||||
opal_object_t,
|
||||
opal_sos_error_construct,
|
||||
opal_sos_error_destruct);
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*/
|
||||
static void opal_sos_error_construct(opal_sos_error_t *obj)
|
||||
{
|
||||
obj->errnum = 0;
|
||||
obj->file = NULL;
|
||||
obj->line = 0;
|
||||
obj->func = NULL;
|
||||
obj->msg = NULL;
|
||||
obj->prev = obj->next = OPAL_SOS_ERR_BASE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
static void opal_sos_error_destruct(opal_sos_error_t *obj)
|
||||
{
|
||||
if (NULL != obj->file) {
|
||||
free(obj->file);
|
||||
}
|
||||
|
||||
if (NULL != obj->func) {
|
||||
free(obj->func);
|
||||
}
|
||||
|
||||
if (NULL != obj->msg) {
|
||||
free(obj->msg);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the OPAL SOS interface
|
||||
*
|
||||
*/
|
||||
void opal_sos_init(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
if (opal_sos_initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("opal", "sos_print_low",
|
||||
"Set to non-zero to enable the print-at-bottom"
|
||||
" preference for OPAL SOS. Enabling this option prints"
|
||||
" out the errors, warnings or info messages as"
|
||||
" soon as they are encountered.",
|
||||
false, false, (int)false, &value);
|
||||
|
||||
opal_sos_print_low = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
OBJ_CONSTRUCT(&opal_sos_table, opal_hash_table_t);
|
||||
opal_hash_table_init(&opal_sos_table, OPAL_SOS_ERR_TABLE_SIZE);
|
||||
OBJ_CONSTRUCT(&opal_sos_table_lock, opal_mutex_t);
|
||||
|
||||
opal_sos_reg_reporter_callback(opal_sos_print_error, &prev_reporter_callback);
|
||||
opal_sos_initialized = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalize the OPAL SOS interface
|
||||
*
|
||||
*/
|
||||
void opal_sos_finalize(void)
|
||||
{
|
||||
OBJ_DESTRUCT(&opal_sos_table);
|
||||
OBJ_DESTRUCT(&opal_sos_table_lock);
|
||||
opal_sos_initialized = false;
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Free all the SOS errors represented by the error code pointed to by \c errnum
|
||||
*
|
||||
*/
|
||||
void opal_sos_free(int *errnum)
|
||||
{
|
||||
opal_sos_error_t *opal_error, *attached_error;
|
||||
int err, attached_errnum;
|
||||
|
||||
if (NULL == errnum) {
|
||||
return;
|
||||
} else if (true == OPAL_SOS_IS_NATIVE(*errnum)) {
|
||||
return;
|
||||
} else {
|
||||
err = *errnum;
|
||||
}
|
||||
|
||||
*errnum = OPAL_SOS_GET_ERROR_CODE(err);
|
||||
|
||||
do {
|
||||
/* Look for attached errors */
|
||||
if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(err))) {
|
||||
OPAL_THREAD_LOCK(&opal_sos_table_lock);
|
||||
if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table,
|
||||
attached_errnum,
|
||||
(void **)&attached_error)) {
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
|
||||
/* If there's an attached error trace, free it! */
|
||||
if (NULL != attached_error) {
|
||||
attached_errnum = attached_error->errnum;
|
||||
opal_sos_free(&attached_errnum);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&opal_sos_table_lock);
|
||||
if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table,
|
||||
OPAL_SOS_GET_INDEX(err),
|
||||
(void **)&opal_error)) {
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
if (NULL == opal_error) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
opal_sos_error_destruct(opal_error);
|
||||
/* Remove the entry from the SOS table */
|
||||
OPAL_THREAD_LOCK(&opal_sos_table_lock);
|
||||
opal_hash_table_remove_value_uint32(&opal_sos_table, OPAL_SOS_GET_INDEX(err));
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
|
||||
err = opal_error->prev;
|
||||
} while (OPAL_SOS_ERR_BASE != err);
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
}
|
||||
|
||||
opal_sos_error_t *
|
||||
opal_sos_build_error(int errnum, bool show_stack, const char *errmsg, ...)
|
||||
{
|
||||
opal_sos_error_t *opal_error;
|
||||
char *stackframe, msg[OPAL_SOS_MAX_ERR_LEN];
|
||||
va_list arglist;
|
||||
int ret_errno = 0, len;
|
||||
|
||||
if (!opal_sos_initialized) {
|
||||
opal_sos_init();
|
||||
}
|
||||
|
||||
opal_error = OBJ_NEW(opal_sos_error_t);
|
||||
if (NULL == opal_error) {
|
||||
return NULL; /* OPAL_ERR_OUT_OF_RESOURCE */
|
||||
}
|
||||
|
||||
va_start(arglist, errmsg);
|
||||
len = vsnprintf(msg, OPAL_SOS_MAX_ERR_LEN, errmsg, arglist);
|
||||
va_end(arglist);
|
||||
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
|
||||
if ((true == show_stack) &&
|
||||
(NULL != (stackframe = opal_stackframe_output_string()))) {
|
||||
len += strlen(stackhdr) + strlen(stackframe) + 2;
|
||||
if (len > OPAL_SOS_MAX_ERR_LEN)
|
||||
len = OPAL_SOS_MAX_ERR_LEN;
|
||||
|
||||
opal_error->msg = (char *) malloc(len);
|
||||
if (NULL == opal_error->msg) {
|
||||
return NULL;
|
||||
}
|
||||
snprintf(opal_error->msg, len, "%s\n%s%s", msg, stackhdr, stackframe);
|
||||
} else {
|
||||
opal_error->msg = strdup(msg);
|
||||
}
|
||||
#else
|
||||
opal_error->msg = strdup ("OPAL_WANT_PRETTY_PRINT_STACKTRACE disabled");
|
||||
#endif
|
||||
|
||||
/* Check if errnum is a native error code and encode it into
|
||||
the encoded error code if it is native */
|
||||
if (OPAL_SOS_IS_NATIVE(errnum)) {
|
||||
OPAL_SOS_SET_ERROR_CODE(ret_errno, errnum);
|
||||
} else {
|
||||
/* Extract the native error code from the encoded error and
|
||||
encode it back again into the newly encoded error code */
|
||||
OPAL_SOS_SET_ERROR_CODE(ret_errno, OPAL_SOS_GET_ERROR_CODE(errnum));
|
||||
opal_error->prev = errnum;
|
||||
}
|
||||
|
||||
opal_error->errnum = ret_errno;
|
||||
return opal_error;
|
||||
}
|
||||
|
||||
int opal_sos_reporter(const char *file, int line, const char *func,
|
||||
opal_sos_severity_t severity, opal_sos_error_t *opal_error)
|
||||
{
|
||||
opal_sos_error_t *prev_error;
|
||||
int ret_errno = 0, hash;
|
||||
|
||||
if (NULL == opal_error) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* Doing more strict validation here since if either of the file,
|
||||
* func or msg are not known we replace it by <unknown> to avoid any issues
|
||||
* during dss pack/unpack
|
||||
*/
|
||||
opal_error->file = (NULL != file)?strdup(file):strdup("<unknown>");
|
||||
opal_error->func = (NULL != func)?strdup(func):strdup("<unknown>");
|
||||
opal_error->line = line;
|
||||
|
||||
ret_errno = opal_error->errnum;
|
||||
/* Encode the severity level into the return error code */
|
||||
OPAL_SOS_SET_SEVERITY(ret_errno, severity);
|
||||
hash = opal_sos_hash_error(opal_error);
|
||||
OPAL_SOS_SET_INDEX(ret_errno, hash);
|
||||
opal_error->errnum = ret_errno;
|
||||
|
||||
if (opal_sos_print_low) {
|
||||
opal_sos_report_error(opal_error);
|
||||
}
|
||||
|
||||
/* Add the error object to the error table */
|
||||
OPAL_THREAD_LOCK(&opal_sos_table_lock);
|
||||
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_set_value_uint32(&opal_sos_table,
|
||||
OPAL_SOS_GET_INDEX(ret_errno),
|
||||
(void *)opal_error)) {
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
OBJ_DESTRUCT(opal_error);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* Get the previous error in the error call stack and update
|
||||
its next error pointer */
|
||||
prev_error = NULL;
|
||||
opal_hash_table_get_value_uint32(&opal_sos_table,
|
||||
OPAL_SOS_GET_INDEX(opal_error->prev),
|
||||
(void **)&prev_error);
|
||||
if (NULL != prev_error) {
|
||||
prev_error->next = opal_error->errnum;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
|
||||
return ret_errno;
|
||||
}
|
||||
|
||||
void
|
||||
opal_sos_report_error(opal_sos_error_t *error)
|
||||
{
|
||||
opal_sos_severity_t severity;
|
||||
char *pretty_error;
|
||||
int errnum, ret;
|
||||
|
||||
if (NULL == error)
|
||||
return;
|
||||
|
||||
severity = (opal_sos_severity_t)OPAL_SOS_GET_SEVERITY(error->errnum);
|
||||
|
||||
/* An OPAL SOS encoded error number holds no meaning outside
|
||||
* the context of Open MPI. We convert it back to the native
|
||||
* error code before reporting it. */
|
||||
if (true == OPAL_SOS_IS_NATIVE(error->errnum)) {
|
||||
errnum = error->errnum;
|
||||
} else {
|
||||
errnum = OPAL_SOS_GET_ERROR_CODE(error->errnum);
|
||||
}
|
||||
|
||||
/* Prettify the error for printing it locally */
|
||||
ret = opal_sos_prettify_error(error->msg, &pretty_error);
|
||||
|
||||
(*cur_reporter_callback)(severity, errnum, "<%s> at %s:%d:%s():\n%s",
|
||||
opal_sos_severity2str(severity), error->file,
|
||||
error->line, error->func,
|
||||
((0 > ret) ? error->msg : pretty_error));
|
||||
|
||||
if (ret > 0) {
|
||||
free(pretty_error);
|
||||
}
|
||||
|
||||
/* Call the previous reporter callback which should be the selected
|
||||
* ORTE notifier components */
|
||||
if (NULL != prev_reporter_callback) {
|
||||
prev_reporter_callback(severity, errnum, "<%s> at %s:%d:%s():\n%s",
|
||||
opal_sos_severity2str(severity), error->file,
|
||||
error->line, error->func, error->msg);
|
||||
}
|
||||
}
|
||||
|
||||
void opal_sos_print(int errnum, bool show_history)
|
||||
{
|
||||
opal_sos_error_t *opal_error, *prev_opal_error, *attached_error;
|
||||
int tmp, attached_errnum, prev_severity, severity;
|
||||
|
||||
opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line);
|
||||
tmp = errnum;
|
||||
prev_opal_error = NULL;
|
||||
do {
|
||||
/* If there is an error attached to this error, print it out. */
|
||||
if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(errnum))) {
|
||||
OPAL_THREAD_LOCK(&opal_sos_table_lock);
|
||||
if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table,
|
||||
attached_errnum,
|
||||
(void **)&attached_error)) {
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
|
||||
if (NULL != attached_error) {
|
||||
opal_sos_print(attached_error->errnum, show_history);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&opal_sos_table_lock);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_get_value_uint32(&opal_sos_table,
|
||||
OPAL_SOS_GET_INDEX(errnum),
|
||||
(void **)&opal_error)) {
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
if (NULL == opal_error) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != prev_opal_error) {
|
||||
prev_severity = OPAL_SOS_GET_SEVERITY(prev_opal_error->errnum);
|
||||
severity = OPAL_SOS_GET_SEVERITY(errnum);
|
||||
|
||||
/* If show_history is enabled, or if the preceeding error
|
||||
was of higher severity, then report the error */
|
||||
if (show_history || (prev_severity <= severity))
|
||||
/* Print the error denoted by errnum. */
|
||||
opal_sos_report_error(prev_opal_error);
|
||||
}
|
||||
|
||||
prev_opal_error = opal_error;
|
||||
/* Get the previous error */
|
||||
errnum = opal_error->prev;
|
||||
/* Terminating condition */
|
||||
if (OPAL_SOS_ERR_BASE == errnum) {
|
||||
opal_sos_report_error(opal_error);
|
||||
}
|
||||
} while (errnum != OPAL_SOS_ERR_BASE);
|
||||
opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line);
|
||||
errnum = tmp;
|
||||
return;
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
|
||||
}
|
||||
|
||||
/**
 * Default reporter callback: prints one error through the show-help
 * subsystem, using the "general message" topic of opal_sos_reporter.txt.
 *
 * Only the variadic arguments are forwarded; \c severity and \c errnum
 * are not used here and \c errmsg merely anchors the argument list.
 */
void opal_sos_print_error(opal_sos_severity_t severity, int errnum, const char *errmsg, ...)
{
    va_list ap;

    (void) severity;
    (void) errnum;

    va_start(ap, errmsg);
    opal_show_vhelp("opal_sos_reporter.txt", "general message", false, ap);
    va_end(ap);
}
|
||||
|
||||
/**
 * Log an encoded error: print it without its full history, then free
 * every error object associated with it.
 */
void opal_sos_log(int errnum)
{
    /* opal_sos_free() rewrites the code it is given, so work on a copy */
    int code = errnum;

    opal_sos_print(code, false);
    opal_sos_free(&code);
}
|
||||
|
||||
int opal_sos_prettify_error(const char *error, char **pretty_error)
|
||||
{
|
||||
char *str, *token, *saveptr, *errdup;
|
||||
const char *prefix = "\n| | ";
|
||||
int len = 0, plen, left;
|
||||
|
||||
if (NULL == error) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
*pretty_error = (char *) malloc(OPAL_SOS_MAX_ERR_LEN);
|
||||
if (NULL == *pretty_error) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
*(*pretty_error) = '\0';
|
||||
|
||||
plen = strlen(prefix);
|
||||
|
||||
if (NULL != (errdup = strdup(error))) {
|
||||
for (str = errdup, len = 0; len < OPAL_SOS_MAX_ERR_LEN; str = NULL) {
|
||||
if (NULL == (token = strtok_r(str, "\n", &saveptr))) {
|
||||
break;
|
||||
}
|
||||
|
||||
left = strlen(token);
|
||||
if ((len + left) > OPAL_SOS_MAX_ERR_LEN) {
|
||||
left = OPAL_SOS_MAX_ERR_LEN - len;
|
||||
}
|
||||
strncat(*pretty_error, token, left);
|
||||
len += left;
|
||||
|
||||
left = plen;
|
||||
if ((len + left) > OPAL_SOS_MAX_ERR_LEN) {
|
||||
left = OPAL_SOS_MAX_ERR_LEN - len;
|
||||
}
|
||||
strncat(*pretty_error, prefix, left);
|
||||
len += left;
|
||||
}
|
||||
free(errdup);
|
||||
errdup = NULL;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
 * Map an SOS severity level to its display name.
 *
 * The returned string is a literal and must not be freed.  Values that do
 * not match any known level yield "UNKNOWN ERROR".
 */
const char *opal_sos_severity2str(opal_sos_severity_t severity)
{
    const char *label = "UNKNOWN ERROR";

    switch (severity) {
    case OPAL_SOS_SEVERITY_EMERG:
        label = "EMERGENCY";
        break;
    case OPAL_SOS_SEVERITY_ALERT:
        label = "ALERT MESSAGE";
        break;
    case OPAL_SOS_SEVERITY_CRIT:
        label = "CRITICAL MESSAGE";
        break;
    case OPAL_SOS_SEVERITY_ERROR:
        label = "ERROR";
        break;
    case OPAL_SOS_SEVERITY_WARN:
        label = "WARNING";
        break;
    case OPAL_SOS_SEVERITY_NOTICE:
        label = "NOTICE";
        break;
    case OPAL_SOS_SEVERITY_INFO:
        label = "INFO MESSAGE";
        break;
    case OPAL_SOS_SEVERITY_DEBUG:
        label = "DEBUG MESSAGE";
        break;
    default:
        break;
    }

    return label;
}
|
||||
|
||||
/**
 * Compute the error-table index for an error object.
 *
 * This is a naive additive hash: the object's error number, plus the
 * length of the file name, plus the length of the function name, plus the
 * sum of the characters of the message.  The sum is reduced to the table
 * size with a bit mask, so different errors can map to the same index.
 */
int opal_sos_hash_error(opal_sos_error_t *error)
{
    const char *cursor;
    int key = error->errnum;

    if (NULL != error->file) {
        key += strlen(error->file);
    }

    if (NULL != error->func) {
        key += strlen(error->func);
    }

    if (NULL != error->msg) {
        for (cursor = error->msg; '\0' != *cursor; ++cursor) {
            key += *cursor;
        }
    }

    return (key & (OPAL_SOS_ERR_TABLE_SIZE - 1));
}
|
||||
|
||||
/**
 * Install a new print callback.
 *
 * @param new_func   callback to install
 * @param prev_func  out: receives the callback that was installed before;
 *                   may be NULL when the caller is not interested in it
 *
 * @return OPAL_SUCCESS
 */
int opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func,
                                opal_sos_print_callback_fn_t *prev_func)
{
    /* Preserve the previous print callback for the caller */
    if (NULL != prev_func) {
        *prev_func = cur_print_callback;
    }

    /* Update the current print callback */
    cur_print_callback = new_func;
    return OPAL_SUCCESS;
}
|
||||
|
||||
/**
 * Install a new reporter callback.
 *
 * @param new_func   callback to install
 * @param prev_func  out: receives the callback that was installed before;
 *                   may be NULL when the caller is not interested in it
 *
 * @return OPAL_SUCCESS
 */
int opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func,
                                   opal_sos_reporter_callback_fn_t *prev_func)
{
    /* Preserve the previous reporter callback for the caller */
    if (NULL != prev_func) {
        *prev_func = cur_reporter_callback;
    }

    /* Update the current reporter callback */
    cur_reporter_callback = new_func;
    return OPAL_SUCCESS;
}
|
@ -1,441 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OPAL_SOS_H
|
||||
#define OPAL_SOS_H
|
||||
|
||||
#ifdef HAVE_LIMITS_H
|
||||
#include <limits.h>
|
||||
#endif
|
||||
#ifdef HAVE_SYSLOG_H
|
||||
#include <syslog.h>
|
||||
#endif
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#ifdef __STDC_VERSION__
|
||||
# if __STDC_VERSION__ < 199901L
|
||||
# if defined(__GNUC__) && __GNUC__ >= 2
|
||||
# define OPAL_SOS_FUNCTION __FUNCTION__
|
||||
# else
|
||||
# define OPAL_SOS_FUNCTION "<unknown>"
|
||||
# endif
|
||||
# else
|
||||
# define OPAL_SOS_FUNCTION __func__
|
||||
# endif
|
||||
#else
|
||||
# define OPAL_SOS_FUNCTION __func__
|
||||
#endif
|
||||
|
||||
/* Internal use only */
|
||||
#define OPAL_SOS_ERR_BASE OPAL_SUCCESS
|
||||
|
||||
/**
|
||||
* Size of the OPAL SOS error table.
|
||||
*
|
||||
* Since the index into the error table that is encoded in the error
|
||||
* code is 9-bit long, setting a higher value than (1 << 9) would make
|
||||
* no difference at all.
|
||||
*/
|
||||
#define OPAL_SOS_ERR_TABLE_SIZE 512
|
||||
|
||||
/**
|
||||
* Maximum length for the error string stored per error code in the
|
||||
* OPAL SOS error table.
|
||||
*/
|
||||
#define OPAL_SOS_MAX_ERR_LEN 1024
|
||||
|
||||
/**
|
||||
* Reports an error to OPAL SOS reporter.
|
||||
*
|
||||
* Encodes an informational message with severity \c severity and
|
||||
* other passed arguments like errnum, errmsg etc. It also remembers
|
||||
* the line number, file name and the function name where the error
|
||||
* has occurred.
|
||||
* If the MCA parameter \c opal_sos_print_low is set, the error message
|
||||
* is displayed on stderr using the "show help" subsystem. By default,
|
||||
* informational messages are not printed out on stderr.
|
||||
* If \c show_stack is set, the stacktrace is saved and/or printed
|
||||
* along with the corresponding \c errmsg.
|
||||
*/
|
||||
#define OPAL_SOS_REPORT(severity, arg) opal_sos_reporter(__FILE__, __LINE__, \
|
||||
OPAL_SOS_FUNCTION, \
|
||||
severity, \
|
||||
opal_sos_build_error arg)
|
||||
|
||||
/**
|
||||
* Print or store an event with the maximum severity (EMERG).
|
||||
*/
|
||||
#define OPAL_SOS_EMERG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_EMERG, arg)
|
||||
|
||||
/**
|
||||
* Report an event of severity "ALERT".
|
||||
*/
|
||||
#define OPAL_SOS_ALERT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ALERT, arg)
|
||||
|
||||
/**
|
||||
* Report events with severity marked as "CRITICAL".
|
||||
*/
|
||||
#define OPAL_SOS_CRIT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_CRIT, arg)
|
||||
|
||||
/**
|
||||
* Prints and/or logs an error.
|
||||
* This function can be used to log or print error events.
|
||||
*/
|
||||
#define OPAL_SOS_ERROR(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ERROR, arg)
|
||||
|
||||
/**
|
||||
* Prints and/or logs a warning.
|
||||
*
|
||||
* This function is similar to OPAL_SOS_INFO but with a higher
|
||||
* severity. These events are printed out on the output stream
|
||||
* by default.
|
||||
*/
|
||||
#define OPAL_SOS_WARN(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_WARN, arg)
|
||||
|
||||
/**
|
||||
* Report an error event with severity "NOTICE".
|
||||
*/
|
||||
#define OPAL_SOS_NOTICE(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_NOTICE,arg)
|
||||
|
||||
/**
|
||||
* Prints or logs an informational message in the OPAL SOS framework.
|
||||
* Events with this severity are not printed, by default. However,
|
||||
* they are still stored in the SOS table.
|
||||
*/
|
||||
#define OPAL_SOS_INFO(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_INFO, arg)
|
||||
|
||||
/**
|
||||
* Log debug events in the SOS framework.
|
||||
*/
|
||||
#define OPAL_SOS_DEBUG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_DEBUG, arg)
|
||||
|
||||
/**
|
||||
* Frees all the (entire stack of) OPAL SOS error objects associated
|
||||
* with the encoded error code obtained after dereferencing the
|
||||
* pointer \c errnum.
|
||||
*/
|
||||
#define OPAL_SOS_FREE(perrnum) opal_sos_free(perrnum)
|
||||
|
||||
/**
|
||||
* Print the warnings/errors/informational messages previously logged
|
||||
* in to the SOS framework.
|
||||
*
|
||||
* This function prints the error details encoded by \c errnum.
|
||||
* If \c show_history is true, the entire history for the error
|
||||
* represented by \c errnum is printed on the output stream.
|
||||
*/
|
||||
#define OPAL_SOS_PRINT(errnum, show_history) \
|
||||
opal_sos_print(errnum, show_history)
|
||||
|
||||
/**
 * Attach the history from one error code to another error code.
 *
 * Stores the table index of \c errnum in the attachment field of
 * \c errtgt (mask 0xFF80000 of the negated code) and yields the updated
 * \c errtgt.  \c errtgt must be a modifiable lvalue; both arguments may
 * be evaluated more than once, so they must be free of side effects.
 */
#define OPAL_SOS_ATTACH(errtgt, errnum)                                  \
    ((errtgt) = -((-(errtgt) & ~0xFF80000L) |                            \
                  ((OPAL_SOS_GET_INDEX((errnum)) & 0x1FFL) * 0x80000L)))

/**
 * Returns the index of the error attached to errnum using
 * OPAL_SOS_ATTACH(), or 0 when nothing is attached.
 */
#define OPAL_SOS_GET_ATTACHED_INDEX(errnum)                              \
    ((int) ((-(errnum) & 0xFF80000L) >> 19))
|
||||
|
||||
/**
 * Returns the native error code for the given encoded error code \c
 * errnum. \c errnum can be a native error code itself.
 *
 * Non-negative values are returned unchanged; for negative values the
 * lower 10 bits of the negated value are extracted and negated again.
 * \c errnum is evaluated more than once, so it must be free of side
 * effects.
 */
#define OPAL_SOS_GET_ERROR_CODE(errnum)                                  \
    (((errnum) >= 0) ? (errnum) : (int) -(-(errnum) & 0x3FFL))

/**
 * Sets the native error code for the potentially encoded error code.
 *
 * The lower 10 bits are reserved for the native error code. This
 * macro sets the lower 10 bits of errnum to nativeerr.  \c errnum must
 * be a modifiable lvalue.
 */
#define OPAL_SOS_SET_ERROR_CODE(errnum, nativeerr)                       \
    ((errnum) = -((-(errnum) & ~0x3FFL) | (-(nativeerr) & 0x3FFL)))

/**
 * Macro to check if the error encoded by \c errnum is a native error
 * or an OPAL SOS encoded error.  A code counts as native when its
 * negated value has no bits set above the lower 10.
 */
#define OPAL_SOS_IS_NATIVE(errnum) ((-(errnum) & ~0x3FFL) == 0)
|
||||
|
||||
/**
 * Returns the severity level for the potentially encoded error code.
 *
 * The severity is encoded in the last three bits of the first nibble
 * (mask 0x70000000 of the negated code).
 */
#define OPAL_SOS_GET_SEVERITY(errnum) ((int)((-(errnum) >> 28) & 0x7L))

/**
 * Sets the severity level for the given error code \c errnum.
 *
 * This macro does not do strict error checking of the specified
 * severity level: only its lower three bits are stored.  \c errnum must
 * be a modifiable lvalue.
 */
#define OPAL_SOS_SET_SEVERITY(errnum, severity)                          \
    ((errnum) = -((-(errnum) & ~0x70000000L) |                           \
                  (((severity) & 0x7L) * 0x10000000L)))
|
||||
|
||||
/**
|
||||
* Macro to get the encoded error severity level as a string.
|
||||
*
|
||||
* This macro accepts the argument \c severity and calls the corresponding
|
||||
* function opal_sos_severity2str to convert it to a string. The result
|
||||
* is returned in a static buffer that should not be freed with free().
|
||||
*/
|
||||
#define OPAL_SOS_SEVERITY2STR(severity) opal_sos_severity2str(severity)
|
||||
|
||||
/**
|
||||
* Log an encoded error \c errnum.
|
||||
*
|
||||
* This macro prints out and consequently frees the entire stack of
|
||||
* errors associated with the \c errnum.
|
||||
*/
|
||||
#define OPAL_SOS_LOG(errnum) opal_sos_log(errnum)
|
||||
|
||||
/**
 * \internal
 * Returns the index into the error table of the error encoded by \c errnum.
 *
 * The index is 9 bits long and occupies bits 10 to 18 (counting from
 * zero, mask 0x7FC00) of the negated error code.
 */
#define OPAL_SOS_GET_INDEX(errnum) ((int)((-(errnum) & 0x7FC00L) >> 10))

/**
 * \internal
 * Sets the index into the error table for the error encoded by \c errnum.
 *
 * Only the lower 9 bits of \c index are stored.  \c errnum must be a
 * modifiable lvalue.
 */
#define OPAL_SOS_SET_INDEX(errnum, index)                                \
    ((errnum) = -((-(errnum) & ~0x7FC00L) | (((index) & 0x1FFL) * 0x400L)))
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/** This MCA parameter sos_print_low can be set to non-zero to enable
|
||||
* the print-at-bottom preference for OPAL SOS. */
|
||||
OPAL_DECLSPEC extern bool opal_sos_print_low;
|
||||
|
||||
/* Severity levels for OPAL SOS */
|
||||
typedef enum {
|
||||
OPAL_SOS_SEVERITY_EMERG = LOG_EMERG,
|
||||
OPAL_SOS_SEVERITY_ALERT = LOG_ALERT,
|
||||
OPAL_SOS_SEVERITY_CRIT = LOG_CRIT,
|
||||
OPAL_SOS_SEVERITY_ERROR = LOG_ERR,
|
||||
OPAL_SOS_SEVERITY_WARN = LOG_WARNING,
|
||||
OPAL_SOS_SEVERITY_NOTICE = LOG_NOTICE,
|
||||
OPAL_SOS_SEVERITY_INFO = LOG_INFO,
|
||||
OPAL_SOS_SEVERITY_DEBUG = LOG_DEBUG
|
||||
} opal_sos_severity_t;
|
||||
|
||||
typedef struct opal_sos_error_t {
|
||||
/** Class parent */
|
||||
opal_object_t super;
|
||||
|
||||
/**
|
||||
* The encoded error code for a given type of error.
|
||||
*
|
||||
* errnum encodes a native error code (lower 10 bits) with the
|
||||
* current severity (3 bits, 28 to 30) and an index into the error
|
||||
* table along with the associated error, if there is one.
|
||||
*/
|
||||
int errnum;
|
||||
|
||||
/** File in which the error occurred */
|
||||
char *file;
|
||||
|
||||
/** Line number on which the error was encountered */
|
||||
int line;
|
||||
|
||||
/** This is an optional parameter that indicates the function in
|
||||
which the error occurred */
|
||||
char *func;
|
||||
|
||||
/** The actual error message or string for the error indicated by
|
||||
\c errnum */
|
||||
char *msg;
|
||||
|
||||
/** Encoded error numbers of the previous and the next error.
|
||||
These are used to maintain the history of an error.
|
||||
The complete history of an error can be printed later using
|
||||
OPAL_SOS_PRINT() */
|
||||
int prev;
|
||||
int next;
|
||||
} opal_sos_error_t;
|
||||
|
||||
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_sos_error_t);
|
||||
|
||||
/**
|
||||
* Signature for OPAL SOS print function callback type.
|
||||
*/
|
||||
typedef void (*opal_sos_print_callback_fn_t) (int errcode);
|
||||
|
||||
/**
|
||||
* Signature for OPAL SOS reporter function callback type.
|
||||
*/
|
||||
typedef void (*opal_sos_reporter_callback_fn_t) (opal_sos_severity_t severity, int errcode,
|
||||
const char *msg, ...)
|
||||
__opal_attribute_format_funcptr__(__printf__, 3, 4);
|
||||
|
||||
/**
|
||||
* A global handle that points to the local OPAL SOS table.
|
||||
* This is used by the notifier components to reference the local OPAL
|
||||
* SOS table, especially for packing/unpacking and sending it over to
|
||||
* the HNP.
|
||||
*/
|
||||
OPAL_DECLSPEC extern opal_hash_table_t opal_sos_table;
|
||||
|
||||
/**
|
||||
* A global handle that points to the OPAL SOS table lock.
|
||||
*
|
||||
*/
|
||||
OPAL_DECLSPEC extern opal_mutex_t opal_sos_table_lock;
|
||||
|
||||
/**
|
||||
* \internal
|
||||
*
|
||||
* Initialize OPAL SOS.
|
||||
*
|
||||
* This function initializes and sets up the structures required to
|
||||
* track the data handled by OPAL SOS. It is invoked by
|
||||
* opal_init_util().
|
||||
*/
|
||||
void opal_sos_init(void);
|
||||
|
||||
/**
|
||||
* \internal
|
||||
*
|
||||
* Shut down OPAL SOS.
|
||||
*
|
||||
* Invoked by opal_finalize() to deallocate the structures needed by
|
||||
* OPAL SOS.
|
||||
*/
|
||||
void opal_sos_finalize(void);
|
||||
|
||||
/**
|
||||
* Prints or relays the error locally or using the selected notifier
|
||||
* components.
|
||||
*/
|
||||
void
|
||||
opal_sos_report_error(opal_sos_error_t *error);
|
||||
|
||||
/**
|
||||
* Builds an OPAL SOS error object given the parameters errnum,
|
||||
* show_stack and errmsg.
|
||||
* NOTE: This function only partially populates the SOS error object
|
||||
* structure, setting the error message details but nothing about where
|
||||
* the error occurred. Filling up the rest of the error object is left
|
||||
* to OPAL SOS reporter which then handles the error appropriately.
|
||||
*
|
||||
* @param errnum
|
||||
* @param show_stack
|
||||
* @param errmsg
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
OPAL_DECLSPEC opal_sos_error_t *
|
||||
opal_sos_build_error(int errnum, bool show_stack,
|
||||
const char *errmsg, ...)
|
||||
__opal_attribute_format_funcptr__(__printf__, 3, 4);
|
||||
|
||||
/**
|
||||
* OPAL SOS reporter logs the error in the OPAL SOS error table or
|
||||
* prints it out depending on the associated reporter callback. It can
|
||||
* also relay the error messages to the selected notifier components
|
||||
* using the OPAL SOS reporter callback interface.
|
||||
*
|
||||
* @param file
|
||||
* @param line
|
||||
* @param func
|
||||
* @param opal_error
|
||||
*
|
||||
* @return encoded error code
|
||||
*/
|
||||
OPAL_DECLSPEC int opal_sos_reporter(const char *file, int line, const char *func,
|
||||
opal_sos_severity_t severity,
|
||||
opal_sos_error_t *opal_error);
|
||||
|
||||
/**
|
||||
* Prints the error encoded by the error number \c errnum
|
||||
*
|
||||
* @param errnum
|
||||
* @param show_history
|
||||
*
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_sos_print(int errnum, bool show_history);
|
||||
|
||||
OPAL_DECLSPEC int opal_sos_prettify_error(const char *error, char **pretty_error);
|
||||
|
||||
/**
|
||||
* Prints a single error represented by the OPAL SOS error object
|
||||
* opal_sos_error_t.
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_sos_print_error(opal_sos_severity_t severity,
|
||||
int errnum, const char *errmsg, ...)
|
||||
__opal_attribute_format_funcptr__(__printf__, 3, 4);
|
||||
|
||||
/**
|
||||
* Frees the error object represented by the error code \c errnum.
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_sos_free(int *errnum);
|
||||
|
||||
/**
|
||||
* Logs (prints and frees) the error object represented by \c errnum.
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_sos_log(int errnum);
|
||||
|
||||
/**
|
||||
* Returns the OPAL SOS severity level as a string.
|
||||
*
|
||||
*/
|
||||
const char *opal_sos_severity2str(opal_sos_severity_t severity);
|
||||
|
||||
/**
|
||||
* \internal
|
||||
* Return a unique key into the hash table (opal_sos_error_table)
|
||||
* depending on the type and location of the error.
|
||||
*
|
||||
*/
|
||||
int opal_sos_hash_error(opal_sos_error_t *error);
|
||||
|
||||
/**
|
||||
* Registers a print callback function for OPAL_SOS_PRINT()
|
||||
*/
|
||||
OPAL_DECLSPEC int
|
||||
opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func,
|
||||
opal_sos_print_callback_fn_t *prev_func);
|
||||
|
||||
/**
|
||||
* Registers a reporter callback function for OPAL_SOS_INFO(),
|
||||
* OPAL_SOS_WARN() and OPAL_SOS_ERROR()
|
||||
*/
|
||||
OPAL_DECLSPEC int
|
||||
opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func,
|
||||
opal_sos_reporter_callback_fn_t *prev_func);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* OPAL_SOS_H */
|
@ -519,9 +519,7 @@ int opal_util_register_stackhandlers (void)
|
||||
if (!showed_help && complain) {
|
||||
/* JMS This is icky; there is no error message
|
||||
aggregation here so this message may be repeated for
|
||||
every single MPI process... This should be replaced
|
||||
with OPAL_SOS when that is done so that it can be
|
||||
properly aggregated. */
|
||||
every single MPI process... */
|
||||
opal_show_help("help-opal-util.txt",
|
||||
"stacktrace signal override",
|
||||
true, sig, sig, sig, string_value);
|
||||
|
@ -63,7 +63,6 @@ include tools/Makefile.am
|
||||
include orted/Makefile.am
|
||||
include test/mpi/Makefile.include
|
||||
include test/system/Makefile.include
|
||||
include threads/Makefile.am
|
||||
|
||||
# Set the convenience library to be the same as the non-convenience
|
||||
# library, but a) it's marked as "noinst", so LT knows it's a
|
||||
|
@ -13,7 +13,7 @@ dnl All rights reserved.
|
||||
dnl Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
dnl Copyright (c) 2009-2012 Los Alamos National Security, LLC. All rights
|
||||
dnl reserved.
|
||||
dnl Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
dnl
|
||||
@ -114,25 +114,22 @@ AC_DEFINE_UNQUOTED([ORTE_ENABLE_HEARTBEAT],
|
||||
[Whether we want daemon heartbeat monitoring enabled])
|
||||
|
||||
#
|
||||
# Compile in resilient runtime code
|
||||
#
|
||||
AC_MSG_CHECKING([if want resilient runtime code enabled])
|
||||
AC_ARG_ENABLE(resilient-orte,
|
||||
[AC_HELP_STRING([--enable-resilient-orte], [Enable the resilient runtime code.])])
|
||||
if test "$enable_resilient_orte" = "yes"; then
|
||||
# Do we want a separate orte progress thread?
|
||||
AC_MSG_CHECKING([if want orte progress thread])
|
||||
AC_ARG_ENABLE([orte-progress-thread],
|
||||
[AC_HELP_STRING([--enable-orte-progress-thread],
|
||||
[Enable orte progress thread - for experiment by developers only! (default: disabled)])])
|
||||
if test "$enable_orte_progress_thread" = "yes"; then
|
||||
AC_MSG_RESULT([yes])
|
||||
orte_enable_resilient_code=1
|
||||
orte_enable_progress_thread=1
|
||||
AC_DEFINE_UNQUOTED(OPAL_EVENT_HAVE_THREAD_SUPPORT, 1,
|
||||
[Thread support must be configured into the event library])
|
||||
else
|
||||
AC_MSG_RESULT([no])
|
||||
orte_enable_resilient_code=0
|
||||
orte_enable_progress_thread=0
|
||||
fi
|
||||
AM_CONDITIONAL(ORTE_RESIL_ORTE, [test "$enable_resilient_orte" = "yes"])
|
||||
AC_DEFINE_UNQUOTED([ORTE_RESIL_ORTE], [$orte_enable_resilient_code],
|
||||
[Compile a resilient version of Open MPI])
|
||||
|
||||
AM_CONDITIONAL(ORTE_ENABLE_EPOCH, [test "$enable_resilient_orte" = "yes"])
|
||||
AC_DEFINE_UNQUOTED([ORTE_ENABLE_EPOCH], [$orte_enable_resilient_code],
|
||||
[Support for epoch in the ORTE process name enabled or not])
|
||||
|
||||
AC_DEFINE_UNQUOTED([ORTE_ENABLE_PROGRESS_THREAD],
|
||||
[$orte_enable_progress_thread],
|
||||
[Whether we want an orte progress thread enabled])
|
||||
|
||||
])dnl
|
||||
|
@ -82,54 +82,27 @@ typedef uint32_t orte_vpid_t;
|
||||
#define ORTE_VPID_MAX UINT32_MAX-2
|
||||
#define ORTE_VPID_MIN 0
|
||||
|
||||
#if ORTE_ENABLE_EPOCH
|
||||
typedef uint32_t orte_epoch_t;
|
||||
#define ORTE_EPOCH_T OPAL_UINT32
|
||||
#define ORTE_EPOCH_MAX UINT32_MAX-2
|
||||
#define ORTE_EPOCH_MIN 0
|
||||
#endif
|
||||
|
||||
#if ORTE_ENABLE_EPOCH
|
||||
#define ORTE_PROCESS_NAME_HTON(n) \
|
||||
do { \
|
||||
n.jobid = htonl(n.jobid); \
|
||||
n.vpid = htonl(n.vpid); \
|
||||
n.epoch = htonl(n.epoch); \
|
||||
} while (0)
|
||||
#else
|
||||
#define ORTE_PROCESS_NAME_HTON(n) \
|
||||
do { \
|
||||
n.jobid = htonl(n.jobid); \
|
||||
n.vpid = htonl(n.vpid); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#if ORTE_ENABLE_EPOCH
|
||||
#define ORTE_PROCESS_NAME_NTOH(n) \
|
||||
do { \
|
||||
n.jobid = ntohl(n.jobid); \
|
||||
n.vpid = ntohl(n.vpid); \
|
||||
n.epoch = ntohl(n.epoch); \
|
||||
} while (0)
|
||||
#else
|
||||
#define ORTE_PROCESS_NAME_NTOH(n) \
|
||||
do { \
|
||||
n.jobid = ntohl(n.jobid); \
|
||||
n.vpid = ntohl(n.vpid); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define ORTE_NAME_ARGS(n) \
|
||||
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_JOBID_INVALID : (unsigned long)(n)->jobid), \
|
||||
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_VPID_INVALID : (unsigned long)(n)->vpid) \
|
||||
(unsigned long) ((NULL == n) ? (unsigned long)ORTE_EPOCH_INVALID : (unsigned long)(n)->epoch)
|
||||
|
||||
/*
|
||||
* define invalid values
|
||||
*/
|
||||
#define ORTE_JOBID_INVALID (ORTE_JOBID_MAX + 2)
|
||||
#define ORTE_VPID_INVALID (ORTE_VPID_MAX + 2)
|
||||
#define ORTE_EPOCH_INVALID (ORTE_EPOCH_MAX + 2)
|
||||
#define ORTE_LOCAL_JOBID_INVALID (ORTE_JOBID_INVALID & 0x0000FFFF)
|
||||
|
||||
/*
|
||||
@ -137,7 +110,6 @@ do { \
|
||||
*/
|
||||
#define ORTE_JOBID_WILDCARD (ORTE_JOBID_MAX + 1)
|
||||
#define ORTE_VPID_WILDCARD (ORTE_VPID_MAX + 1)
|
||||
#define ORTE_EPOCH_WILDCARD (ORTE_EPOCH_MAX + 1)
|
||||
#define ORTE_LOCAL_JOBID_WILDCARD (ORTE_JOBID_WILDCARD & 0x0000FFFF)
|
||||
|
||||
/*
|
||||
@ -146,16 +118,6 @@ do { \
|
||||
struct orte_process_name_t {
|
||||
orte_jobid_t jobid; /**< Job number */
|
||||
orte_vpid_t vpid; /**< Process id - equivalent to rank */
|
||||
#if ORTE_ENABLE_EPOCH
|
||||
orte_epoch_t epoch; /**< Epoch - used to measure the generation of a recovered process.
|
||||
* The epoch will start at ORTE_EPOCH_MIN and
|
||||
* increment every time the process is detected as
|
||||
* having stopped (including normal shutdown). The
|
||||
* HNP will be responsible for informing all
|
||||
* processes that did not directly detect the
|
||||
* failure to increment their epochs.
|
||||
*/
|
||||
#endif
|
||||
};
|
||||
typedef struct orte_process_name_t orte_process_name_t;
|
||||
|
||||
@ -179,10 +141,6 @@ typedef void* orte_iov_base_ptr_t;
|
||||
#define ORTE_VPID (OPAL_DSS_ID_DYNAMIC + 3) /**< a vpid */
|
||||
#define ORTE_JOBID (OPAL_DSS_ID_DYNAMIC + 4) /**< a jobid */
|
||||
|
||||
#if ORTE_ENABLE_EPOCH
|
||||
#define ORTE_EPOCH (OPAL_DSS_ID_DYNAMIC + 5) /**< an epoch */
|
||||
#endif
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
/* State-related types */
|
||||
#define ORTE_NODE_STATE (OPAL_DSS_ID_DYNAMIC + 6) /**< node status flag */
|
||||
@ -205,11 +163,8 @@ typedef void* orte_iov_base_ptr_t;
|
||||
/* DAEMON command type */
|
||||
#define ORTE_DAEMON_CMD (OPAL_DSS_ID_DYNAMIC + 19) /**< command flag for communicating with the daemon */
|
||||
|
||||
/* GRPCOMM types */
|
||||
#define ORTE_GRPCOMM_MODE (OPAL_DSS_ID_DYNAMIC + 20)
|
||||
|
||||
/* IOF types */
|
||||
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 21)
|
||||
#define ORTE_IOF_TAG (OPAL_DSS_ID_DYNAMIC + 20)
|
||||
|
||||
|
||||
/* provide a boundary for others to use */
|
||||
|
@ -1,36 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
sources = \
|
||||
errmgr_app.h \
|
||||
errmgr_app_component.c \
|
||||
errmgr_app.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_errmgr_app_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_app.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_app.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_app_la_SOURCES = $(sources)
|
||||
mca_errmgr_app_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_app_la_SOURCES =$(sources)
|
||||
libmca_errmgr_app_la_LDFLAGS = -module -avoid-version
|
@ -1,280 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_app.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata);
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data);
|
||||
|
||||
/******************
|
||||
* APP module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_app_abort_peers,
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
#if ORTE_RESIL_ORTE
|
||||
,orte_errmgr_base_set_fault_callback
|
||||
#endif
|
||||
};
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
#if ORTE_RESIL_ORTE
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE,
|
||||
ORTE_RML_PERSISTENT,
|
||||
epoch_change_recv,
|
||||
NULL);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
#if ORTE_RESIL_ORTE
|
||||
orte_rml.recv_cancel(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE);
|
||||
#endif
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
/* if it is our own connection, ignore it */
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* see if this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
#if ORTE_RESIL_ORTE
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata) {
|
||||
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
|
||||
}
|
||||
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data) {
|
||||
orte_message_event_t *mev = (orte_message_event_t *) data;
|
||||
opal_buffer_t *buffer = mev->buffer;
|
||||
orte_process_name_t *proc;
|
||||
int n = 1, ret, num_dead, i;
|
||||
opal_pointer_array_t *procs;
|
||||
|
||||
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Received epoch change notification",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
procs = OBJ_NEW(opal_pointer_array_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
|
||||
for (i = 0; i < num_dead; i++) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
proc[i].epoch++;
|
||||
orte_util_set_epoch(&proc[i], proc[i].epoch);
|
||||
|
||||
opal_pointer_array_add(procs, &proc[i]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Epoch for %s updated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc[i])));
|
||||
}
|
||||
|
||||
if (NULL != fault_cbfunc && 0 < num_dead) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
(*fault_cbfunc)(procs);
|
||||
} else if (NULL == fault_cbfunc) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback failed (NULL pointer)!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback failed (num_dead <= 0)!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
free(proc);
|
||||
OBJ_RELEASE(procs);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_buffer_t buffer;
|
||||
orte_std_cntr_t i;
|
||||
orte_daemon_cmd_flag_t command = ORTE_DAEMON_ABORT_PROCS_CALLED;
|
||||
|
||||
/*
|
||||
* Pack up the list of processes and send them to the HNP
|
||||
*/
|
||||
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* pack number of processes */
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(num_procs), 1, ORTE_STD_CNTR))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Pack the list of names */
|
||||
for( i = 0; i < num_procs; ++i ) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(procs[i]), 1, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
/* Send to HNP for termination */
|
||||
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_DAEMON, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
||||
return exit_status;
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_app_EXPORT_H
|
||||
#define MCA_ERRMGR_app_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_app_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_app_EXPORT_H */
|
@ -1,89 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "errmgr_app.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_app_component_version_string =
|
||||
"ORTE ERRMGR app MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_app_open(void);
|
||||
static int errmgr_app_close(void);
|
||||
static int errmgr_app_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_app_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"app",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_app_open,
|
||||
errmgr_app_close,
|
||||
errmgr_app_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
5
|
||||
};
|
||||
|
||||
static int errmgr_app_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_app_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_app_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if (ORTE_PROC_IS_APP) {
|
||||
/* keep our priority low so that other modules are higher
|
||||
* and will run before us
|
||||
*/
|
||||
*priority = 5;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_app_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
@ -11,6 +11,8 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -100,13 +102,11 @@ void orte_errmgr_predicted_proc_construct(orte_errmgr_predicted_proc_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
}
|
||||
|
||||
void orte_errmgr_predicted_proc_destruct( orte_errmgr_predicted_proc_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID);
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
}
|
||||
|
||||
@ -142,13 +142,11 @@ OBJ_CLASS_INSTANCE(orte_errmgr_predicted_map_t,
|
||||
void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
item->node_name = NULL;
|
||||
|
||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_MIN);
|
||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
item->map_node_name = NULL;
|
||||
@ -159,7 +157,6 @@ void orte_errmgr_predicted_map_construct(orte_errmgr_predicted_map_t *item)
|
||||
void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
||||
{
|
||||
item->proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(item->proc_name.epoch,ORTE_EPOCH_INVALID);
|
||||
item->proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
if( NULL != item->node_name ) {
|
||||
@ -168,7 +165,6 @@ void orte_errmgr_predicted_map_destruct( orte_errmgr_predicted_map_t *item)
|
||||
}
|
||||
|
||||
item->map_proc_name.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(item->map_proc_name.epoch,ORTE_EPOCH_INVALID);
|
||||
item->map_proc_name.jobid = ORTE_JOBID_INVALID;
|
||||
|
||||
if( NULL != item->map_node_name ) {
|
||||
@ -200,17 +196,9 @@ void orte_errmgr_base_log(int error_code, char *filename, int line)
|
||||
return;
|
||||
}
|
||||
|
||||
if (NULL != orte_process_info.job_name) {
|
||||
opal_output(0, "[[%s][%s][%s][%d]] ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
orte_process_info.job_name,
|
||||
(NULL == orte_process_info.job_instance) ? "NULL" : orte_process_info.job_instance,
|
||||
(NULL == orte_process_info.executable) ? "NULL" : orte_process_info.executable,
|
||||
orte_process_info.app_rank, errstring, filename, line);
|
||||
} else {
|
||||
opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
errstring, filename, line);
|
||||
}
|
||||
opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
errstring, filename, line);
|
||||
}
|
||||
|
||||
#if WANT_PMI_SUPPORT
|
||||
@ -290,19 +278,6 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
/* No way to reach here */
|
||||
}
|
||||
|
||||
int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
/*
|
||||
* This is a stub function that is only meant to be called by tools,
|
||||
* so it will always return success.
|
||||
*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
void orte_errmgr_base_register_migration_warning(struct timeval *tv)
|
||||
{
|
||||
/* stub function - ignore */
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -64,7 +66,6 @@ orte_errmgr_base_module_t orte_errmgr_default_fns = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_base_abort_peers,
|
||||
orte_errmgr_base_update_state,
|
||||
NULL, /* predicted_fault */
|
||||
NULL, /* suggest_map_targets */
|
||||
NULL, /* ft_event */
|
||||
@ -83,8 +84,6 @@ orte_errmgr_base_module_t orte_errmgr = {
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
@ -267,7 +267,6 @@ static int errmgr_base_tool_start_cmdline_listener(void)
|
||||
*/
|
||||
errmgr_cmdline_sender.jobid = ORTE_JOBID_INVALID;
|
||||
errmgr_cmdline_sender.vpid = ORTE_VPID_INVALID;
|
||||
ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,ORTE_EPOCH_MIN);
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
ORTE_RML_TAG_MIGRATE,
|
||||
0,
|
||||
@ -379,14 +378,12 @@ static void errmgr_base_tool_cmdline_process_recv(int fd, short event, void *cbd
|
||||
if( OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, &errmgr_cmdline_sender) ) {
|
||||
swap_dest.jobid = errmgr_cmdline_sender.jobid;
|
||||
swap_dest.vpid = errmgr_cmdline_sender.vpid;
|
||||
ORTE_EPOCH_SET(swap_dest.epoch,errmgr_cmdline_sender.epoch);
|
||||
|
||||
errmgr_cmdline_sender = *sender;
|
||||
orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERR_INPROGRESS);
|
||||
|
||||
errmgr_cmdline_sender.jobid = swap_dest.jobid;
|
||||
errmgr_cmdline_sender.vpid = swap_dest.vpid;
|
||||
ORTE_EPOCH_SET(errmgr_cmdline_sender.epoch,swap_dest.epoch);
|
||||
|
||||
goto cleanup;
|
||||
}
|
||||
|
@ -10,6 +10,8 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -69,18 +71,10 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_fns;
|
||||
ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
__opal_attribute_noreturn__;
|
||||
__opal_attribute_format__(__printf__, 2, 3);
|
||||
ORTE_DECLSPEC int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -13,7 +13,7 @@
|
||||
AC_DEFUN([MCA_orte_errmgr_default_app_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/errmgr/default_app/Makefile])
|
||||
|
||||
AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0],
|
||||
AS_IF([test "$orte_without_full_support" = 0],
|
||||
[$1],
|
||||
[$2])
|
||||
])
|
||||
|
@ -7,7 +7,8 @@
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -34,6 +35,7 @@
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
@ -45,13 +47,6 @@
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
@ -64,7 +59,6 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
abort_peers,
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
@ -72,11 +66,16 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||
NULL
|
||||
};
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
/* setup state machine to trap proc errors */
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -85,43 +84,43 @@ static int finalize(void)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
static void proc_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:default_app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
"%s errmgr:default_app: proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
ORTE_NAME_PRINT(&caddy->name),
|
||||
orte_proc_state_to_str(caddy->proc_state)));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == caddy->proc_state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
/* if it is our own connection, ignore it */
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &caddy->name)) {
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* see if this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(&caddy->name)) {
|
||||
/* order an exit */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_UNRECOVERABLE);
|
||||
OBJ_RELEASE(caddy);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static int abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
|
||||
|
@ -13,7 +13,7 @@
|
||||
AC_DEFUN([MCA_orte_errmgr_default_hnp_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/errmgr/default_hnp/Makefile])
|
||||
|
||||
AS_IF([test "$orte_enable_resilient_code" = 0 -a "$orte_without_full_support" = 0],
|
||||
AS_IF([test "$orte_without_full_support" = 0],
|
||||
[$1],
|
||||
[$2])
|
||||
])
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче
Block a user